summary refs log tree commit diff
diff options
context:
space:
mode:
author ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-01-21 16:37:17 +0000
committer ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-01-21 16:37:17 +0000
commit ec7abfc6da4749a0deda01d514b353a43bdf39db (patch)
tree 6236bb15f857322a56762945d3e9b84f6b393274
parent 2e9472220441a6c61e9ed14f3fe3d33686e241b1 (diff)
download pcre-ec7abfc6da4749a0deda01d514b353a43bdf39db.tar.gz
Source file tidies for 8.30-RC1 release; fix Makefile.am bugs for building
symbolic links to man pages. git-svn-id: svn://vcs.exim.org/pcre/code/trunk@903 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- CMakeLists.txt 2
-rw-r--r-- ChangeLog 40
-rw-r--r-- Makefile.am 54
-rw-r--r-- NEWS 6
-rw-r--r-- README 4
-rwxr-xr-x RunTest 2
-rw-r--r-- configure.ac 7
-rw-r--r-- doc/html/index.html 16
-rw-r--r-- doc/html/pcre-config.html 8
-rw-r--r-- doc/html/pcre.html 10
-rw-r--r-- doc/html/pcre16.html 118
-rw-r--r-- doc/html/pcre_config.html 5
-rw-r--r-- doc/html/pcre_fullinfo.html 2
-rw-r--r-- doc/html/pcre_jit_stack_alloc.html 2
-rw-r--r-- doc/html/pcre_pattern_to_host_byte_order.html 6
-rw-r--r-- doc/html/pcre_utf16_to_host_byte_order.html 6
-rw-r--r-- doc/html/pcreapi.html 86
-rw-r--r-- doc/html/pcrebuild.html 12
-rw-r--r-- doc/html/pcrecallout.html 6
-rw-r--r-- doc/html/pcrecpp.html 2
-rw-r--r-- doc/html/pcrejit.html 8
-rw-r--r-- doc/html/pcrematching.html 4
-rw-r--r-- doc/html/pcrepartial.html 11
-rw-r--r-- doc/html/pcrepattern.html 22
-rw-r--r-- doc/html/pcreposix.html 2
-rw-r--r-- doc/html/pcreprecompile.html 2
-rw-r--r-- doc/html/pcrestack.html 34
-rw-r--r-- doc/html/pcresyntax.html 6
-rw-r--r-- doc/html/pcretest.html 50
-rw-r--r-- doc/html/pcreunicode.html 8
-rw-r--r-- doc/pcre-config.1 8
-rw-r--r-- doc/pcre.3 10
-rw-r--r-- doc/pcre.txt 364
-rw-r--r-- doc/pcre16.3 118
-rw-r--r-- doc/pcre_config.3 4
-rw-r--r-- doc/pcre_fullinfo.3 2
-rw-r--r-- doc/pcre_jit_stack_alloc.3 2
-rw-r--r-- doc/pcre_pattern_to_host_byte_order.3 6
-rw-r--r-- doc/pcre_utf16_to_host_byte_order.3 6
-rw-r--r-- doc/pcreapi.3 56
-rw-r--r-- doc/pcrebuild.3 12
-rw-r--r-- doc/pcrecallout.3 6
-rw-r--r-- doc/pcrecpp.3 2
-rw-r--r-- doc/pcrejit.3 8
-rw-r--r-- doc/pcrematching.3 4
-rw-r--r-- doc/pcrepartial.3 8
-rw-r--r-- doc/pcrepattern.3 24
-rw-r--r-- doc/pcreposix.3 2
-rw-r--r-- doc/pcreprecompile.3 2
-rw-r--r-- doc/pcrestack.3 18
-rw-r--r-- doc/pcresyntax.3 6
-rw-r--r-- doc/pcretest.1 56
-rw-r--r-- doc/pcreunicode.3 8
-rw-r--r-- doc/perltest.txt 4
-rw-r--r-- pcre-config.in 4
-rw-r--r-- pcre_compile.c 12
-rw-r--r-- pcre_exec.c 38
-rw-r--r-- pcreposix.c 2
-rw-r--r-- pcretest.c 18
-rwxr-xr-x perltest.pl 14
60 files changed, 742 insertions, 623 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c0ff97e..a253546 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -393,7 +393,7 @@ SET(PCRE_SOURCES
pcre_newline.c
pcre_ord2utf8.c
pcre_refcount.c
- pcre_string_utils.c
+ pcre_string_utils.c
pcre_study.c
pcre_tables.c
pcre_ucd.c
diff --git a/ChangeLog b/ChangeLog
index cbf818b..b8b2587 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -32,33 +32,33 @@ Version 8.30
8. Ovector size of 2 is also supported by JIT based pcre_exec (the ovector size
rounding is not applied in this particular case).
-
+
9. The invalid Unicode surrogate codepoints U+D800 to U+DFFF are now rejected
if they appear, or are escaped, in patterns.
-
-10. Get rid of a number of -Wunused-but-set-variable warnings.
-
-11. The pattern /(?=(*:x))(q|)/ matches an empty string, and returns the mark
- "x". The similar pattern /(?=(*:x))((*:y)q|)/ did not return a mark at all.
- Oddly, Perl behaves the same way. PCRE has been fixed so that this pattern
- also returns the mark "x". This bug applied to capturing parentheses,
- non-capturing parentheses, and atomic parentheses. It also applied to some
+
+10. Get rid of a number of -Wunused-but-set-variable warnings.
+
+11. The pattern /(?=(*:x))(q|)/ matches an empty string, and returns the mark
+ "x". The similar pattern /(?=(*:x))((*:y)q|)/ did not return a mark at all.
+ Oddly, Perl behaves the same way. PCRE has been fixed so that this pattern
+ also returns the mark "x". This bug applied to capturing parentheses,
+ non-capturing parentheses, and atomic parentheses. It also applied to some
assertions.
-
-12. Stephen Kelly's patch to CMakeLists.txt allows it to parse the version
- information out of configure.ac instead of relying on pcre.h.generic, which
- is not stored in the repository.
-
+
+12. Stephen Kelly's patch to CMakeLists.txt allows it to parse the version
+ information out of configure.ac instead of relying on pcre.h.generic, which
+ is not stored in the repository.
+
13. Applied Dmitry V. Levin's patch for a more portable method for linking with
-lreadline.
-
-14. ZH added PCRE_CONFIG_JITTARGET; added its output to pcretest -C.
-15. Applied Graycode's patch to put the top-level frame on the stack rather
- than the heap when not using the stack for recursion. This gives a
+14. ZH added PCRE_CONFIG_JITTARGET; added its output to pcretest -C.
+
+15. Applied Graycode's patch to put the top-level frame on the stack rather
+ than the heap when not using the stack for recursion. This gives a
performance improvement in many cases when recursion is not deep.
-
-16. Experimental code added to "pcretest -C" to output the stack frame size.
+
+16. Experimental code added to "pcretest -C" to output the stack frame size.
Version 8.21 12-Dec-2011
diff --git a/Makefile.am b/Makefile.am
index f71339c..054640a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -520,34 +520,32 @@ dist_man_MANS = \
doc/pcreunicode.3
# Arrange for the per-function man pages to have 16-bit names as well.
-install-exec-hook:
- pushd $(DESTDIR)($man3dir)
- ln -s pcre_assign_jit_stack.3 pcre16_assign_jit_stack.3
- ln -s pcre_compile.3 pcre16_compile.3
- ln -s pcre_compile2.3 pcre16_compile2.3
- ln -s pcre_config.3 pcre16_config.3
- ln -s pcre_copy_named_substring.3 pcre16_copy_named_substring.3
- ln -s pcre_copy_substring.3 pcre16_copy_substring.3
- ln -s pcre_dfa_exec.3 pcre16_dfa_exec.3
- ln -s pcre_exec.3 pcre16_exec.3
- ln -s pcre_free_study.3 pcre16_free_study.3
- ln -s pcre_free_substring.3 pcre16_free_substring.3
- ln -s pcre_free_substring_list.3 pcre16_free_substring_list.3
- ln -s pcre_fullinfo.3 pcre16_fullinfo.3
- ln -s pcre_get_named_substring.3 pcre16_get_named_substring.3
- ln -s pcre_get_stringnumber.3 pcre16_get_stringnumber.3
- ln -s pcre_get_stringtable_entries.3 pcre16_get_stringtable_entries.3
- ln -s pcre_get_substring.3 pcre16_get_substring.3
- ln -s pcre_get_substring_list.3 pcre16_get_substring_list.3
- ln -s pcre_jit_stack_alloc.3 pcre16_jit_stack_alloc.3
- ln -s pcre_jit_stack_free.3 pcre16_jit_stack_free.3
- ln -s pcre_maketables.3 pcre16_maketables.3
- ln -s pcre_pattern_to_host_byte_order.3 pcre16_pattern_to_host_byte_order.3
- ln -s pcre_refcount.3 pcre16_refcount.3
- ln -s pcre_study.3 pcre16_study.3
- ln -s pcre_utf16_to_host_byte_order.3 pcre16_utf16_to_host_byte_order.3
- ln -s pcre_version.3 pcre16_version.3
- popd
+install-data-hook:
+ ln -s $(DESTDIR)$(man3dir)/pcre_assign_jit_stack.3 $(DESTDIR)$(man3dir)/pcre16_assign_jit_stack.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_compile.3 $(DESTDIR)$(man3dir)/pcre16_compile.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_compile2.3 $(DESTDIR)$(man3dir)/pcre16_compile2.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_config.3 $(DESTDIR)$(man3dir)/pcre16_config.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_copy_named_substring.3 $(DESTDIR)$(man3dir)/pcre16_copy_named_substring.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_copy_substring.3 $(DESTDIR)$(man3dir)/pcre16_copy_substring.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_dfa_exec.3 $(DESTDIR)$(man3dir)/pcre16_dfa_exec.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_exec.3 $(DESTDIR)$(man3dir)/pcre16_exec.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_free_study.3 $(DESTDIR)$(man3dir)/pcre16_free_study.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_free_substring.3 $(DESTDIR)$(man3dir)/pcre16_free_substring.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_free_substring_list.3 $(DESTDIR)$(man3dir)/pcre16_free_substring_list.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_fullinfo.3 $(DESTDIR)$(man3dir)/pcre16_fullinfo.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_get_named_substring.3 $(DESTDIR)$(man3dir)/pcre16_get_named_substring.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_get_stringnumber.3 $(DESTDIR)$(man3dir)/pcre16_get_stringnumber.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_get_stringtable_entries.3 $(DESTDIR)$(man3dir)/pcre16_get_stringtable_entries.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_get_substring.3 $(DESTDIR)$(man3dir)/pcre16_get_substring.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_get_substring_list.3 $(DESTDIR)$(man3dir)/pcre16_get_substring_list.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_jit_stack_alloc.3 $(DESTDIR)$(man3dir)/pcre16_jit_stack_alloc.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_jit_stack_free.3 $(DESTDIR)$(man3dir)/pcre16_jit_stack_free.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_maketables.3 $(DESTDIR)$(man3dir)/pcre16_maketables.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_pattern_to_host_byte_order.3 $(DESTDIR)$(man3dir)/pcre16_pattern_to_host_byte_order.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_refcount.3 $(DESTDIR)$(man3dir)/pcre16_refcount.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_study.3 $(DESTDIR)$(man3dir)/pcre16_study.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_utf16_to_host_byte_order.3 $(DESTDIR)$(man3dir)/pcre16_utf16_to_host_byte_order.3
+ ln -s $(DESTDIR)$(man3dir)/pcre_version.3 $(DESTDIR)$(man3dir)/pcre16_version.3
pcrecpp_man = doc/pcrecpp.3
EXTRA_DIST += $(pcrecpp_man)
diff --git a/NEWS b/NEWS
index 58b025b..699a301 100644
--- a/NEWS
+++ b/NEWS
@@ -5,20 +5,20 @@ Release 8.30
------------
Release 8.30 introduces a major new feature: support for 16-bit character
-strings, compiled as a separate library. There are a few changes to the
+strings, compiled as a separate library. There are a few changes to the
8-bit library, in addition to some bug fixes.
. The pcre_info() function, which has been obsolete for over 10 years, has
been removed.
. When a compiled pattern was saved to a file and later reloaded on a host
- with different endianness, PCRE used automatically to swap the bytes in some
+ with different endianness, PCRE used automatically to swap the bytes in some
of the data fields. With the advent of the 16-bit library, where more of this
swapping is needed, it is no longer done automatically. Instead, the bad
endianness is detected and a specific error is given. The user can then call
a new function called pcre_pattern_to_host_byte_order() (or an equivalent
16-bit function) to do the swap.
-
+
. In UTF-8 mode, the values 0xd800 to 0xdfff are not legal Unicode
code points and are now faulted. (They are the so-called "surrogates"
that are reserved for coding high values in UTF-16.)
diff --git a/README b/README
index 10669e1..5c52ee3 100644
--- a/README
+++ b/README
@@ -201,7 +201,7 @@ library. They are also documented in the pcrebuild man page.
platforms. It is not possible to use both --enable-utf and --enable-ebcdic at
the same time.
-. There are no separate options for enabling UTF-8 and UTF-16 independently
+. There are no separate options for enabling UTF-8 and UTF-16 independently
because that would allow ridiculous settings such as requesting UTF-16
support while building only the 8-bit library. However, the option
--enable-utf8 is retained for backwards compatibility with earlier releases
@@ -669,7 +669,7 @@ general cases, UTF-8/16 support, and Unicode property support, respectively.
The twentieth test is run only in 16-bit mode. It tests some specific 16-bit
features of the DFA matching engine.
-The twenty-first and twenty-second tests are run only in 16-bit mode, when the
+The twenty-first and twenty-second tests are run only in 16-bit mode, when the
link size is set to 2. They test reloading pre-compiled patterns.
diff --git a/RunTest b/RunTest
index 882ea81..aae292e 100755
--- a/RunTest
+++ b/RunTest
@@ -275,7 +275,7 @@ if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no -a \
do19=yes
do20=yes
do21=yes
- do22=yes
+ do22=yes
fi
# Show which release and which test data
diff --git a/configure.ac b/configure.ac
index ce78147..5db475c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -11,7 +11,7 @@ dnl be defined as -RC2, for example. For real releases, it should be empty.
m4_define(pcre_major, [8])
m4_define(pcre_minor, [30])
m4_define(pcre_prerelease, [-RC1])
-m4_define(pcre_date, [2012-01-20])
+m4_define(pcre_date, [2012-01-21])
# NOTE: The CMakeLists.txt file searches for the above variables in the first
# 50 lines of this file. Please update that if the variables above are moved.
@@ -774,8 +774,9 @@ AC_SUBST(EXTRA_LIBPCRE16_LDFLAGS)
AC_SUBST(EXTRA_LIBPCREPOSIX_LDFLAGS)
AC_SUBST(EXTRA_LIBPCRECPP_LDFLAGS)
-# When we run 'make distcheck', use these arguments.
-DISTCHECK_CONFIGURE_FLAGS="--enable-pcre16 --enable-jit --enable-cpp --enable-unicode-properties"
+# When we run 'make distcheck', use these arguments. Turning off compiler
+# optimization makes it run faster.
+DISTCHECK_CONFIGURE_FLAGS="CFLAGS='' CXXFLAGS='' --enable-pcre16 --enable-jit --enable-cpp --enable-unicode-properties"
AC_SUBST(DISTCHECK_CONFIGURE_FLAGS)
# Check that, if --enable-pcregrep-libz or --enable-pcregrep-libbz2 is
diff --git a/doc/html/index.html b/doc/html/index.html
index 20720df..b87c2a9 100644
--- a/doc/html/index.html
+++ b/doc/html/index.html
@@ -1,10 +1,10 @@
<html>
-<!-- This is a manually maintained file that is the root of the HTML version of
- the PCRE documentation. When the HTML documents are built from the man
- page versions, the entire doc/html directory is emptied, this file is then
- copied into doc/html/index.html, and the remaining files therein are
+<!-- This is a manually maintained file that is the root of the HTML version of
+ the PCRE documentation. When the HTML documents are built from the man
+ page versions, the entire doc/html directory is emptied, this file is then
+ copied into doc/html/index.html, and the remaining files therein are
created by the 132html script.
--->
+-->
<head>
<title>PCRE specification</title>
</head>
@@ -86,11 +86,11 @@ The HTML documentation for PCRE comprises the following pages:
</table>
<p>
-There are also individual pages that summarize the interface for each function
+There are also individual pages that summarize the interface for each function
in the library. There is a single page for each pair of 8-bit/16-bit functions.
</p>
-<table>
+<table>
<tr><td><a href="pcre_assign_jit_stack.html">pcre_assign_jit_stack</a></td>
<td>&nbsp;&nbsp;Assign stack for JIT matching</td></tr>
@@ -153,7 +153,7 @@ in the library. There is a single page for each pair of 8-bit/16-bit functions.
<tr><td><a href="pcre_maketables.html">pcre_maketables</a></td>
<td>&nbsp;&nbsp;Build character tables in current locale</td></tr>
-
+
<tr><td><a href="pcre_pattern_to_host_byte_order.html">pcre_pattern_to_host_byte_order</a></td>
<td>&nbsp;&nbsp;Convert compiled pattern to host byte order if necessary</td></tr>
diff --git a/doc/html/pcre-config.html b/doc/html/pcre-config.html
index 141b805..87c874d 100644
--- a/doc/html/pcre-config.html
+++ b/doc/html/pcre-config.html
@@ -23,15 +23,15 @@ man page, in case the conversion went wrong.
<br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br>
<P>
<b>pcre-config [--prefix] [--exec-prefix] [--version] [--libs]</b>
-<b>[--libs16] [--libs-cpp] [--libs-posix] [--cflags] </b>
+<b>[--libs16] [--libs-cpp] [--libs-posix] [--cflags]</b>
<b>[--cflags-posix]</b>
</P>
<br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br>
<P>
<b>pcre-config</b> returns the configuration of the installed PCRE
-libraries and the options required to compile a program to use them. Some of
-the options apply only to the 8-bit or 16-bit libraries, respectively, and are
-not available if only one of those libraries has been built. If an unavailable
+libraries and the options required to compile a program to use them. Some of
+the options apply only to the 8-bit or 16-bit libraries, respectively, and are
+not available if only one of those libraries has been built. If an unavailable
option is encountered, the "usage" information is output.
</P>
<br><a name="SEC3" href="#TOC1">OPTIONS</a><br>
diff --git a/doc/html/pcre.html b/doc/html/pcre.html
index 52afb47..ff5202f 100644
--- a/doc/html/pcre.html
+++ b/doc/html/pcre.html
@@ -28,11 +28,11 @@ support for one or two .NET and Oniguruma syntax items, and there is an option
for requesting some minor changes that give better JavaScript compatibility.
</P>
<P>
-Starting with release 8.30, it is possible to compile two separate PCRE
+Starting with release 8.30, it is possible to compile two separate PCRE
libraries: the original, which supports 8-bit character strings (including
UTF-8 strings), and a second library that supports 16-bit character strings
(including UTF-16 strings). The build process allows either one or both to be
-built. The majority of the work to make this possible was done by Zoltan
+built. The majority of the work to make this possible was done by Zoltan
Herczeg.
</P>
<P>
@@ -42,8 +42,8 @@ over-complication and reduce the documentation maintenance load, most of the
documentation describes the 8-bit library, with the differences for the 16-bit
library described separately in the
<a href="pcre16.html"><b>pcre16</b></a>
-page. References to functions or structures of the form <i>pcre[16]_xxx</i>
-should be read as meaning "<i>pcre_xxx</i> when using the 8-bit library and
+page. References to functions or structures of the form <i>pcre[16]_xxx</i>
+should be read as meaning "<i>pcre_xxx</i> when using the 8-bit library and
<i>pcre16_xxx</i> when using the 16-bit library".
</P>
<P>
@@ -109,7 +109,7 @@ all the sections, except the <b>pcredemo</b> section, are concatenated, for ease
of searching. The sections are as follows:
<pre>
pcre this document
- pcre16 details of the 16-bit library
+ pcre16 details of the 16-bit library
pcre-config show PCRE installation configuration information
pcreapi details of PCRE's native C API
pcrebuild options for building PCRE
diff --git a/doc/html/pcre16.html b/doc/html/pcre16.html
index 66e89cd..126ff75 100644
--- a/doc/html/pcre16.html
+++ b/doc/html/pcre16.html
@@ -160,7 +160,7 @@ man page, in case the conversion went wrong.
<br><a name="SEC5" href="#TOC1">PCRE 16-BIT API 16-BIT-ONLY FUNCTION</a><br>
<P>
<b>int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *<i>output</i>,</b>
-<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>byte_order</i>, </b>
+<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>byte_order</i>,</b>
<b>int <i>keep_boms</i>);</b>
</P>
<br><a name="SEC6" href="#TOC1">THE PCRE 16-BIT LIBRARY</a><br>
@@ -177,8 +177,8 @@ to the 16-bit library. This page describes what is different when you use the
16-bit library.
</P>
<P>
-WARNING: A single application can be linked with both libraries, but you must
-take care when processing any particular pattern to use functions from just one
+WARNING: A single application can be linked with both libraries, but you must
+take care when processing any particular pattern to use functions from just one
library. For example, if you want to study a pattern that was compiled with
<b>pcre16_compile()</b>, you must do so with <b>pcre16_study()</b>, not
<b>pcre_study()</b>, and you must free the study data with
@@ -186,52 +186,52 @@ library. For example, if you want to study a pattern that was compiled with
</P>
<br><a name="SEC7" href="#TOC1">THE HEADER FILE</a><br>
<P>
-There is only one header file, <b>pcre.h</b>. It contains prototypes for all the
+There is only one header file, <b>pcre.h</b>. It contains prototypes for all the
functions in both libraries, as well as definitions of flags, structures, error
codes, etc.
</P>
<br><a name="SEC8" href="#TOC1">THE LIBRARY NAME</a><br>
<P>
-In Unix-like systems, the 16-bit library is called <b>libpcre16</b>, and can
-normally be accesss by adding <b>-lpcre16</b> to the command for linking an
+In Unix-like systems, the 16-bit library is called <b>libpcre16</b>, and can
+normally be accesss by adding <b>-lpcre16</b> to the command for linking an
application that uses PCRE.
</P>
<br><a name="SEC9" href="#TOC1">STRING TYPES</a><br>
<P>
-In the 8-bit library, strings are passed to PCRE library functions as vectors
-of bytes with the C type "char *". In the 16-bit library, strings are passed as
-vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an
-appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In
-very many environments, "short int" is a 16-bit data type. When PCRE is built,
-it defines PCRE_UCHAR16 as "short int", but checks that it really is a 16-bit
-data type. If it is not, the build fails with an error message telling the
+In the 8-bit library, strings are passed to PCRE library functions as vectors
+of bytes with the C type "char *". In the 16-bit library, strings are passed as
+vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an
+appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In
+very many environments, "short int" is a 16-bit data type. When PCRE is built,
+it defines PCRE_UCHAR16 as "short int", but checks that it really is a 16-bit
+data type. If it is not, the build fails with an error message telling the
maintainer to modify the definition appropriately.
</P>
<br><a name="SEC10" href="#TOC1">STRUCTURE TYPES</a><br>
<P>
-The types of the opaque structures that are used for compiled 16-bit patterns
-and JIT stacks are <b>pcre16</b> and <b>pcre16_jit_stack</b> respectively. The
-type of the user-accessible structure that is returned by <b>pcre16_study()</b>
+The types of the opaque structures that are used for compiled 16-bit patterns
+and JIT stacks are <b>pcre16</b> and <b>pcre16_jit_stack</b> respectively. The
+type of the user-accessible structure that is returned by <b>pcre16_study()</b>
is <b>pcre16_extra</b>, and the type of the structure that is used for passing
-data to a callout function is <b>pcre16_callout_block</b>. These structures
-contain the same fields, with the same names, as their 8-bit counterparts. The
-only difference is that pointers to character strings are 16-bit instead of
+data to a callout function is <b>pcre16_callout_block</b>. These structures
+contain the same fields, with the same names, as their 8-bit counterparts. The
+only difference is that pointers to character strings are 16-bit instead of
8-bit types.
</P>
<br><a name="SEC11" href="#TOC1">16-BIT FUNCTIONS</a><br>
<P>
For every function in the 8-bit library there is a corresponding function in
-the 16-bit library with a name that starts with <b>pcre16_</b> instead of
+the 16-bit library with a name that starts with <b>pcre16_</b> instead of
<b>pcre_</b>. The prototypes are listed above. In addition, there is one extra
-function, <b>pcre16_utf16_to_host_byte_order()</b>. This is a utility function
-that converts a UTF-16 character string to host byte order if necessary. The
-other 16-bit functions expect the strings they are passed to be in host byte
-order.
+function, <b>pcre16_utf16_to_host_byte_order()</b>. This is a utility function
+that converts a UTF-16 character string to host byte order if necessary. The
+other 16-bit functions expect the strings they are passed to be in host byte
+order.
</P>
<P>
The <i>input</i> and <i>output</i> arguments of
-<b>pcre16_utf16_to_host_byte_order()</b> may point to the same address, that is,
-conversion in place is supported. The output buffer must be at least as long as
+<b>pcre16_utf16_to_host_byte_order()</b> may point to the same address, that is,
+conversion in place is supported. The output buffer must be at least as long as
the input.
</P>
<P>
@@ -239,18 +239,18 @@ The <i>length</i> argument specifies the number of 16-bit data units in the
input string; a negative value specifies a zero-terminated string.
</P>
<P>
-If <i>byte_order</i> is NULL, it is assumed that the string starts off in host
+If <i>byte_order</i> is NULL, it is assumed that the string starts off in host
byte order. This may be changed by byte-order marks (BOMs) anywhere in the
string (commonly as the first character).
</P>
<P>
-If <i>byte_order</i> is not NULL, a non-zero value of the integer to which it
-points means that the input starts off in host byte order, otherwise the
-opposite order is assumed. Again, BOMs in the string can change this. The final
-byte order is passed back at the end of processing.
+If <i>byte_order</i> is not NULL, a non-zero value of the integer to which it
+points means that the input starts off in host byte order, otherwise the
+opposite order is assumed. Again, BOMs in the string can change this. The final
+byte order is passed back at the end of processing.
</P>
<P>
-If <i>keep_boms</i> is not zero, byte-order mark characters (0xfeff) are copied
+If <i>keep_boms</i> is not zero, byte-order mark characters (0xfeff) are copied
into the output string. Otherwise they are discarded.
</P>
<P>
@@ -259,14 +259,14 @@ buffer, including the zero terminator if the string was zero-terminated.
</P>
<br><a name="SEC12" href="#TOC1">SUBJECT STRING OFFSETS</a><br>
<P>
-The offsets within subject strings that are returned by the matching functions
+The offsets within subject strings that are returned by the matching functions
are in 16-bit units rather than bytes.
</P>
<br><a name="SEC13" href="#TOC1">NAMED SUBPATTERNS</a><br>
<P>
-The name-to-number translation table that is maintained for named subpatterns
-uses 16-bit characters. The <b>pcre16_get_stringtable_entries()</b> function
-returns the length of each entry in the table as the number of 16-bit data
+The name-to-number translation table that is maintained for named subpatterns
+uses 16-bit characters. The <b>pcre16_get_stringtable_entries()</b> function
+returns the length of each entry in the table as the number of 16-bit data
units.
</P>
<br><a name="SEC14" href="#TOC1">OPTION NAMES</a><br>
@@ -276,27 +276,27 @@ which correspond to PCRE_UTF8 and PCRE_NO_UTF8_CHECK in the 8-bit library. In
fact, these new options define the same bits in the options word.
</P>
<P>
-For the <b>pcre16_config()</b> function there is an option PCRE_CONFIG_UTF16
+For the <b>pcre16_config()</b> function there is an option PCRE_CONFIG_UTF16
that returns 1 if UTF-16 support is configured, otherwise 0. If this option is
given to <b>pcre_config()</b>, or if the PCRE_CONFIG_UTF8 option is given to
<b>pcre16_config()</b>, the result is the PCRE_ERROR_BADOPTION error.
</P>
<br><a name="SEC15" href="#TOC1">CHARACTER CODES</a><br>
<P>
-In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the
-same way as in 8-bit, non UTF-8 mode, except, of course, that they can range
-from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than
-0xff can therefore be influenced by the locale in the same way as before.
-Characters greater than 0xff have only one case, and no "type" (such as letter
+In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the
+same way as in 8-bit, non UTF-8 mode, except, of course, that they can range
+from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than
+0xff can therefore be influenced by the locale in the same way as before.
+Characters greater than 0xff have only one case, and no "type" (such as letter
or digit).
</P>
<P>
-In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with
-the exception of values in the range 0xd800 to 0xdfff because those are
+In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with
+the exception of values in the range 0xd800 to 0xdfff because those are
"surrogate" values that are used in pairs to encode values greater than 0xffff.
</P>
<P>
-A UTF-16 string can indicate its endianness by special code knows as a
+A UTF-16 string can indicate its endianness by special code knows as a
byte-order mark (BOM). The PCRE functions do not handle this, expecting strings
to be in host byte order. A utility function called
<b>pcre16_utf16_to_host_byte_order()</b> is provided to help with this (see
@@ -304,18 +304,18 @@ above).
</P>
<br><a name="SEC16" href="#TOC1">ERROR NAMES</a><br>
<P>
-The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to
+The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to
their 8-bit counterparts. The error PCRE_ERROR_BADMODE is given when a compiled
pattern is passed to a function that processes patterns in the other
-mode, for example, if a pattern compiled with <b>pcre_compile()</b> is passed to
+mode, for example, if a pattern compiled with <b>pcre_compile()</b> is passed to
<b>pcre16_exec()</b>.
</P>
<P>
There are new error codes whose names begin with PCRE_UTF16_ERR for invalid
-UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that
+UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that
are described in the section entitled
<a href="pcreapi.html#badutf8reasons">"Reason codes for invalid UTF-8 strings"</a>
-in the main
+in the main
<a href="pcreapi.html"><b>pcreapi</b></a>
page. The UTF-16 errors are:
<pre>
@@ -327,8 +327,8 @@ page. The UTF-16 errors are:
</P>
<br><a name="SEC17" href="#TOC1">ERROR TEXTS</a><br>
<P>
-If there is an error while compiling a pattern, the error text that is passed
-back by <b>pcre16_compile()</b> or <b>pcre16_compile2()</b> is still an 8-bit
+If there is an error while compiling a pattern, the error text that is passed
+back by <b>pcre16_compile()</b> or <b>pcre16_compile2()</b> is still an 8-bit
character string, zero-terminated.
</P>
<br><a name="SEC18" href="#TOC1">CALLOUTS</a><br>
@@ -338,23 +338,23 @@ a callout function point to 16-bit vectors.
</P>
<br><a name="SEC19" href="#TOC1">TESTING</a><br>
<P>
-The <b>pcretest</b> program continues to operate with 8-bit input and output
-files, but it can be used for testing the 16-bit library. If it is run with the
-command line option <b>-16</b>, patterns and subject strings are converted from
-8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions
-are used instead of the 8-bit ones. Returned 16-bit strings are converted to
+The <b>pcretest</b> program continues to operate with 8-bit input and output
+files, but it can be used for testing the 16-bit library. If it is run with the
+command line option <b>-16</b>, patterns and subject strings are converted from
+8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions
+are used instead of the 8-bit ones. Returned 16-bit strings are converted to
8-bit for output. If the 8-bit library was not compiled, <b>pcretest</b>
defaults to 16-bit and the <b>-16</b> option is ignored.
</P>
<P>
-When PCRE is being built, the <b>RunTest</b> script that is called by "make
+When PCRE is being built, the <b>RunTest</b> script that is called by "make
check" uses the <b>pcretest</b> <b>-C</b> option to discover which of the 8-bit
and 16-bit libraries has been built, and runs the tests appropriately.
</P>
<br><a name="SEC20" href="#TOC1">NOT SUPPORTED IN 16-BIT MODE</a><br>
<P>
-Not all the features of the 8-bit library are available with the 16-bit
-library. The C++ and POSIX wrapper functions support only the 8-bit library,
+Not all the features of the 8-bit library are available with the 16-bit
+library. The C++ and POSIX wrapper functions support only the 8-bit library,
and the <b>pcregrep</b> program is at present 8-bit only.
</P>
<br><a name="SEC21" href="#TOC1">AUTHOR</a><br>
diff --git a/doc/html/pcre_config.html b/doc/html/pcre_config.html
index 31747b0..dcfb831 100644
--- a/doc/html/pcre_config.html
+++ b/doc/html/pcre_config.html
@@ -41,6 +41,9 @@ point to an unsigned long integer. The available codes are:
<pre>
PCRE_CONFIG_JIT Availability of just-in-time compiler
support (1=yes 0=no)
+ PCRE_CONFIG_JITTARGET String containing information about the
+ target architecture for the JIT compiler,
+ or NULL if there is no JIT support
PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4
PCRE_CONFIG_MATCH_LIMIT Internal resource limit
PCRE_CONFIG_MATCH_LIMIT_RECURSION
@@ -66,7 +69,7 @@ point to an unsigned long integer. The available codes are:
Availability of Unicode property support
(1=yes 0=no)
</pre>
-The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise. That error
+The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise. That error
is also given if PCRE_CONFIG_UTF16 is passed to <b>pcre_config()</b> or if
PCRE_CONFIG_UTF8 is passed to <b>pcre16_config()</b>.
</P>
diff --git a/doc/html/pcre_fullinfo.html b/doc/html/pcre_fullinfo.html
index 49186dd..edb6eb7 100644
--- a/doc/html/pcre_fullinfo.html
+++ b/doc/html/pcre_fullinfo.html
@@ -50,7 +50,7 @@ The following information is available:
PCRE_INFO_HASCRORLF Return 1 if explicit CR or LF matches exist
PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
PCRE_INFO_JIT Return 1 after successful JIT compilation
- PCRE_INFO_JITSIZE Size of JIT compiled code
+ PCRE_INFO_JITSIZE Size of JIT compiled code
PCRE_INFO_LASTLITERAL Literal last data unit required
PCRE_INFO_MINLENGTH Lower bound length of matching strings
PCRE_INFO_NAMECOUNT Number of named subpatterns
diff --git a/doc/html/pcre_jit_stack_alloc.html b/doc/html/pcre_jit_stack_alloc.html
index 72d4b4e..2daac72 100644
--- a/doc/html/pcre_jit_stack_alloc.html
+++ b/doc/html/pcre_jit_stack_alloc.html
@@ -23,7 +23,7 @@ SYNOPSIS
<b>int <i>maxsize</i>);</b>
</P>
<P>
-<b>pcre16_jit_stack *pcre16_jit_stack_alloc(int <i>startsize</i>, </b>
+<b>pcre16_jit_stack *pcre16_jit_stack_alloc(int <i>startsize</i>,</b>
<b>int <i>maxsize</i>);</b>
</P>
<br><b>
diff --git a/doc/html/pcre_pattern_to_host_byte_order.html b/doc/html/pcre_pattern_to_host_byte_order.html
index 3c311de..2fb7f10 100644
--- a/doc/html/pcre_pattern_to_host_byte_order.html
+++ b/doc/html/pcre_pattern_to_host_byte_order.html
@@ -20,7 +20,7 @@ SYNOPSIS
</P>
<P>
<b>int pcre_pattern_to_host_byte_order(pcre *<i>code</i>,</b>
-<b>pcre_extra *<i>extra</i>, const unsigned char *<i>tables</i>); </b>
+<b>pcre_extra *<i>extra</i>, const unsigned char *<i>tables</i>);</b>
</P>
<P>
<b>int pcre16_pattern_to_host_byte_order(pcre16 *<i>code</i>,</b>
@@ -31,8 +31,8 @@ DESCRIPTION
</b><br>
<P>
This function ensures that the bytes in 2-byte and 4-byte values in a compiled
-pattern are in the correct order for the current host. It is useful when a
-pattern that has been compiled on one host is transferred to another that might
+pattern are in the correct order for the current host. It is useful when a
+pattern that has been compiled on one host is transferred to another that might
have different endianness. The arguments are:
<pre>
<i>code</i> A compiled regular expression
diff --git a/doc/html/pcre_utf16_to_host_byte_order.html b/doc/html/pcre_utf16_to_host_byte_order.html
index 5434554..164e236 100644
--- a/doc/html/pcre_utf16_to_host_byte_order.html
+++ b/doc/html/pcre_utf16_to_host_byte_order.html
@@ -20,15 +20,15 @@ SYNOPSIS
</P>
<P>
<b>int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *<i>output</i>,</b>
-<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>host_byte_order</i>, </b>
+<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>host_byte_order</i>,</b>
<b>int <i>keep_boms</i>);</b>
</P>
<br><b>
DESCRIPTION
</b><br>
<P>
-This function, which exists only in the 16-bit library, converts a UTF-16
-string to the correct order for the current host, taking account of any byte
+This function, which exists only in the 16-bit library, converts a UTF-16
+string to the correct order for the current host, taking account of any byte
order marks (BOMs) within the string. Its arguments are:
<pre>
<i>output</i> pointer to output buffer, may be the same as <i>input</i>
diff --git a/doc/html/pcreapi.html b/doc/html/pcreapi.html
index e4566a3..c5b58ff 100644
--- a/doc/html/pcreapi.html
+++ b/doc/html/pcreapi.html
@@ -34,10 +34,11 @@ man page, in case the conversion went wrong.
<li><a name="TOC19" href="#SEC19">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a>
<li><a name="TOC20" href="#SEC20">DUPLICATE SUBPATTERN NAMES</a>
<li><a name="TOC21" href="#SEC21">FINDING ALL POSSIBLE MATCHES</a>
-<li><a name="TOC22" href="#SEC22">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
-<li><a name="TOC23" href="#SEC23">SEE ALSO</a>
-<li><a name="TOC24" href="#SEC24">AUTHOR</a>
-<li><a name="TOC25" href="#SEC25">REVISION</a>
+<li><a name="TOC22" href="#SEC22">OBTAINING AN ESTIMATE OF STACK USAGE</a>
+<li><a name="TOC23" href="#SEC23">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a>
+<li><a name="TOC24" href="#SEC24">SEE ALSO</a>
+<li><a name="TOC25" href="#SEC25">AUTHOR</a>
+<li><a name="TOC26" href="#SEC26">REVISION</a>
</ul>
<P>
<b>#include &#60;pcre.h&#62;</b>
@@ -174,7 +175,7 @@ just use different data types for their arguments and results, and their names
start with <b>pcre16_</b> instead of <b>pcre_</b>. For every option that has UTF8
in its name (for example, PCRE_UTF8), there is a corresponding 16-bit name with
UTF8 replaced by UTF16. This facility is in fact just cosmetic; the 16-bit
-option names define the same bit values.
+option names define the same bit values.
</P>
<P>
References to bytes and UTF-8 in this document should be read as references to
@@ -182,7 +183,7 @@ References to bytes and UTF-8 in this document should be read as references to
specified otherwise. More details of the specific differences for the 16-bit
library are given in the
<a href="pcre16.html"><b>pcre16</b></a>
-page.
+page.
</P>
<br><a name="SEC6" href="#TOC1">PCRE API OVERVIEW</a><br>
<P>
@@ -397,7 +398,7 @@ not recognized. The following information is available:
PCRE_CONFIG_UTF8
</pre>
The output is an integer that is set to one if UTF-8 support is available;
-otherwise it is set to zero. If this option is given to the 16-bit version of
+otherwise it is set to zero. If this option is given to the 16-bit version of
this function, <b>pcre16_config()</b>, the result is PCRE_ERROR_BADOPTION.
<pre>
PCRE_CONFIG_UTF16
@@ -417,6 +418,13 @@ properties is available; otherwise it is set to zero.
The output is an integer that is set to one if support for just-in-time
compiling is available; otherwise it is set to zero.
<pre>
+ PCRE_CONFIG_JITTARGET
+</pre>
+The output is a pointer to a zero-terminated "const char *" string. If JIT
+support is available, the string contains the name of the architecture for
+which the JIT compiler is configured, for example "x86 32bit (little endian +
+unaligned)". If JIT support is not available, the result is NULL.
+<pre>
PCRE_CONFIG_NEWLINE
</pre>
The output is an integer whose value specifies the default character sequence
@@ -738,7 +746,7 @@ preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
that any Unicode newline sequence should be recognized. The Unicode newline
sequences are the three just mentioned, plus the single characters VT (vertical
tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
-separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit
+separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit
library, the last two are recognized only in UTF-8 mode.
</P>
<P>
@@ -808,7 +816,7 @@ page.
<pre>
PCRE_NO_UTF8_CHECK
</pre>
-When PCRE_UTF8 is set, the validity of the pattern as a UTF-8
+When PCRE_UTF8 is set, the validity of the pattern as a UTF-8
string is automatically checked. There is a discussion about the
<a href="pcreunicode.html#utf8strings">validity of UTF-8 strings</a>
in the
@@ -825,7 +833,7 @@ validity checking of subject strings.
<P>
The following table lists the error codes that may be returned by
<b>pcre_compile2()</b>, along with the error messages that may be returned by
-both compiling functions. Note that error messages are always 8-bit ASCII
+both compiling functions. Note that error messages are always 8-bit ASCII
strings, even in 16-bit mode. As PCRE has developed, some error codes have
fallen out of use. To avoid confusion, they have not been re-used.
<pre>
@@ -899,14 +907,14 @@ fallen out of use. To avoid confusion, they have not been re-used.
65 different names for subpatterns of the same number are
not allowed
66 (*MARK) must have an argument
- 67 this version of PCRE is not compiled with Unicode property
+ 67 this version of PCRE is not compiled with Unicode property
support
68 \c must be followed by an ASCII character
69 \k is not followed by a braced, angle-bracketed, or quoted name
70 internal error: unknown opcode in find_fixedlength()
71 \N is not supported in a class
72 too many forward references
- 73 disallowed Unicode code point (&#62;= 0xd800 && &#60;= 0xdfff)
+ 73 disallowed Unicode code point (&#62;= 0xd800 && &#60;= 0xdfff)
74 invalid UTF-16 string (specifically UTF-16)
</pre>
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
@@ -1101,12 +1109,12 @@ the following negative numbers:
PCRE_ERROR_NULL the argument <i>code</i> was NULL
the argument <i>where</i> was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
- PCRE_ERROR_BADENDIANNESS the pattern was compiled with different
+ PCRE_ERROR_BADENDIANNESS the pattern was compiled with different
endianness
PCRE_ERROR_BADOPTION the value of <i>what</i> was invalid
</pre>
The "magic number" is placed at the start of each compiled pattern as a simple
-check against passing an arbitrary memory pointer. The endianness error can
+check against passing an arbitrary memory pointer. The endianness error can
occur if a compiled pattern is saved and reloaded on a different host. Here is
a typical call of <b>pcre_fullinfo()</b>, to obtain the length of the compiled
pattern:
@@ -1150,8 +1158,8 @@ variable.
</P>
<P>
If there is a fixed first value, for example, the letter "c" from a pattern
-such as (cat|cow|coyote), its value is returned. In the 8-bit library, the
-value is always less than 256; in the 16-bit library the value can be up to
+such as (cat|cow|coyote), its value is returned. In the 8-bit library, the
+value is always less than 256; in the 16-bit library the value can be up to
0xffff.
</P>
<P>
@@ -1427,7 +1435,7 @@ fields (not necessarily in this order):
const unsigned char *<i>tables</i>;
unsigned char **<i>mark</i>;
</pre>
-In the 16-bit version of this structure, the <i>mark</i> field has type
+In the 16-bit version of this structure, the <i>mark</i> field has type
"PCRE_UCHAR16 **".
</P>
<P>
@@ -2067,14 +2075,14 @@ documentation for more details.
<pre>
PCRE_ERROR_BADMODE (-28)
</pre>
-This error is given if a pattern that was compiled by the 8-bit library is
+This error is given if a pattern that was compiled by the 8-bit library is
passed to a 16-bit library function, or vice versa.
<pre>
PCRE_ERROR_BADENDIANNESS (-29)
</pre>
-This error is given if a pattern that was compiled and saved is reloaded on a
-host with different endianness. The utility function
-<b>pcre_pattern_to_host_byte_order()</b> can be used to convert such a pattern
+This error is given if a pattern that was compiled and saved is reloaded on a
+host with different endianness. The utility function
+<b>pcre_pattern_to_host_byte_order()</b> can be used to convert such a pattern
so that it runs on the new host.
</P>
<P>
@@ -2084,7 +2092,7 @@ Error numbers -16 to -20 and -22 are not used by <b>pcre_exec()</b>.
Reason codes for invalid UTF-8 strings
</b><br>
<P>
-This section applies only to the 8-bit library. The corresponding information
+This section applies only to the 8-bit library. The corresponding information
for the 16-bit library is given in the
<a href="pcre16.html"><b>pcre16</b></a>
page.
@@ -2374,8 +2382,32 @@ When your callout function is called, extract and save the current matched
substring. Then return 1, which forces <b>pcre_exec()</b> to backtrack and try
other alternatives. Ultimately, when it runs out of matches, <b>pcre_exec()</b>
will yield PCRE_ERROR_NOMATCH.
+</P>
+<br><a name="SEC22" href="#TOC1">OBTAINING AN ESTIMATE OF STACK USAGE</a><br>
+<P>
+Matching certain patterns using <b>pcre_exec()</b> can use a lot of process
+stack, which in certain environments can be rather limited in size. Some users
+find it helpful to have an estimate of the amount of stack that is used by
+<b>pcre_exec()</b>, to help them set recursion limits, as described in the
+<a href="pcrestack.html"><b>pcrestack</b></a>
+documentation. The estimate that is output by <b>pcretest</b> when called with
+the <b>-m</b> and <b>-C</b> options is obtained by calling <b>pcre_exec()</b> with
+the values NULL, NULL, NULL, -999, and -999 for its first five arguments.
+</P>
+<P>
+Normally, if its first argument is NULL, <b>pcre_exec()</b> immediately returns
+the negative error code PCRE_ERROR_NULL, but with this special combination of
+arguments, it returns instead a negative number whose absolute value is the
+approximate stack frame size in bytes. (A negative number is used so that it is
+clear that no match has happened.) The value is approximate because in some
+cases, recursive calls to <b>pcre_exec()</b> occur when there are one or two
+additional variables on the stack.
+</P>
+<P>
+If PCRE has been compiled to use the heap instead of the stack for recursion,
+the value returned is the size of each block that is obtained from the heap.
<a name="dfamatch"></a></P>
-<br><a name="SEC22" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
+<br><a name="SEC23" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br>
<P>
<b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b>
<b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b>
@@ -2550,13 +2582,13 @@ recursively, using private vectors for <i>ovector</i> and <i>workspace</i>. This
error is given if the output vector is not large enough. This should be
extremely rare, as a vector of size 1000 is used.
</P>
-<br><a name="SEC23" href="#TOC1">SEE ALSO</a><br>
+<br><a name="SEC24" href="#TOC1">SEE ALSO</a><br>
<P>
<b>pcre16</b>(3), <b>pcrebuild</b>(3), <b>pcrecallout</b>(3), <b>pcrecpp</b>(3),
<b>pcrematching</b>(3), <b>pcrepartial</b>(3), <b>pcreposix</b>(3),
<b>pcreprecompile</b>(3), <b>pcresample</b>(3), <b>pcrestack</b>(3).
</P>
-<br><a name="SEC24" href="#TOC1">AUTHOR</a><br>
+<br><a name="SEC25" href="#TOC1">AUTHOR</a><br>
<P>
Philip Hazel
<br>
@@ -2565,9 +2597,9 @@ University Computing Service
Cambridge CB2 3QH, England.
<br>
</P>
-<br><a name="SEC25" href="#TOC1">REVISION</a><br>
+<br><a name="SEC26" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 07 January 2012
+Last updated: 21 January 2012
<br>
Copyright &copy; 1997-2012 University of Cambridge.
<br>
diff --git a/doc/html/pcrebuild.html b/doc/html/pcrebuild.html
index eee9a74..8faed44 100644
--- a/doc/html/pcrebuild.html
+++ b/doc/html/pcrebuild.html
@@ -66,11 +66,11 @@ exists as well, but as it specifies the default, it is not described.
</P>
<br><a name="SEC2" href="#TOC1">BUILDING 8-BIT and 16-BIT LIBRARIES</a><br>
<P>
-By default, a library called <b>libpcre</b> is built, containing functions that
-take string arguments contained in vectors of bytes, either as single-byte
+By default, a library called <b>libpcre</b> is built, containing functions that
+take string arguments contained in vectors of bytes, either as single-byte
characters, or interpreted as UTF-8 strings. You can also build a separate
-library, called <b>libpcre16</b>, in which strings are contained in vectors of
-16-bit data units and interpreted either as single-unit characters or UTF-16
+library, called <b>libpcre16</b>, in which strings are contained in vectors of
+16-bit data units and interpreted either as single-unit characters or UTF-16
strings, by adding
<pre>
--enable-pcre16
@@ -97,7 +97,7 @@ to the <b>configure</b> command, as required.
<P>
By default, if the 8-bit library is being built, the <b>configure</b> script
will search for a C++ compiler and C++ header files. If it finds them, it
-automatically builds the C++ wrapper library (which supports only 8-bit
+automatically builds the C++ wrapper library (which supports only 8-bit
strings). You can disable this by adding
<pre>
--disable-cpp
@@ -122,7 +122,7 @@ configuration. (For backwards compatibility, --enable-utf8 is a synonym of
<P>
Of itself, this setting does not make PCRE treat strings as UTF-8 or UTF-16. As
well as compiling PCRE with this option, you also have to set the
-PCRE_UTF8 or PCRE_UTF16 option when you call one of the pattern compiling
+PCRE_UTF8 or PCRE_UTF16 option when you call one of the pattern compiling
functions.
</P>
<P>
diff --git a/doc/html/pcrecallout.html b/doc/html/pcrecallout.html
index 75bda4b..8076cee 100644
--- a/doc/html/pcrecallout.html
+++ b/doc/html/pcrecallout.html
@@ -31,7 +31,7 @@ man page, in case the conversion went wrong.
PCRE provides a feature called "callout", which is a means of temporarily
passing control to the caller of PCRE in the middle of pattern matching. The
caller of PCRE provides an external function by putting its entry point in the
-global variable <i>pcre_callout</i> (<i>pcre16_callout</i> for the 16-bit
+global variable <i>pcre_callout</i> (<i>pcre16_callout</i> for the 16-bit
library). By default, this variable contains NULL, which disables all calling
out.
</P>
@@ -105,7 +105,7 @@ These structures contains the following fields:
int <i>callout_number</i>;
int *<i>offset_vector</i>;
const char *<i>subject</i>; (8-bit version)
- PCRE_SPTR16 <i>subject</i>; (16-bit version)
+ PCRE_SPTR16 <i>subject</i>; (16-bit version)
int <i>subject_length</i>;
int <i>start_match</i>;
int <i>current_position</i>;
@@ -129,7 +129,7 @@ automatically generated callouts).
</P>
<P>
The <i>offset_vector</i> field is a pointer to the vector of offsets that was
-passed by the caller to the matching function. When <b>pcre_exec()</b> or
+passed by the caller to the matching function. When <b>pcre_exec()</b> or
<b>pcre16_exec()</b> is used, the contents can be inspected, in order to extract
substrings that have been matched so far, in the same way as for extracting
substrings after a match has completed. For the DFA matching functions, this
diff --git a/doc/html/pcrecpp.html b/doc/html/pcrecpp.html
index 2c5879a..06518da 100644
--- a/doc/html/pcrecpp.html
+++ b/doc/html/pcrecpp.html
@@ -35,7 +35,7 @@ man page, in case the conversion went wrong.
The C++ wrapper for PCRE was provided by Google Inc. Some additional
functionality was added by Giuseppe Maxia. This brief man page was constructed
from the notes in the <i>pcrecpp.h</i> file, which should be consulted for
-further details. Note that the C++ wrapper supports only the original 8-bit
+further details. Note that the C++ wrapper supports only the original 8-bit
PCRE library. There is no 16-bit support at present.
</P>
<br><a name="SEC3" href="#TOC1">MATCHING INTERFACE</a><br>
diff --git a/doc/html/pcrejit.html b/doc/html/pcrejit.html
index a604157..7b23edb 100644
--- a/doc/html/pcrejit.html
+++ b/doc/html/pcrejit.html
@@ -45,10 +45,10 @@ this support was written by Zoltan Herczeg.
</P>
<br><a name="SEC2" href="#TOC1">8-BIT and 16-BIT SUPPORT</a><br>
<P>
-JIT support is available for both the 8-bit and 16-bit PCRE libraries. To keep
-this documentation simple, only the 8-bit interface is described in what
-follows. If you are using the 16-bit library, substitute the 16-bit functions
-and 16-bit structures (for example, <i>pcre16_jit_stack</i> instead of
+JIT support is available for both the 8-bit and 16-bit PCRE libraries. To keep
+this documentation simple, only the 8-bit interface is described in what
+follows. If you are using the 16-bit library, substitute the 16-bit functions
+and 16-bit structures (for example, <i>pcre16_jit_stack</i> instead of
<i>pcre_jit_stack</i>).
</P>
<br><a name="SEC3" href="#TOC1">AVAILABILITY OF JIT SUPPORT</a><br>
diff --git a/doc/html/pcrematching.html b/doc/html/pcrematching.html
index 5cd41ae..6abd17e 100644
--- a/doc/html/pcrematching.html
+++ b/doc/html/pcrematching.html
@@ -28,13 +28,13 @@ This document describes the two different algorithms that are available in PCRE
for matching a compiled regular expression against a given subject string. The
"standard" algorithm is the one provided by the <b>pcre_exec()</b> and
<b>pcre16_exec()</b> functions. These work in the same way as Perl's matching
-function, and provide a Perl-compatible matching operation. The just-in-time
+function, and provide a Perl-compatible matching operation. The just-in-time
(JIT) optimization that is described in the
<a href="pcrejit.html"><b>pcrejit</b></a>
documentation is compatible with these functions.
</P>
<P>
-An alternative algorithm is provided by the <b>pcre_dfa_exec()</b> and
+An alternative algorithm is provided by the <b>pcre_dfa_exec()</b> and
<b>pcre16_dfa_exec()</b> functions; they operate in a different way, and are not
Perl-compatible. This alternative has advantages and disadvantages compared
with the standard algorithm, and these are described below.
diff --git a/doc/html/pcrepartial.html b/doc/html/pcrepartial.html
index 40cdf97..989ce38 100644
--- a/doc/html/pcrepartial.html
+++ b/doc/html/pcrepartial.html
@@ -50,7 +50,7 @@ long and is not all available at once.
</P>
<P>
PCRE supports partial matching by means of the PCRE_PARTIAL_SOFT and
-PCRE_PARTIAL_HARD options, which can be set when calling any of the matching
+PCRE_PARTIAL_HARD options, which can be set when calling any of the matching
functions. For backwards compatibility, PCRE_PARTIAL is a synonym for
PCRE_PARTIAL_SOFT. The essential difference between the two options is whether
or not a partial match is preferred to an alternative complete match, though
@@ -70,7 +70,7 @@ strings. This optimization is also disabled for partial matching.
</P>
<br><a name="SEC2" href="#TOC1">PARTIAL MATCHING USING pcre_exec() OR pcre16_exec()</a><br>
<P>
-A partial match occurs during a call to <b>pcre_exec()</b> or
+A partial match occurs during a call to <b>pcre_exec()</b> or
<b>pcre16_exec()</b> when the end of the subject string is reached successfully,
but matching cannot continue because more characters are needed. However, at
least one character in the subject must have been inspected. This character
@@ -144,7 +144,8 @@ because it prefers an earlier partial match over a later complete match. For
this reason, the assumption is made that the end of the supplied subject string
may not be the true end of the available data, and so, if \z, \Z, \b, \B,
or $ are encountered at the end of the subject, the result is
-PCRE_ERROR_PARTIAL.
+PCRE_ERROR_PARTIAL, provided that at least one character in the subject has
+been inspected.
</P>
<P>
Setting PCRE_PARTIAL_HARD also affects the way UTF-8 and UTF-16
@@ -294,7 +295,7 @@ program to do that if it needs to.
<P>
You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with
PCRE_DFA_RESTART to continue partial matching over multiple segments. This
-facility can be used to pass very long subject strings to the DFA matching
+facility can be used to pass very long subject strings to the DFA matching
functions.
</P>
<br><a name="SEC8" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre_exec() OR pcre16_exec()</a><br>
@@ -434,7 +435,7 @@ Cambridge CB2 3QH, England.
</P>
<br><a name="SEC11" href="#TOC1">REVISION</a><br>
<P>
-Last updated: 08 January 2012
+Last updated: 21 January 2012
<br>
Copyright &copy; 1997-2012 University of Cambridge.
<br>
diff --git a/doc/html/pcrepattern.html b/doc/html/pcrepattern.html
index 26c23f2..1dca37a 100644
--- a/doc/html/pcrepattern.html
+++ b/doc/html/pcrepattern.html
@@ -65,11 +65,11 @@ there is now also support for UTF-8 strings in the original library, and a
second library that supports 16-bit and UTF-16 character strings. To use these
features, PCRE must be built to include appropriate support. When using UTF
strings you must either call the compiling function with the PCRE_UTF8 or
-PCRE_UTF16 option, or the pattern must start with one of these special
+PCRE_UTF16 option, or the pattern must start with one of these special
sequences:
<pre>
(*UTF8)
- (*UTF16)
+ (*UTF16)
</pre>
Starting a pattern with such a sequence is equivalent to setting the relevant
option. This feature is not Perl-compatible. How setting a UTF mode affects
@@ -292,7 +292,7 @@ between \x{ and }, but the character code is constrained as follows:
16-bit non-UTF mode less than 0x10000
16-bit UTF-16 mode less than 0x10ffff and a valid codepoint
</pre>
-Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-called
+Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-called
"surrogate" codepoints).
</P>
<P>
@@ -335,7 +335,7 @@ following the discussion of
Inside a character class, or if the decimal number is greater than 9 and there
have not been that many capturing subpatterns, PCRE re-reads up to three octal
digits following the backslash, and uses them to generate a data character. Any
-subsequent digits stand for themselves. The value of the character is
+subsequent digits stand for themselves. The value of the character is
constrained in the same way as characters specified in hexadecimal.
For example:
<pre>
@@ -503,8 +503,8 @@ The vertical space characters are:
U+2028 Line separator
U+2029 Paragraph separator
</pre>
-In 8-bit, non-UTF-8 mode, only the characters with codepoints less than 256 are
-relevant.
+In 8-bit, non-UTF-8 mode, only the characters with codepoints less than 256 are
+relevant.
<a name="newlineseq"></a></P>
<br><b>
Newline sequences
@@ -970,7 +970,7 @@ end of the subject in both modes, and if all branches of a pattern start with
<P>
Outside a character class, a dot in the pattern matches any one character in
the subject string except (by default) a character that signifies the end of a
-line.
+line.
</P>
<P>
When a line ending is defined as a single character, dot never matches that
@@ -1103,7 +1103,7 @@ followed by two other characters. The octal or hexadecimal representation of
</P>
<P>
Ranges operate in the collating sequence of character values. They can also be
-used for characters specified numerically, for example [\000-\037]. Ranges
+used for characters specified numerically, for example [\000-\037]. Ranges
can include any characters that are valid for the current mode.
</P>
<P>
@@ -1298,8 +1298,8 @@ match "cataract", "erpillar" or an empty string.
<br>
2. It sets up the subpattern as a capturing subpattern. This means that, when
the whole pattern matches, that portion of the subject string that matched the
-subpattern is passed back to the caller via the <i>ovector</i> argument of the
-matching function. (This applies only to the traditional matching functions;
+subpattern is passed back to the caller via the <i>ovector</i> argument of the
+matching function. (This applies only to the traditional matching functions;
the DFA matching functions do not support capturing.)
</P>
<P>
@@ -2505,7 +2505,7 @@ same pair of parentheses when there is a repetition.
<P>
PCRE provides a similar feature, but of course it cannot obey arbitrary Perl
code. The feature is called "callout". The caller of PCRE provides an external
-function by putting its entry point in the global variable <i>pcre_callout</i>
+function by putting its entry point in the global variable <i>pcre_callout</i>
(8-bit library) or <i>pcre16_callout</i> (16-bit library). By default, this
variable contains NULL, which disables all calling out.
</P>
diff --git a/doc/html/pcreposix.html b/doc/html/pcreposix.html
index 637305d..9aa699a 100644
--- a/doc/html/pcreposix.html
+++ b/doc/html/pcreposix.html
@@ -48,7 +48,7 @@ This set of functions provides a POSIX-style API for the PCRE regular
expression 8-bit library. See the
<a href="pcreapi.html"><b>pcreapi</b></a>
documentation for a description of PCRE's native API, which contains much
-additional functionality. There is no POSIX-style wrapper for PCRE's 16-bit
+additional functionality. There is no POSIX-style wrapper for PCRE's 16-bit
library.
</P>
<P>
diff --git a/doc/html/pcreprecompile.html b/doc/html/pcreprecompile.html
index cbc2812..8361b7a 100644
--- a/doc/html/pcreprecompile.html
+++ b/doc/html/pcreprecompile.html
@@ -120,7 +120,7 @@ documentation.
</P>
<P>
If you did not provide custom character tables when the pattern was compiled,
-the pointer in the compiled pattern is NULL, which causes the matching
+the pointer in the compiled pattern is NULL, which causes the matching
functions to use PCRE's internal tables. Thus, you do not need to take any
special action at run time in this case.
</P>
diff --git a/doc/html/pcrestack.html b/doc/html/pcrestack.html
index 8bf8c92..76101b3 100644
--- a/doc/html/pcrestack.html
+++ b/doc/html/pcrestack.html
@@ -130,9 +130,9 @@ documentation.
</P>
<P>
As a very rough rule of thumb, you should reckon on about 500 bytes per
-recursion. Thus, if you want to limit your stack usage to 8Mb, you
-should set the limit at 16000 recursions. A 64Mb stack, on the other hand, can
-support around 128000 recursions.
+recursion. Thus, if you want to limit your stack usage to 8Mb, you should set
+the limit at 16000 recursions. A 64Mb stack, on the other hand, can support
+around 128000 recursions.
</P>
<P>
In Unix-like environments, the <b>pcretest</b> test program has a command line
@@ -143,6 +143,32 @@ string. This is done by calling <b>pcre[16]_exec()</b> repeatedly with different
limits.
</P>
<br><b>
+Obtaining an estimate of stack usage
+</b><br>
+<P>
+The actual amount of stack used per recursion can vary quite a lot, depending
+on the compiler that was used to build PCRE and the optimization or debugging
+options that were set for it. The rule of thumb value of 500 bytes mentioned
+above may be larger or smaller than what is actually needed. A better
+approximation can be obtained by running this command:
+<pre>
+ pcretest -m -C
+</pre>
+The <b>-C</b> option causes <b>pcretest</b> to output information about the
+options with which PCRE was compiled. When <b>-m</b> is also given (before
+<b>-C</b>), information about stack use is given in a line like this:
+<pre>
+ Match recursion uses stack: approximate frame size = 640 bytes
+</pre>
+The value is approximate because some recursions need a bit more (up to perhaps
+16 more bytes).
+</P>
+<P>
+If the above command is given when PCRE is compiled to use the heap instead of
+the stack for recursion, the value that is output is the size of each block
+that is obtained from the heap.
+</P>
+<br><b>
Changing stack size in Unix-like systems
</b><br>
<P>
@@ -190,7 +216,7 @@ Cambridge CB2 3QH, England.
REVISION
</b><br>
<P>
-Last updated: 10 January 2012
+Last updated: 21 January 2012
<br>
Copyright &copy; 1997-2012 University of Cambridge.
<br>
diff --git a/doc/html/pcresyntax.html b/doc/html/pcresyntax.html
index 0e7d364..5181d7a 100644
--- a/doc/html/pcresyntax.html
+++ b/doc/html/pcresyntax.html
@@ -448,12 +448,12 @@ pattern is not anchored.
<pre>
(*COMMIT) overall failure, no advance of starting point
(*PRUNE) advance to next starting character
- (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE)
+ (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE)
(*SKIP) advance to current matching position
(*SKIP:NAME) advance to position corresponding to an earlier
- (*MARK:NAME); if not found, the (*SKIP) is ignored
+ (*MARK:NAME); if not found, the (*SKIP) is ignored
(*THEN) local failure, backtrack to next alternation
- (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN)
+ (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN)
</PRE>
</P>
<br><a name="SEC22" href="#TOC1">NEWLINE CONVENTIONS</a><br>
diff --git a/doc/html/pcretest.html b/doc/html/pcretest.html
index 6915115..a88dbd0 100644
--- a/doc/html/pcretest.html
+++ b/doc/html/pcretest.html
@@ -52,26 +52,26 @@ options and exactly what is output.
</P>
<br><a name="SEC2" href="#TOC1">PCRE's 8-BIT and 16-BIT LIBRARIES</a><br>
<P>
-From release 8.30, two separate PCRE libraries can be built. The original one
-supports 8-bit character strings, whereas the newer 16-bit library supports
-character strings encoded in 16-bit units. The <b>pcretest</b> program can be
+From release 8.30, two separate PCRE libraries can be built. The original one
+supports 8-bit character strings, whereas the newer 16-bit library supports
+character strings encoded in 16-bit units. The <b>pcretest</b> program can be
used to test both libraries. However, it is itself still an 8-bit program,
reading 8-bit input and writing 8-bit output. When testing the 16-bit library,
the patterns and data strings are converted to 16-bit format before being
-passed to the PCRE library functions. Results are converted to 8-bit for
+passed to the PCRE library functions. Results are converted to 8-bit for
output.
</P>
<P>
-References to functions and structures of the form <b>pcre[16]_xx</b> below
-mean "<b>pcre_xx</b> when using the 8-bit library or <b>pcre16_xx</b> when using
+References to functions and structures of the form <b>pcre[16]_xx</b> below
+mean "<b>pcre_xx</b> when using the 8-bit library or <b>pcre16_xx</b> when using
the 16-bit library".
</P>
<br><a name="SEC3" href="#TOC1">COMMAND LINE OPTIONS</a><br>
<P>
<b>-16</b>
-If both the 8-bit and the 16-bit libraries have been built, this option causes
-the 16-bit library to be used. If only the 16-bit library has been built, this
-is the default (so has no effect). If only the 8-bit library has been built,
+If both the 8-bit and the 16-bit libraries have been built, this option causes
+the 16-bit library to be used. If only the 16-bit library has been built, this
+is the default (so has no effect). If only the 8-bit library has been built,
this option causes an error.
</P>
<P>
@@ -82,25 +82,25 @@ internal form is output after compilation.
<P>
<b>-C</b>
Output the version number of the PCRE library, and all available information
-about the optional features that are included, and then exit. All other options
+about the optional features that are included, and then exit. All other options
are ignored.
</P>
<P>
<b>-C</b> <i>option</i>
-Output information about a specific build-time option, then exit. This
-functionality is intended for use in scripts such as <b>RunTest</b>. The
+Output information about a specific build-time option, then exit. This
+functionality is intended for use in scripts such as <b>RunTest</b>. The
following options output the value indicated:
<pre>
linksize the internal link size (2, 3, or 4)
- newline the default newline setting:
- CR, LF, CRLF, ANYCRLF, or ANY
+ newline the default newline setting:
+ CR, LF, CRLF, ANYCRLF, or ANY
</pre>
The following options output 1 for true or zero for false:
<pre>
jit just-in-time support is available
pcre16 the 16-bit library was built
pcre8 the 8-bit library was built
- ucp Unicode property support is available
+ ucp Unicode property support is available
utf UTF-8 and/or UTF-16 support is available
</PRE>
</P>
@@ -134,7 +134,7 @@ calling <b>pcre[16]_exec()</b> repeatedly with different limits.
<P>
<b>-m</b>
Output the size of each compiled pattern after it has been compiled. This is
-equivalent to adding <b>/M</b> to each regular expression. The size is given in
+equivalent to adding <b>/M</b> to each regular expression. The size is given in
bytes for both libraries.
</P>
<P>
@@ -172,7 +172,7 @@ result of studying is not included when studying is caused only by <b>-s</b> and
neither <b>-i</b> nor <b>-d</b> is present on the command line. This behaviour
means that the output from tests that are run with and without <b>-s</b> should
be identical, except when options that output information about the actual
-running of a match are set.
+running of a match are set.
<br>
<br>
The <b>-M</b>, <b>-t</b>, and <b>-tm</b> options, which give information about
@@ -276,7 +276,7 @@ effect as they do in Perl. For example:
The following table shows additional modifiers for setting PCRE compile-time
options that do not correspond to anything in Perl:
<pre>
- <b>/8</b> PCRE_UTF8 ) when using the 8-bit
+ <b>/8</b> PCRE_UTF8 ) when using the 8-bit
<b>/?</b> PCRE_NO_UTF8_CHECK ) library
<b>/8</b> PCRE_UTF16 ) when using the 16-bit
@@ -309,7 +309,7 @@ This example sets multiline matching with CRLF as the line ending sequence:
</pre>
As well as turning on the PCRE_UTF8/16 option, the <b>/8</b> modifier causes
all non-printing characters in output strings to be printed using the
-\x{hh...} notation. Otherwise, those less than 0x100 are output in hex without
+\x{hh...} notation. Otherwise, those less than 0x100 are output in hex without
the curly brackets.
</P>
<P>
@@ -661,7 +661,7 @@ substring is shown as "&#60;unset&#62;", as for the second data line.
2: b
</pre>
If the strings contain any non-printing characters, they are output as \xhh
-escapes if the value is less than 256 and UTF mode is not set. Otherwise they
+escapes if the value is less than 256 and UTF mode is not set. Otherwise they
are output as \x{hh...} escapes. See below for the definition of non-printing
characters. If the pattern has the <b>/+</b> modifier, the output for substring
0 is followed by the rest of the subject string, identified by "0+" like
@@ -881,15 +881,15 @@ been loaded, <b>pcretest</b> proceeds to read data lines in the usual way.
You can copy a file written by <b>pcretest</b> to a different host and reload it
there, even if the new host has opposite endianness to the one on which the
pattern was compiled. For example, you can compile on an i86 machine and run on
-a SPARC machine. When a pattern is reloaded on a host with different
+a SPARC machine. When a pattern is reloaded on a host with different
endianness, the confirmation message is changed to:
<pre>
Compiled pattern (byte-inverted) loaded from /some/file
</pre>
-The test suite contains some saved pre-compiled patterns with different
-endianness. These are reloaded using "&#60;!" instead of just "&#60;". This suppresses
-the "(byte-inverted)" text so that the output is the same on all hosts. It also
-forces debugging output once the pattern has been reloaded.
+The test suite contains some saved pre-compiled patterns with different
+endianness. These are reloaded using "&#60;!" instead of just "&#60;". This suppresses
+the "(byte-inverted)" text so that the output is the same on all hosts. It also
+forces debugging output once the pattern has been reloaded.
</P>
<P>
File names for saving and reloading can be absolute or relative, but note that
diff --git a/doc/html/pcreunicode.html b/doc/html/pcreunicode.html
index bacde25..e3c6d58 100644
--- a/doc/html/pcreunicode.html
+++ b/doc/html/pcreunicode.html
@@ -17,7 +17,7 @@ UTF-8, UTF-16, AND UNICODE PROPERTY SUPPORT
</b><br>
<P>
From Release 8.30, in addition to its previous UTF-8 support, PCRE also
-supports UTF-16 by means of a separate 16-bit library. This can be built as
+supports UTF-16 by means of a separate 16-bit library. This can be built as
well as, or instead of, the 8-bit library.
</P>
<br><b>
@@ -82,7 +82,7 @@ range U+0 to U+10FFFF, excluding U+D800 to U+DFFF.
</P>
<P>
The excluded code points are the "Surrogate Area" of Unicode. They are reserved
-for use by UTF-16, where they are used in pairs to encode codepoints with
+for use by UTF-16, where they are used in pairs to encode codepoints with
values greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
are available independently in the UTF-8 encoding. (In other words, the whole
surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8.)
@@ -161,7 +161,7 @@ two-byte characters for values greater than \177.
data units, for example: \x{100}{3}.
</P>
<P>
-4. The dot metacharacter matches one UTF character instead of a single data
+4. The dot metacharacter matches one UTF character instead of a single data
unit.
</P>
<P>
@@ -179,7 +179,7 @@ be carried out by the normal interpretive function.
<P>
6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly
test characters of any code value, but, by default, the characters that PCRE
-recognizes as digits, spaces, or word characters remain the same set as in
+recognizes as digits, spaces, or word characters remain the same set as in
non-UTF mode, all with values less than 256. This remains true even when PCRE
is built to include Unicode property support, because to do otherwise would
slow down PCRE in many common cases. Note in particular that this applies to
diff --git a/doc/pcre-config.1 b/doc/pcre-config.1
index 445fe18..6f2c48f 100644
--- a/doc/pcre-config.1
+++ b/doc/pcre-config.1
@@ -6,7 +6,7 @@ pcre-config - program to return PCRE configuration
.sp
.B pcre-config [--prefix] [--exec-prefix] [--version] [--libs]
.ti +5n
-.B [--libs16] [--libs-cpp] [--libs-posix] [--cflags]
+.B [--libs16] [--libs-cpp] [--libs-posix] [--cflags]
.ti +5n
.B [--cflags-posix]
.
@@ -15,9 +15,9 @@ pcre-config - program to return PCRE configuration
.rs
.sp
\fBpcre-config\fP returns the configuration of the installed PCRE
-libraries and the options required to compile a program to use them. Some of
-the options apply only to the 8-bit or 16-bit libraries, respectively, and are
-not available if only one of those libraries has been built. If an unavailable
+libraries and the options required to compile a program to use them. Some of
+the options apply only to the 8-bit or 16-bit libraries, respectively, and are
+not available if only one of those libraries has been built. If an unavailable
option is encountered, the "usage" information is output.
.
.
diff --git a/doc/pcre.3 b/doc/pcre.3
index 8d46dd9..e4551c8 100644
--- a/doc/pcre.3
+++ b/doc/pcre.3
@@ -11,11 +11,11 @@ appeared in Perl are also available using the Python syntax, there is some
support for one or two .NET and Oniguruma syntax items, and there is an option
for requesting some minor changes that give better JavaScript compatibility.
.P
-Starting with release 8.30, it is possible to compile two separate PCRE
+Starting with release 8.30, it is possible to compile two separate PCRE
libraries: the original, which supports 8-bit character strings (including
UTF-8 strings), and a second library that supports 16-bit character strings
(including UTF-16 strings). The build process allows either one or both to be
-built. The majority of the work to make this possible was done by Zoltan
+built. The majority of the work to make this possible was done by Zoltan
Herczeg.
.P
The two libraries contain identical sets of functions, except that the names in
@@ -26,8 +26,8 @@ library described separately in the
.\" HREF
\fBpcre16\fP
.\"
-page. References to functions or structures of the form \fIpcre[16]_xxx\fP
-should be read as meaning "\fIpcre_xxx\fP when using the 8-bit library and
+page. References to functions or structures of the form \fIpcre[16]_xxx\fP
+should be read as meaning "\fIpcre_xxx\fP when using the 8-bit library and
\fIpcre16_xxx\fP when using the 16-bit library".
.P
The current implementation of PCRE corresponds approximately with Perl 5.12,
@@ -106,7 +106,7 @@ all the sections, except the \fBpcredemo\fP section, are concatenated, for ease
of searching. The sections are as follows:
.sp
pcre this document
- pcre16 details of the 16-bit library
+ pcre16 details of the 16-bit library
pcre-config show PCRE installation configuration information
pcreapi details of PCRE's native C API
pcrebuild options for building PCRE
diff --git a/doc/pcre.txt b/doc/pcre.txt
index 6740394..c9a7b2e 100644
--- a/doc/pcre.txt
+++ b/doc/pcre.txt
@@ -138,8 +138,8 @@ REVISION
Last updated: 10 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRE(3) PCRE(3)
@@ -463,8 +463,8 @@ REVISION
Last updated: 08 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREBUILD(3) PCREBUILD(3)
@@ -859,8 +859,8 @@ REVISION
Last updated: 07 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREMATCHING(3) PCREMATCHING(3)
@@ -1066,8 +1066,8 @@ REVISION
Last updated: 08 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREAPI(3) PCREAPI(3)
@@ -1405,6 +1405,14 @@ CHECKING BUILD-TIME OPTIONS
The output is an integer that is set to one if support for just-in-time
compiling is available; otherwise it is set to zero.
+ PCRE_CONFIG_JITTARGET
+
+ The output is a pointer to a zero-terminated "const char *" string. If
+ JIT support is available, the string contains the name of the architec-
+ ture for which the JIT compiler is configured, for example "x86 32bit
+ (little endian + unaligned)". If JIT support is not available, the
+ result is NULL.
+
PCRE_CONFIG_NEWLINE
The output is an integer whose value specifies the default character
@@ -3255,6 +3263,31 @@ FINDING ALL POSSIBLE MATCHES
matches, pcre_exec() will yield PCRE_ERROR_NOMATCH.
+OBTAINING AN ESTIMATE OF STACK USAGE
+
+ Matching certain patterns using pcre_exec() can use a lot of process
+ stack, which in certain environments can be rather limited in size.
+ Some users find it helpful to have an estimate of the amount of stack
+ that is used by pcre_exec(), to help them set recursion limits, as
+ described in the pcrestack documentation. The estimate that is output
+ by pcretest when called with the -m and -C options is obtained by call-
+ ing pcre_exec with the values NULL, NULL, NULL, -999, and -999 for its
+ first five arguments.
+
+ Normally, if its first argument is NULL, pcre_exec() immediately
+ returns the negative error code PCRE_ERROR_NULL, but with this special
+ combination of arguments, it returns instead a negative number whose
+ absolute value is the approximate stack frame size in bytes. (A nega-
+ tive number is used so that it is clear that no match has happened.)
+ The value is approximate because in some cases, recursive calls to
+ pcre_exec() occur when there are one or two additional variables on the
+ stack.
+
+ If PCRE has been compiled to use the heap instead of the stack for
+ recursion, the value returned is the size of each block that is
+ obtained from the heap.
+
+
MATCHING A PATTERN: THE ALTERNATIVE FUNCTION
int pcre_dfa_exec(const pcre *code, const pcre_extra *extra,
@@ -3436,11 +3469,11 @@ AUTHOR
REVISION
- Last updated: 07 January 2012
+ Last updated: 21 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECALLOUT(3) PCRECALLOUT(3)
@@ -3638,8 +3671,8 @@ REVISION
Last updated: 08 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECOMPAT(3) PCRECOMPAT(3)
@@ -3813,8 +3846,8 @@ REVISION
Last updated: 08 Januray 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPATTERN(3) PCREPATTERN(3)
@@ -6418,8 +6451,8 @@ REVISION
Last updated: 09 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRESYNTAX(3) PCRESYNTAX(3)
@@ -6794,8 +6827,8 @@ REVISION
Last updated: 10 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREUNICODE(3) PCREUNICODE(3)
@@ -6992,8 +7025,8 @@ REVISION
Last updated: 13 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREJIT(3) PCREJIT(3)
@@ -7348,8 +7381,8 @@ REVISION
Last updated: 08 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPARTIAL(3) PCREPARTIAL(3)
@@ -7469,111 +7502,112 @@ PARTIAL MATCHING USING pcre_exec() OR pcre16_exec()
plete match. For this reason, the assumption is made that the end of
the supplied subject string may not be the true end of the available
data, and so, if \z, \Z, \b, \B, or $ are encountered at the end of the
- subject, the result is PCRE_ERROR_PARTIAL.
+ subject, the result is PCRE_ERROR_PARTIAL, provided that at least one
+ character in the subject has been inspected.
Setting PCRE_PARTIAL_HARD also affects the way UTF-8 and UTF-16 subject
- strings are checked for validity. Normally, an invalid sequence causes
- the error PCRE_ERROR_BADUTF8 or PCRE_ERROR_BADUTF16. However, in the
- special case of a truncated character at the end of the subject,
- PCRE_ERROR_SHORTUTF8 or PCRE_ERROR_SHORTUTF16 is returned when
+ strings are checked for validity. Normally, an invalid sequence causes
+ the error PCRE_ERROR_BADUTF8 or PCRE_ERROR_BADUTF16. However, in the
+ special case of a truncated character at the end of the subject,
+ PCRE_ERROR_SHORTUTF8 or PCRE_ERROR_SHORTUTF16 is returned when
PCRE_PARTIAL_HARD is set.
Comparing hard and soft partial matching
- The difference between the two partial matching options can be illus-
+ The difference between the two partial matching options can be illus-
trated by a pattern such as:
/dog(sbody)?/
- This matches either "dog" or "dogsbody", greedily (that is, it prefers
- the longer string if possible). If it is matched against the string
- "dog" with PCRE_PARTIAL_SOFT, it yields a complete match for "dog".
+ This matches either "dog" or "dogsbody", greedily (that is, it prefers
+ the longer string if possible). If it is matched against the string
+ "dog" with PCRE_PARTIAL_SOFT, it yields a complete match for "dog".
However, if PCRE_PARTIAL_HARD is set, the result is PCRE_ERROR_PARTIAL.
- On the other hand, if the pattern is made ungreedy the result is dif-
+ On the other hand, if the pattern is made ungreedy the result is dif-
ferent:
/dog(sbody)??/
- In this case the result is always a complete match because that is
- found first, and matching never continues after finding a complete
+ In this case the result is always a complete match because that is
+ found first, and matching never continues after finding a complete
match. It might be easier to follow this explanation by thinking of the
two patterns like this:
/dog(sbody)?/ is the same as /dogsbody|dog/
/dog(sbody)??/ is the same as /dog|dogsbody/
- The second pattern will never match "dogsbody", because it will always
+ The second pattern will never match "dogsbody", because it will always
find the shorter match first.
PARTIAL MATCHING USING pcre_dfa_exec() OR pcre16_dfa_exec()
The DFA functions move along the subject string character by character,
- without backtracking, searching for all possible matches simultane-
- ously. If the end of the subject is reached before the end of the pat-
- tern, there is the possibility of a partial match, again provided that
+ without backtracking, searching for all possible matches simultane-
+ ously. If the end of the subject is reached before the end of the pat-
+ tern, there is the possibility of a partial match, again provided that
at least one character has been inspected.
- When PCRE_PARTIAL_SOFT is set, PCRE_ERROR_PARTIAL is returned only if
- there have been no complete matches. Otherwise, the complete matches
- are returned. However, if PCRE_PARTIAL_HARD is set, a partial match
- takes precedence over any complete matches. The portion of the string
- that was inspected when the longest partial match was found is set as
+ When PCRE_PARTIAL_SOFT is set, PCRE_ERROR_PARTIAL is returned only if
+ there have been no complete matches. Otherwise, the complete matches
+ are returned. However, if PCRE_PARTIAL_HARD is set, a partial match
+ takes precedence over any complete matches. The portion of the string
+ that was inspected when the longest partial match was found is set as
the first matching string, provided there are at least two slots in the
offsets vector.
- Because the DFA functions always search for all possible matches, and
- there is no difference between greedy and ungreedy repetition, their
- behaviour is different from the standard functions when PCRE_PAR-
- TIAL_HARD is set. Consider the string "dog" matched against the
+ Because the DFA functions always search for all possible matches, and
+ there is no difference between greedy and ungreedy repetition, their
+ behaviour is different from the standard functions when PCRE_PAR-
+ TIAL_HARD is set. Consider the string "dog" matched against the
ungreedy pattern shown above:
/dog(sbody)??/
- Whereas the standard functions stop as soon as they find the complete
- match for "dog", the DFA functions also find the partial match for
+ Whereas the standard functions stop as soon as they find the complete
+ match for "dog", the DFA functions also find the partial match for
"dogsbody", and so return that when PCRE_PARTIAL_HARD is set.
PARTIAL MATCHING AND WORD BOUNDARIES
- If a pattern ends with one of sequences \b or \B, which test for word
- boundaries, partial matching with PCRE_PARTIAL_SOFT can give counter-
+ If a pattern ends with one of the sequences \b or \B, which test for word
+ boundaries, partial matching with PCRE_PARTIAL_SOFT can give counter-
intuitive results. Consider this pattern:
/\bcat\b/
This matches "cat", provided there is a word boundary at either end. If
the subject string is "the cat", the comparison of the final "t" with a
- following character cannot take place, so a partial match is found.
- However, normal matching carries on, and \b matches at the end of the
- subject when the last character is a letter, so a complete match is
- found. The result, therefore, is not PCRE_ERROR_PARTIAL. Using
- PCRE_PARTIAL_HARD in this case does yield PCRE_ERROR_PARTIAL, because
+ following character cannot take place, so a partial match is found.
+ However, normal matching carries on, and \b matches at the end of the
+ subject when the last character is a letter, so a complete match is
+ found. The result, therefore, is not PCRE_ERROR_PARTIAL. Using
+ PCRE_PARTIAL_HARD in this case does yield PCRE_ERROR_PARTIAL, because
then the partial match takes precedence.
FORMERLY RESTRICTED PATTERNS
For releases of PCRE prior to 8.00, because of the way certain internal
- optimizations were implemented in the pcre_exec() function, the
- PCRE_PARTIAL option (predecessor of PCRE_PARTIAL_SOFT) could not be
- used with all patterns. From release 8.00 onwards, the restrictions no
- longer apply, and partial matching with can be requested for any pat-
+ optimizations were implemented in the pcre_exec() function, the
+ PCRE_PARTIAL option (predecessor of PCRE_PARTIAL_SOFT) could not be
+ used with all patterns. From release 8.00 onwards, the restrictions no
+ longer apply, and partial matching can be requested for any pat-
tern.
Items that were formerly restricted were repeated single characters and
- repeated metasequences. If PCRE_PARTIAL was set for a pattern that did
- not conform to the restrictions, pcre_exec() returned the error code
- PCRE_ERROR_BADPARTIAL (-13). This error code is no longer in use. The
- PCRE_INFO_OKPARTIAL call to pcre_fullinfo() to find out if a compiled
+ repeated metasequences. If PCRE_PARTIAL was set for a pattern that did
+ not conform to the restrictions, pcre_exec() returned the error code
+ PCRE_ERROR_BADPARTIAL (-13). This error code is no longer in use. The
+ PCRE_INFO_OKPARTIAL call to pcre_fullinfo() to find out if a compiled
pattern can be used for partial matching now always returns 1.
EXAMPLE OF PARTIAL MATCHING USING PCRETEST
- If the escape sequence \P is present in a pcretest data line, the
- PCRE_PARTIAL_SOFT option is used for the match. Here is a run of
+ If the escape sequence \P is present in a pcretest data line, the
+ PCRE_PARTIAL_SOFT option is used for the match. Here is a run of
pcretest that uses the date example quoted above:
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
@@ -7589,24 +7623,24 @@ EXAMPLE OF PARTIAL MATCHING USING PCRETEST
data> j\P
No match
- The first data string is matched completely, so pcretest shows the
- matched substrings. The remaining four strings do not match the com-
+ The first data string is matched completely, so pcretest shows the
+ matched substrings. The remaining four strings do not match the com-
plete pattern, but the first two are partial matches. Similar output is
obtained if DFA matching is used.
- If the escape sequence \P is present more than once in a pcretest data
+ If the escape sequence \P is present more than once in a pcretest data
line, the PCRE_PARTIAL_HARD option is set for the match.
MULTI-SEGMENT MATCHING WITH pcre_dfa_exec() OR pcre16_dfa_exec()
- When a partial match has been found using a DFA matching function, it
- is possible to continue the match by providing additional subject data
- and calling the function again with the same compiled regular expres-
- sion, this time setting the PCRE_DFA_RESTART option. You must pass the
+ When a partial match has been found using a DFA matching function, it
+ is possible to continue the match by providing additional subject data
+ and calling the function again with the same compiled regular expres-
+ sion, this time setting the PCRE_DFA_RESTART option. You must pass the
same working space as before, because this is where details of the pre-
- vious partial match are stored. Here is an example using pcretest,
- using the \R escape sequence to set the PCRE_DFA_RESTART option (\D
+ vious partial match are stored. Here is an example using pcretest,
+ using the \R escape sequence to set the PCRE_DFA_RESTART option (\D
specifies the use of the DFA matching function):
re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/
@@ -7615,47 +7649,47 @@ MULTI-SEGMENT MATCHING WITH pcre_dfa_exec() OR pcre16_dfa_exec()
data> n05\R\D
0: n05
- The first call has "23ja" as the subject, and requests partial match-
- ing; the second call has "n05" as the subject for the continued
- (restarted) match. Notice that when the match is complete, only the
- last part is shown; PCRE does not retain the previously partially-
- matched string. It is up to the calling program to do that if it needs
+ The first call has "23ja" as the subject, and requests partial match-
+ ing; the second call has "n05" as the subject for the continued
+ (restarted) match. Notice that when the match is complete, only the
+ last part is shown; PCRE does not retain the previously partially-
+ matched string. It is up to the calling program to do that if it needs
to.
- You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with
- PCRE_DFA_RESTART to continue partial matching over multiple segments.
- This facility can be used to pass very long subject strings to the DFA
+ You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with
+ PCRE_DFA_RESTART to continue partial matching over multiple segments.
+ This facility can be used to pass very long subject strings to the DFA
matching functions.
MULTI-SEGMENT MATCHING WITH pcre_exec() OR pcre16_exec()
- From release 8.00, the standard matching functions can also be used to
+ From release 8.00, the standard matching functions can also be used to
do multi-segment matching. Unlike the DFA functions, it is not possible
- to restart the previous match with a new segment of data. Instead, new
+ to restart the previous match with a new segment of data. Instead, new
data must be added to the previous subject string, and the entire match
- re-run, starting from the point where the partial match occurred. Ear-
+ re-run, starting from the point where the partial match occurred. Ear-
lier data can be discarded.
- It is best to use PCRE_PARTIAL_HARD in this situation, because it does
- not treat the end of a segment as the end of the subject when matching
- \z, \Z, \b, \B, and $. Consider an unanchored pattern that matches
+ It is best to use PCRE_PARTIAL_HARD in this situation, because it does
+ not treat the end of a segment as the end of the subject when matching
+ \z, \Z, \b, \B, and $. Consider an unanchored pattern that matches
dates:
re> /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/
data> The date is 23ja\P\P
Partial match: 23ja
- At this stage, an application could discard the text preceding "23ja",
- add on text from the next segment, and call the matching function
- again. Unlike the DFA matching functions the entire matching string
- must always be available, and the complete matching process occurs for
+ At this stage, an application could discard the text preceding "23ja",
+ add on text from the next segment, and call the matching function
+ again. Unlike the DFA matching functions the entire matching string
+ must always be available, and the complete matching process occurs for
each call, so more memory and more processing time is needed.
- Note: If the pattern contains lookbehind assertions, or \K, or starts
+ Note: If the pattern contains lookbehind assertions, or \K, or starts
with \b or \B, the string that is returned for a partial match includes
- characters that precede the partially matched string itself, because
- these must be retained when adding on more characters for a subsequent
+ characters that precede the partially matched string itself, because
+ these must be retained when adding on more characters for a subsequent
matching attempt.
@@ -7665,28 +7699,28 @@ ISSUES WITH MULTI-SEGMENT MATCHING
whichever matching function is used.
1. If the pattern contains a test for the beginning of a line, you need
- to pass the PCRE_NOTBOL option when the subject string for any call
- does start at the beginning of a line. There is also a PCRE_NOTEOL
+ to pass the PCRE_NOTBOL option when the subject string for any call
+ does not start at the beginning of a line. There is also a PCRE_NOTEOL
option, but in practice when doing multi-segment matching you should be
using PCRE_PARTIAL_HARD, which includes the effect of PCRE_NOTEOL.
- 2. Lookbehind assertions at the start of a pattern are catered for in
- the offsets that are returned for a partial match. However, in theory,
- a lookbehind assertion later in the pattern could require even earlier
- characters to be inspected, and it might not have been reached when a
- partial match occurs. This is probably an extremely unlikely case; you
- could guard against it to a certain extent by always including extra
+ 2. Lookbehind assertions at the start of a pattern are catered for in
+ the offsets that are returned for a partial match. However, in theory,
+ a lookbehind assertion later in the pattern could require even earlier
+ characters to be inspected, and it might not have been reached when a
+ partial match occurs. This is probably an extremely unlikely case; you
+ could guard against it to a certain extent by always including extra
characters at the start.
- 3. Matching a subject string that is split into multiple segments may
- not always produce exactly the same result as matching over one single
- long string, especially when PCRE_PARTIAL_SOFT is used. The section
- "Partial Matching and Word Boundaries" above describes an issue that
- arises if the pattern ends with \b or \B. Another kind of difference
- may occur when there are multiple matching possibilities, because (for
- PCRE_PARTIAL_SOFT) a partial match result is given only when there are
+ 3. Matching a subject string that is split into multiple segments may
+ not always produce exactly the same result as matching over one single
+ long string, especially when PCRE_PARTIAL_SOFT is used. The section
+ "Partial Matching and Word Boundaries" above describes an issue that
+ arises if the pattern ends with \b or \B. Another kind of difference
+ may occur when there are multiple matching possibilities, because (for
+ PCRE_PARTIAL_SOFT) a partial match result is given only when there are
no completed matches. This means that as soon as the shortest match has
- been found, continuation to a new subject segment is no longer possi-
+ been found, continuation to a new subject segment is no longer possi-
ble. Consider again this pcretest example:
re> /dog(sbody)?/
@@ -7700,18 +7734,18 @@ ISSUES WITH MULTI-SEGMENT MATCHING
0: dogsbody
1: dog
- The first data line passes the string "dogsb" to a standard matching
- function, setting the PCRE_PARTIAL_SOFT option. Although the string is
- a partial match for "dogsbody", the result is not PCRE_ERROR_PARTIAL,
- because the shorter string "dog" is a complete match. Similarly, when
- the subject is presented to a DFA matching function in several parts
- ("do" and "gsb" being the first two) the match stops when "dog" has
- been found, and it is not possible to continue. On the other hand, if
- "dogsbody" is presented as a single string, a DFA matching function
+ The first data line passes the string "dogsb" to a standard matching
+ function, setting the PCRE_PARTIAL_SOFT option. Although the string is
+ a partial match for "dogsbody", the result is not PCRE_ERROR_PARTIAL,
+ because the shorter string "dog" is a complete match. Similarly, when
+ the subject is presented to a DFA matching function in several parts
+ ("do" and "gsb" being the first two) the match stops when "dog" has
+ been found, and it is not possible to continue. On the other hand, if
+ "dogsbody" is presented as a single string, a DFA matching function
finds both matches.
- Because of these problems, it is best to use PCRE_PARTIAL_HARD when
- matching multi-segment data. The example above then behaves differ-
+ Because of these problems, it is best to use PCRE_PARTIAL_HARD when
+ matching multi-segment data. The example above then behaves differ-
ently:
re> /dog(sbody)?/
@@ -7723,25 +7757,25 @@ ISSUES WITH MULTI-SEGMENT MATCHING
Partial match: gsb
4. Patterns that contain alternatives at the top level which do not all
- start with the same pattern item may not work as expected when
+ start with the same pattern item may not work as expected when
PCRE_DFA_RESTART is used. For example, consider this pattern:
1234|3789
- If the first part of the subject is "ABC123", a partial match of the
- first alternative is found at offset 3. There is no partial match for
+ If the first part of the subject is "ABC123", a partial match of the
+ first alternative is found at offset 3. There is no partial match for
the second alternative, because such a match does not start at the same
- point in the subject string. Attempting to continue with the string
- "7890" does not yield a match because only those alternatives that
- match at one point in the subject are remembered. The problem arises
- because the start of the second alternative matches within the first
- alternative. There is no problem with anchored patterns or patterns
+ point in the subject string. Attempting to continue with the string
+ "7890" does not yield a match because only those alternatives that
+ match at one point in the subject are remembered. The problem arises
+ because the start of the second alternative matches within the first
+ alternative. There is no problem with anchored patterns or patterns
such as:
1234|ABCD
- where no string can be a partial match for both alternatives. This is
- not a problem if a standard matching function is used, because the
+ where no string can be a partial match for both alternatives. This is
+ not a problem if a standard matching function is used, because the
entire match has to be rerun each time:
re> /1234|3789/
@@ -7751,10 +7785,10 @@ ISSUES WITH MULTI-SEGMENT MATCHING
0: 3789
Of course, instead of using PCRE_DFA_RESTART, the same technique of re-
- running the entire match can also be used with the DFA matching func-
- tions. Another possibility is to work with two buffers. If a partial
- match at offset n in the first buffer is followed by "no match" when
- PCRE_DFA_RESTART is used on the second buffer, you can then try a new
+ running the entire match can also be used with the DFA matching func-
+ tions. Another possibility is to work with two buffers. If a partial
+ match at offset n in the first buffer is followed by "no match" when
+ PCRE_DFA_RESTART is used on the second buffer, you can then try a new
match starting at offset n+1 in the first buffer.
@@ -7767,11 +7801,11 @@ AUTHOR
REVISION
- Last updated: 08 January 2012
+ Last updated: 21 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPRECOMPILE(3) PCREPRECOMPILE(3)
@@ -7905,8 +7939,8 @@ REVISION
Last updated: 10 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPERFORM(3) PCREPERFORM(3)
@@ -8075,8 +8109,8 @@ REVISION
Last updated: 09 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCREPOSIX(3) PCREPOSIX(3)
@@ -8339,8 +8373,8 @@ REVISION
Last updated: 09 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRECPP(3) PCRECPP(3)
@@ -8681,8 +8715,8 @@ REVISION
Last updated: 08 January 2012
------------------------------------------------------------------------------
-
-
+
+
PCRESAMPLE(3) PCRESAMPLE(3)
@@ -8825,8 +8859,8 @@ REVISION
Last updated: 08 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
PCRESTACK(3) PCRESTACK(3)
@@ -8944,6 +8978,30 @@ PCRE DISCUSSION OF STACK USAGE
subject string. This is done by calling pcre[16]_exec() repeatedly with
different limits.
+ Obtaining an estimate of stack usage
+
+ The actual amount of stack used per recursion can vary quite a lot,
+ depending on the compiler that was used to build PCRE and the optimiza-
+ tion or debugging options that were set for it. The rule of thumb value
+ of 500 bytes mentioned above may be larger or smaller than what is
+ actually needed. A better approximation can be obtained by running this
+ command:
+
+ pcretest -m -C
+
+ The -C option causes pcretest to output information about the options
+ with which PCRE was compiled. When -m is also given (before -C), infor-
+ mation about stack use is given in a line like this:
+
+ Match recursion uses stack: approximate frame size = 640 bytes
+
+ The value is approximate because some recursions need a bit more (up to
+ perhaps 16 more bytes).
+
+ If the above command is given when PCRE is compiled to use the heap
+ instead of the stack for recursion, the value that is output is the
+ size of each block that is obtained from the heap.
+
Changing stack size in Unix-like systems
In Unix-like environments, there is not often a problem with the stack
@@ -8983,8 +9041,8 @@ AUTHOR
REVISION
- Last updated: 10 January 2012
+ Last updated: 21 January 2012
Copyright (c) 1997-2012 University of Cambridge.
------------------------------------------------------------------------------
-
-
+
+
diff --git a/doc/pcre16.3 b/doc/pcre16.3
index c206e44..726ef90 100644
--- a/doc/pcre16.3
+++ b/doc/pcre16.3
@@ -139,7 +139,7 @@ PCRE - Perl-compatible regular expressions
.sp
.B int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *\fIoutput\fP,
.ti +5n
-.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIbyte_order\fP,
+.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIbyte_order\fP,
.ti +5n
.B int \fIkeep_boms\fP);
.
@@ -158,8 +158,8 @@ PCRE documentation describes the 8-bit library, with only occasional references
to the 16-bit library. This page describes what is different when you use the
16-bit library.
.P
-WARNING: A single application can be linked with both libraries, but you must
-take care when processing any particular pattern to use functions from just one
+WARNING: A single application can be linked with both libraries, but you must
+take care when processing any particular pattern to use functions from just one
library. For example, if you want to study a pattern that was compiled with
\fBpcre16_compile()\fP, you must do so with \fBpcre16_study()\fP, not
\fBpcre_study()\fP, and you must free the study data with
@@ -169,7 +169,7 @@ library. For example, if you want to study a pattern that was compiled with
.SH "THE HEADER FILE"
.rs
.sp
-There is only one header file, \fBpcre.h\fP. It contains prototypes for all the
+There is only one header file, \fBpcre.h\fP. It contains prototypes for all the
functions in both libraries, as well as definitions of flags, structures, error
codes, etc.
.
@@ -177,34 +177,34 @@ codes, etc.
.SH "THE LIBRARY NAME"
.rs
.sp
-In Unix-like systems, the 16-bit library is called \fBlibpcre16\fP, and can
-normally be accesss by adding \fB-lpcre16\fP to the command for linking an
+In Unix-like systems, the 16-bit library is called \fBlibpcre16\fP, and can
+normally be accessed by adding \fB-lpcre16\fP to the command for linking an
application that uses PCRE.
.
.
.SH "STRING TYPES"
.rs
.sp
-In the 8-bit library, strings are passed to PCRE library functions as vectors
-of bytes with the C type "char *". In the 16-bit library, strings are passed as
-vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an
-appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In
-very many environments, "short int" is a 16-bit data type. When PCRE is built,
-it defines PCRE_UCHAR16 as "short int", but checks that it really is a 16-bit
-data type. If it is not, the build fails with an error message telling the
+In the 8-bit library, strings are passed to PCRE library functions as vectors
+of bytes with the C type "char *". In the 16-bit library, strings are passed as
+vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an
+appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In
+very many environments, "short int" is a 16-bit data type. When PCRE is built,
+it defines PCRE_UCHAR16 as "short int", but checks that it really is a 16-bit
+data type. If it is not, the build fails with an error message telling the
maintainer to modify the definition appropriately.
.
.
.SH "STRUCTURE TYPES"
.rs
.sp
-The types of the opaque structures that are used for compiled 16-bit patterns
-and JIT stacks are \fBpcre16\fP and \fBpcre16_jit_stack\fP respectively. The
-type of the user-accessible structure that is returned by \fBpcre16_study()\fP
+The types of the opaque structures that are used for compiled 16-bit patterns
+and JIT stacks are \fBpcre16\fP and \fBpcre16_jit_stack\fP respectively. The
+type of the user-accessible structure that is returned by \fBpcre16_study()\fP
is \fBpcre16_extra\fP, and the type of the structure that is used for passing
-data to a callout function is \fBpcre16_callout_block\fP. These structures
-contain the same fields, with the same names, as their 8-bit counterparts. The
-only difference is that pointers to character strings are 16-bit instead of
+data to a callout function is \fBpcre16_callout_block\fP. These structures
+contain the same fields, with the same names, as their 8-bit counterparts. The
+only difference is that pointers to character strings are 16-bit instead of
8-bit types.
.
.
@@ -212,31 +212,31 @@ only difference is that pointers to character strings are 16-bit instead of
.rs
.sp
For every function in the 8-bit library there is a corresponding function in
-the 16-bit library with a name that starts with \fBpcre16_\fP instead of
+the 16-bit library with a name that starts with \fBpcre16_\fP instead of
\fBpcre_\fP. The prototypes are listed above. In addition, there is one extra
-function, \fBpcre16_utf16_to_host_byte_order()\fP. This is a utility function
-that converts a UTF-16 character string to host byte order if necessary. The
-other 16-bit functions expect the strings they are passed to be in host byte
-order.
+function, \fBpcre16_utf16_to_host_byte_order()\fP. This is a utility function
+that converts a UTF-16 character string to host byte order if necessary. The
+other 16-bit functions expect the strings they are passed to be in host byte
+order.
.P
The \fIinput\fP and \fIoutput\fP arguments of
-\fBpcre16_utf16_to_host_byte_order()\fP may point to the same address, that is,
-conversion in place is supported. The output buffer must be at least as long as
+\fBpcre16_utf16_to_host_byte_order()\fP may point to the same address, that is,
+conversion in place is supported. The output buffer must be at least as long as
the input.
.P
The \fIlength\fP argument specifies the number of 16-bit data units in the
input string; a negative value specifies a zero-terminated string.
.P
-If \fIbyte_order\fP is NULL, it is assumed that the string starts off in host
+If \fIbyte_order\fP is NULL, it is assumed that the string starts off in host
byte order. This may be changed by byte-order marks (BOMs) anywhere in the
string (commonly as the first character).
.P
-If \fIbyte_order\fP is not NULL, a non-zero value of the integer to which it
-points means that the input starts off in host byte order, otherwise the
-opposite order is assumed. Again, BOMs in the string can change this. The final
-byte order is passed back at the end of processing.
+If \fIbyte_order\fP is not NULL, a non-zero value of the integer to which it
+points means that the input starts off in host byte order, otherwise the
+opposite order is assumed. Again, BOMs in the string can change this. The final
+byte order is passed back at the end of processing.
.P
-If \fIkeep_boms\fP is not zero, byte-order mark characters (0xfeff) are copied
+If \fIkeep_boms\fP is not zero, byte-order mark characters (0xfeff) are copied
into the output string. Otherwise they are discarded.
.P
The result of the function is the number of 16-bit units placed into the output
@@ -246,16 +246,16 @@ buffer, including the zero terminator if the string was zero-terminated.
.SH "SUBJECT STRING OFFSETS"
.rs
.sp
-The offsets within subject strings that are returned by the matching functions
+The offsets within subject strings that are returned by the matching functions
are in 16-bit units rather than bytes.
.
.
.SH "NAMED SUBPATTERNS"
.rs
.sp
-The name-to-number translation table that is maintained for named subpatterns
-uses 16-bit characters. The \fBpcre16_get_stringtable_entries()\fP function
-returns the length of each entry in the table as the number of 16-bit data
+The name-to-number translation table that is maintained for named subpatterns
+uses 16-bit characters. The \fBpcre16_get_stringtable_entries()\fP function
+returns the length of each entry in the table as the number of 16-bit data
units.
.
.
@@ -266,7 +266,7 @@ There are two new general option names, PCRE_UTF16 and PCRE_NO_UTF16_CHECK,
which correspond to PCRE_UTF8 and PCRE_NO_UTF8_CHECK in the 8-bit library. In
fact, these new options define the same bits in the options word.
.P
-For the \fBpcre16_config()\fP function there is an option PCRE_CONFIG_UTF16
+For the \fBpcre16_config()\fP function there is an option PCRE_CONFIG_UTF16
that returns 1 if UTF-16 support is configured, otherwise 0. If this option is
given to \fBpcre_config()\fP, or if the PCRE_CONFIG_UTF8 option is given to
\fBpcre16_config()\fP, the result is the PCRE_ERROR_BADOPTION error.
@@ -275,18 +275,18 @@ given to \fBpcre_config()\fP, or if the PCRE_CONFIG_UTF8 option is given to
.SH "CHARACTER CODES"
.rs
.sp
-In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the
-same way as in 8-bit, non UTF-8 mode, except, of course, that they can range
-from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than
-0xff can therefore be influenced by the locale in the same way as before.
-Characters greater than 0xff have only one case, and no "type" (such as letter
+In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the
+same way as in 8-bit, non UTF-8 mode, except, of course, that they can range
+from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than
+0xff can therefore be influenced by the locale in the same way as before.
+Characters greater than 0xff have only one case, and no "type" (such as letter
or digit).
.P
-In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with
-the exception of values in the range 0xd800 to 0xdfff because those are
+In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with
+the exception of values in the range 0xd800 to 0xdfff because those are
"surrogate" values that are used in pairs to encode values greater than 0xffff.
.P
-A UTF-16 string can indicate its endianness by special code knows as a
+A UTF-16 string can indicate its endianness by a special code known as a
byte-order mark (BOM). The PCRE functions do not handle this, expecting strings
to be in host byte order. A utility function called
\fBpcre16_utf16_to_host_byte_order()\fP is provided to help with this (see
@@ -296,20 +296,20 @@ above).
.SH "ERROR NAMES"
.rs
.sp
-The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to
+The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to
their 8-bit counterparts. The error PCRE_ERROR_BADMODE is given when a compiled
pattern is passed to a function that processes patterns in the other
-mode, for example, if a pattern compiled with \fBpcre_compile()\fP is passed to
+mode, for example, if a pattern compiled with \fBpcre_compile()\fP is passed to
\fBpcre16_exec()\fP.
.P
There are new error codes whose names begin with PCRE_UTF16_ERR for invalid
-UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that
+UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that
are described in the section entitled
.\" HTML <a href="pcreapi.html#badutf8reasons">
.\" </a>
"Reason codes for invalid UTF-8 strings"
.\"
-in the main
+in the main
.\" HREF
\fBpcreapi\fP
.\"
@@ -324,8 +324,8 @@ page. The UTF-16 errors are:
.SH "ERROR TEXTS"
.rs
.sp
-If there is an error while compiling a pattern, the error text that is passed
-back by \fBpcre16_compile()\fP or \fBpcre16_compile2()\fP is still an 8-bit
+If there is an error while compiling a pattern, the error text that is passed
+back by \fBpcre16_compile()\fP or \fBpcre16_compile2()\fP is still an 8-bit
character string, zero-terminated.
.
.
@@ -339,15 +339,15 @@ a callout function point to 16-bit vectors.
.SH "TESTING"
.rs
.sp
-The \fBpcretest\fP program continues to operate with 8-bit input and output
-files, but it can be used for testing the 16-bit library. If it is run with the
-command line option \fB-16\fP, patterns and subject strings are converted from
-8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions
-are used instead of the 8-bit ones. Returned 16-bit strings are converted to
+The \fBpcretest\fP program continues to operate with 8-bit input and output
+files, but it can be used for testing the 16-bit library. If it is run with the
+command line option \fB-16\fP, patterns and subject strings are converted from
+8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions
+are used instead of the 8-bit ones. Returned 16-bit strings are converted to
8-bit for output. If the 8-bit library was not compiled, \fBpcretest\fP
defaults to 16-bit and the \fB-16\fP option is ignored.
.P
-When PCRE is being built, the \fBRunTest\fP script that is called by "make
+When PCRE is being built, the \fBRunTest\fP script that is called by "make
check" uses the \fBpcretest\fP \fB-C\fP option to discover which of the 8-bit
and 16-bit libraries has been built, and runs the tests appropriately.
.
@@ -355,8 +355,8 @@ and 16-bit libraries has been built, and runs the tests appropriately.
.SH "NOT SUPPORTED IN 16-BIT MODE"
.rs
.sp
-Not all the features of the 8-bit library are available with the 16-bit
-library. The C++ and POSIX wrapper functions support only the 8-bit library,
+Not all the features of the 8-bit library are available with the 16-bit
+library. The C++ and POSIX wrapper functions support only the 8-bit library,
and the \fBpcregrep\fP program is at present 8-bit only.
.
.
diff --git a/doc/pcre_config.3 b/doc/pcre_config.3
index 8ebf27f..ac298c2 100644
--- a/doc/pcre_config.3
+++ b/doc/pcre_config.3
@@ -28,7 +28,7 @@ point to an unsigned long integer. The available codes are:
PCRE_CONFIG_JIT Availability of just-in-time compiler
support (1=yes 0=no)
PCRE_CONFIG_JITTARGET String containing information about the
- target architecture for the JIT compiler,
+ target architecture for the JIT compiler,
or NULL if there is no JIT support
PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4
PCRE_CONFIG_MATCH_LIMIT Internal resource limit
@@ -55,7 +55,7 @@ point to an unsigned long integer. The available codes are:
Availability of Unicode property support
(1=yes 0=no)
.sp
-The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise. That error
+The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise. That error
is also given if PCRE_CONFIG_UTF16 is passed to \fBpcre_config()\fP or if
PCRE_CONFIG_UTF8 is passed to \fBpcre16_config()\fP.
.P
diff --git a/doc/pcre_fullinfo.3 b/doc/pcre_fullinfo.3
index c16406b..7ba6532 100644
--- a/doc/pcre_fullinfo.3
+++ b/doc/pcre_fullinfo.3
@@ -38,7 +38,7 @@ The following information is available:
PCRE_INFO_HASCRORLF Return 1 if explicit CR or LF matches exist
PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used
PCRE_INFO_JIT Return 1 after successful JIT compilation
- PCRE_INFO_JITSIZE Size of JIT compiled code
+ PCRE_INFO_JITSIZE Size of JIT compiled code
PCRE_INFO_LASTLITERAL Literal last data unit required
PCRE_INFO_MINLENGTH Lower bound length of matching strings
PCRE_INFO_NAMECOUNT Number of named subpatterns
diff --git a/doc/pcre_jit_stack_alloc.3 b/doc/pcre_jit_stack_alloc.3
index 1c97f30..0392839 100644
--- a/doc/pcre_jit_stack_alloc.3
+++ b/doc/pcre_jit_stack_alloc.3
@@ -11,7 +11,7 @@ PCRE - Perl-compatible regular expressions
.ti +5n
.B int \fImaxsize\fP);
.PP
-.B pcre16_jit_stack *pcre16_jit_stack_alloc(int \fIstartsize\fP,
+.B pcre16_jit_stack *pcre16_jit_stack_alloc(int \fIstartsize\fP,
.ti +5n
.B int \fImaxsize\fP);
.
diff --git a/doc/pcre_pattern_to_host_byte_order.3 b/doc/pcre_pattern_to_host_byte_order.3
index adb51c0..615cf55 100644
--- a/doc/pcre_pattern_to_host_byte_order.3
+++ b/doc/pcre_pattern_to_host_byte_order.3
@@ -9,7 +9,7 @@ PCRE - Perl-compatible regular expressions
.SM
.B int pcre_pattern_to_host_byte_order(pcre *\fIcode\fP,
.ti +5n
-.B pcre_extra *\fIextra\fP, const unsigned char *\fItables\fP);
+.B pcre_extra *\fIextra\fP, const unsigned char *\fItables\fP);
.PP
.B int pcre16_pattern_to_host_byte_order(pcre16 *\fIcode\fP,
.ti +5n
@@ -20,8 +20,8 @@ PCRE - Perl-compatible regular expressions
.rs
.sp
This function ensures that the bytes in 2-byte and 4-byte values in a compiled
-pattern are in the correct order for the current host. It is useful when a
-pattern that has been compiled on one host is transferred to another that might
+pattern are in the correct order for the current host. It is useful when a
+pattern that has been compiled on one host is transferred to another that might
have different endianness. The arguments are:
.sp
\fIcode\fP A compiled regular expression
diff --git a/doc/pcre_utf16_to_host_byte_order.3 b/doc/pcre_utf16_to_host_byte_order.3
index 557d208..f08ce1e 100644
--- a/doc/pcre_utf16_to_host_byte_order.3
+++ b/doc/pcre_utf16_to_host_byte_order.3
@@ -9,7 +9,7 @@ PCRE - Perl-compatible regular expressions
.SM
.B int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *\fIoutput\fP,
.ti +5n
-.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIhost_byte_order\fP,
+.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIhost_byte_order\fP,
.ti +5n
.B int \fIkeep_boms\fP);
.
@@ -17,8 +17,8 @@ PCRE - Perl-compatible regular expressions
.SH DESCRIPTION
.rs
.sp
-This function, which exists only in the 16-bit library, converts a UTF-16
-string to the correct order for the current host, taking account of any byte
+This function, which exists only in the 16-bit library, converts a UTF-16
+string to the correct order for the current host, taking account of any byte
order marks (BOMs) within the string. Its arguments are:
.sp
\fIoutput\fP pointer to output buffer, may be the same as \fIinput\fP
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index b6f2507..6263e7b 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -148,7 +148,7 @@ just use different data types for their arguments and results, and their names
start with \fBpcre16_\fP instead of \fBpcre_\fP. For every option that has UTF8
in its name (for example, PCRE_UTF8), there is a corresponding 16-bit name with
UTF8 replaced by UTF16. This facility is in fact just cosmetic; the 16-bit
-option names define the same bit values.
+option names define the same bit values.
.P
References to bytes and UTF-8 in this document should be read as references to
16-bit data quantities and UTF-16 when using the 16-bit library, unless
@@ -157,7 +157,7 @@ library are given in the
.\" HREF
\fBpcre16\fP
.\"
-page.
+page.
.
.
.SH "PCRE API OVERVIEW"
@@ -392,7 +392,7 @@ not recognized. The following information is available:
PCRE_CONFIG_UTF8
.sp
The output is an integer that is set to one if UTF-8 support is available;
-otherwise it is set to zero. If this option is given to the 16-bit version of
+otherwise it is set to zero. If this option is given to the 16-bit version of
this function, \fBpcre16_config()\fP, the result is PCRE_ERROR_BADOPTION.
.sp
PCRE_CONFIG_UTF16
@@ -415,8 +415,8 @@ compiling is available; otherwise it is set to zero.
PCRE_CONFIG_JITTARGET
.sp
The output is a pointer to a zero-terminated "const char *" string. If JIT
-support is available, the string contains the name of the architecture for
-which the JIT compiler is configured, for example "x86 32bit (little endian +
+support is available, the string contains the name of the architecture for
+which the JIT compiler is configured, for example "x86 32bit (little endian +
unaligned)". If JIT support is not available, the result is NULL.
.sp
PCRE_CONFIG_NEWLINE
@@ -742,7 +742,7 @@ preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies
that any Unicode newline sequence should be recognized. The Unicode newline
sequences are the three just mentioned, plus the single characters VT (vertical
tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line
-separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit
+separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit
library, the last two are recognized only in UTF-8 mode.
.P
The newline setting in the options word uses three bits that are treated
@@ -819,11 +819,11 @@ page.
.sp
PCRE_NO_UTF8_CHECK
.sp
-When PCRE_UTF8 is set, the validity of the pattern as a UTF-8
+When PCRE_UTF8 is set, the validity of the pattern as a UTF-8
string is automatically checked. There is a discussion about the
.\" HTML <a href="pcreunicode.html#utf8strings">
.\" </a>
-validity of UTF-8 strings
+validity of UTF-8 strings
.\"
in the
.\" HREF
@@ -843,7 +843,7 @@ validity checking of subject strings.
.sp
The following table lists the error codes that may be returned by
\fBpcre_compile2()\fP, along with the error messages that may be returned by
-both compiling functions. Note that error messages are always 8-bit ASCII
+both compiling functions. Note that error messages are always 8-bit ASCII
strings, even in 16-bit mode. As PCRE has developed, some error codes have
fallen out of use. To avoid confusion, they have not been re-used.
.sp
@@ -917,14 +917,14 @@ fallen out of use. To avoid confusion, they have not been re-used.
65 different names for subpatterns of the same number are
not allowed
66 (*MARK) must have an argument
- 67 this version of PCRE is not compiled with Unicode property
+ 67 this version of PCRE is not compiled with Unicode property
support
68 \ec must be followed by an ASCII character
69 \ek is not followed by a braced, angle-bracketed, or quoted name
70 internal error: unknown opcode in find_fixedlength()
71 \eN is not supported in a class
72 too many forward references
- 73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff)
+ 73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff)
74 invalid UTF-16 string (specifically UTF-16)
.sp
The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may
@@ -1120,12 +1120,12 @@ the following negative numbers:
PCRE_ERROR_NULL the argument \fIcode\fP was NULL
the argument \fIwhere\fP was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
- PCRE_ERROR_BADENDIANNESS the pattern was compiled with different
+ PCRE_ERROR_BADENDIANNESS the pattern was compiled with different
endianness
PCRE_ERROR_BADOPTION the value of \fIwhat\fP was invalid
.sp
The "magic number" is placed at the start of each compiled pattern as a simple
-check against passing an arbitrary memory pointer. The endianness error can
+check against passing an arbitrary memory pointer. The endianness error can
occur if a compiled pattern is saved and reloaded on a different host. Here is
a typical call of \fBpcre_fullinfo()\fP, to obtain the length of the compiled
pattern:
@@ -1168,8 +1168,8 @@ where data units are bytes.) The fourth argument should point to an \fBint\fP
variable.
.P
If there is a fixed first value, for example, the letter "c" from a pattern
-such as (cat|cow|coyote), its value is returned. In the 8-bit library, the
-value is always less than 256; in the 16-bit library the value can be up to
+such as (cat|cow|coyote), its value is returned. In the 8-bit library, the
+value is always less than 256; in the 16-bit library the value can be up to
0xffff.
.P
If there is no fixed first value, and if either
@@ -1459,7 +1459,7 @@ fields (not necessarily in this order):
const unsigned char *\fItables\fP;
unsigned char **\fImark\fP;
.sp
-In the 16-bit version of this structure, the \fImark\fP field has type
+In the 16-bit version of this structure, the \fImark\fP field has type
"PCRE_UCHAR16 **".
.P
The \fIflags\fP field is a bitmap that specifies which of the other fields
@@ -2092,14 +2092,14 @@ documentation for more details.
.sp
PCRE_ERROR_BADMODE (-28)
.sp
-This error is given if a pattern that was compiled by the 8-bit library is
+This error is given if a pattern that was compiled by the 8-bit library is
passed to a 16-bit library function, or vice versa.
.sp
PCRE_ERROR_BADENDIANNESS (-29)
-.sp
-This error is given if a pattern that was compiled and saved is reloaded on a
-host with different endianness. The utility function
-\fBpcre_pattern_to_host_byte_order()\fP can be used to convert such a pattern
+.sp
+This error is given if a pattern that was compiled and saved is reloaded on a
+host with different endianness. The utility function
+\fBpcre_pattern_to_host_byte_order()\fP can be used to convert such a pattern
so that it runs on the new host.
.P
Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP.
@@ -2109,7 +2109,7 @@ Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP.
.SS "Reason codes for invalid UTF-8 strings"
.rs
.sp
-This section applies only to the 8-bit library. The corresponding information
+This section applies only to the 8-bit library. The corresponding information
for the 16-bit library is given in the
.\" HREF
\fBpcre16\fP
@@ -2417,14 +2417,14 @@ will yield PCRE_ERROR_NOMATCH.
.rs
.sp
Matching certain patterns using \fBpcre_exec()\fP can use a lot of process
-stack, which in certain environments can be rather limited in size. Some users
-find it helpful to have an estimate of the amount of stack that is used by
+stack, which in certain environments can be rather limited in size. Some users
+find it helpful to have an estimate of the amount of stack that is used by
\fBpcre_exec()\fP, to help them set recursion limits, as described in the
.\" HREF
\fBpcrestack\fP
.\"
-documentation. The estimate that is output by \fBpcretest\fP when called with
-the \fB-m\fP and \fB-C\fP options is obtained by calling \fBpcre_exec\fP with
+documentation. The estimate that is output by \fBpcretest\fP when called with
+the \fB-m\fP and \fB-C\fP options is obtained by calling \fBpcre_exec()\fP with
the values NULL, NULL, NULL, -999, and -999 for its first five arguments.
.P
Normally, if its first argument is NULL, \fBpcre_exec()\fP immediately returns
@@ -2432,10 +2432,10 @@ the negative error code PCRE_ERROR_NULL, but with this special combination of
arguments, it returns instead a negative number whose absolute value is the
approximate stack frame size in bytes. (A negative number is used so that it is
clear that no match has happened.) The value is approximate because in some
-cases, recursive calls to \fBpcre_exec()\fP occur when there are one or two
+cases, recursive calls to \fBpcre_exec()\fP occur when there are one or two
additional variables on the stack.
.P
-If PCRE has been compiled to use the heap instead of the stack for recursion,
+If PCRE has been compiled to use the heap instead of the stack for recursion,
the value returned is the size of each block that is obtained from the heap.
.
.
diff --git a/doc/pcrebuild.3 b/doc/pcrebuild.3
index 11efdc2..aea25b4 100644
--- a/doc/pcrebuild.3
+++ b/doc/pcrebuild.3
@@ -35,11 +35,11 @@ exists as well, but as it specifies the default, it is not described.
.SH "BUILDING 8-BIT and 16-BIT LIBRARIES"
.rs
.sp
-By default, a library called \fBlibpcre\fP is built, containing functions that
-take string arguments contained in vectors of bytes, either as single-byte
+By default, a library called \fBlibpcre\fP is built, containing functions that
+take string arguments contained in vectors of bytes, either as single-byte
characters, or interpreted as UTF-8 strings. You can also build a separate
-library, called \fBlibpcre16\fP, in which strings are contained in vectors of
-16-bit data units and interpreted either as single-unit characters or UTF-16
+library, called \fBlibpcre16\fP, in which strings are contained in vectors of
+16-bit data units and interpreted either as single-unit characters or UTF-16
strings, by adding
.sp
--enable-pcre16
@@ -70,7 +70,7 @@ to the \fBconfigure\fP command, as required.
.sp
By default, if the 8-bit library is being built, the \fBconfigure\fP script
will search for a C++ compiler and C++ header files. If it finds them, it
-automatically builds the C++ wrapper library (which supports only 8-bit
+automatically builds the C++ wrapper library (which supports only 8-bit
strings). You can disable this by adding
.sp
--disable-cpp
@@ -96,7 +96,7 @@ configuration. (For backwards compatibility, --enable-utf8 is a synonym of
.P
Of itself, this setting does not make PCRE treat strings as UTF-8 or UTF-16. As
well as compiling PCRE with this option, you also have to set the
-PCRE_UTF8 or PCRE_UTF16 option when you call one of the pattern compiling
+PCRE_UTF8 or PCRE_UTF16 option when you call one of the pattern compiling
functions.
.P
If you set --enable-utf when compiling in an EBCDIC environment, PCRE expects
diff --git a/doc/pcrecallout.3 b/doc/pcrecallout.3
index 575eab9..7421d54 100644
--- a/doc/pcrecallout.3
+++ b/doc/pcrecallout.3
@@ -11,7 +11,7 @@ PCRE - Perl-compatible regular expressions
PCRE provides a feature called "callout", which is a means of temporarily
passing control to the caller of PCRE in the middle of pattern matching. The
caller of PCRE provides an external function by putting its entry point in the
-global variable \fIpcre_callout\fP (\fIpcre16_callout\fP for the 16-bit
+global variable \fIpcre_callout\fP (\fIpcre16_callout\fP for the 16-bit
library). By default, this variable contains NULL, which disables all calling
out.
.P
@@ -85,7 +85,7 @@ These structures contains the following fields:
int \fIcallout_number\fP;
int *\fIoffset_vector\fP;
const char *\fIsubject\fP; (8-bit version)
- PCRE_SPTR16 \fIsubject\fP; (16-bit version)
+ PCRE_SPTR16 \fIsubject\fP; (16-bit version)
int \fIsubject_length\fP;
int \fIstart_match\fP;
int \fIcurrent_position\fP;
@@ -107,7 +107,7 @@ into the pattern (that is, the number after ?C for manual callouts, and 255 for
automatically generated callouts).
.P
The \fIoffset_vector\fP field is a pointer to the vector of offsets that was
-passed by the caller to the matching function. When \fBpcre_exec()\fP or
+passed by the caller to the matching function. When \fBpcre_exec()\fP or
\fBpcre16_exec()\fP is used, the contents can be inspected, in order to extract
substrings that have been matched so far, in the same way as for extracting
substrings after a match has completed. For the DFA matching functions, this
diff --git a/doc/pcrecpp.3 b/doc/pcrecpp.3
index 146d222..772ce92 100644
--- a/doc/pcrecpp.3
+++ b/doc/pcrecpp.3
@@ -12,7 +12,7 @@ PCRE - Perl-compatible regular expressions.
The C++ wrapper for PCRE was provided by Google Inc. Some additional
functionality was added by Giuseppe Maxia. This brief man page was constructed
from the notes in the \fIpcrecpp.h\fP file, which should be consulted for
-further details. Note that the C++ wrapper supports only the original 8-bit
+further details. Note that the C++ wrapper supports only the original 8-bit
PCRE library. There is no 16-bit support at present.
.
.
diff --git a/doc/pcrejit.3 b/doc/pcrejit.3
index 64764b1..0a32a11 100644
--- a/doc/pcrejit.3
+++ b/doc/pcrejit.3
@@ -21,10 +21,10 @@ this support was written by Zoltan Herczeg.
.SH "8-BIT and 16-BIT SUPPORT"
.rs
.sp
-JIT support is available for both the 8-bit and 16-bit PCRE libraries. To keep
-this documentation simple, only the 8-bit interface is described in what
-follows. If you are using the 16-bit library, substitute the 16-bit functions
-and 16-bit structures (for example, \fIpcre16_jit_stack\fP instead of
+JIT support is available for both the 8-bit and 16-bit PCRE libraries. To keep
+this documentation simple, only the 8-bit interface is described in what
+follows. If you are using the 16-bit library, substitute the 16-bit functions
+and 16-bit structures (for example, \fIpcre16_jit_stack\fP instead of
\fIpcre_jit_stack\fP).
.
.
diff --git a/doc/pcrematching.3 b/doc/pcrematching.3
index 09b20c8..7ec2f5b 100644
--- a/doc/pcrematching.3
+++ b/doc/pcrematching.3
@@ -8,14 +8,14 @@ This document describes the two different algorithms that are available in PCRE
for matching a compiled regular expression against a given subject string. The
"standard" algorithm is the one provided by the \fBpcre_exec()\fP and
\fBpcre16_exec()\fP functions. These work in the same way as Perl's matching
-function, and provide a Perl-compatible matching operation. The just-in-time
+function, and provide a Perl-compatible matching operation. The just-in-time
(JIT) optimization that is described in the
.\" HREF
\fBpcrejit\fP
.\"
documentation is compatible with these functions.
.P
-An alternative algorithm is provided by the \fBpcre_dfa_exec()\fP and
+An alternative algorithm is provided by the \fBpcre_dfa_exec()\fP and
\fBpcre16_dfa_exec()\fP functions; they operate in a different way, and are not
Perl-compatible. This alternative has advantages and disadvantages compared
with the standard algorithm, and these are described below.
diff --git a/doc/pcrepartial.3 b/doc/pcrepartial.3
index 1706a62..356fd61 100644
--- a/doc/pcrepartial.3
+++ b/doc/pcrepartial.3
@@ -25,7 +25,7 @@ entered. Partial matching can also be useful when the subject string is very
long and is not all available at once.
.P
PCRE supports partial matching by means of the PCRE_PARTIAL_SOFT and
-PCRE_PARTIAL_HARD options, which can be set when calling any of the matching
+PCRE_PARTIAL_HARD options, which can be set when calling any of the matching
functions. For backwards compatibility, PCRE_PARTIAL is a synonym for
PCRE_PARTIAL_SOFT. The essential difference between the two options is whether
or not a partial match is preferred to an alternative complete match, though
@@ -46,7 +46,7 @@ strings. This optimization is also disabled for partial matching.
.SH "PARTIAL MATCHING USING pcre_exec() OR pcre16_exec()"
.rs
.sp
-A partial match occurs during a call to \fBpcre_exec()\fP or
+A partial match occurs during a call to \fBpcre_exec()\fP or
\fBpcre16_exec()\fP when the end of the subject string is reached successfully,
but matching cannot continue because more characters are needed. However, at
least one character in the subject must have been inspected. This character
@@ -115,7 +115,7 @@ because it prefers an earlier partial match over a later complete match. For
this reason, the assumption is made that the end of the supplied subject string
may not be the true end of the available data, and so, if \ez, \eZ, \eb, \eB,
or $ are encountered at the end of the subject, the result is
-PCRE_ERROR_PARTIAL, provided that at least one character in the subject has
+PCRE_ERROR_PARTIAL, provided that at least one character in the subject has
been inspected.
.P
Setting PCRE_PARTIAL_HARD also affects the way UTF-8 and UTF-16
@@ -270,7 +270,7 @@ program to do that if it needs to.
.P
You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with
PCRE_DFA_RESTART to continue partial matching over multiple segments. This
-facility can be used to pass very long subject strings to the DFA matching
+facility can be used to pass very long subject strings to the DFA matching
functions.
.
.
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index 49cfcd5..5ffadb7 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -25,11 +25,11 @@ there is now also support for UTF-8 strings in the original library, and a
second library that supports 16-bit and UTF-16 character strings. To use these
features, PCRE must be built to include appropriate support. When using UTF
strings you must either call the compiling function with the PCRE_UTF8 or
-PCRE_UTF16 option, or the pattern must start with one of these special
+PCRE_UTF16 option, or the pattern must start with one of these special
sequences:
.sp
(*UTF8)
- (*UTF16)
+ (*UTF16)
.sp
Starting a pattern with such a sequence is equivalent to setting the relevant
option. This feature is not Perl-compatible. How setting a UTF mode affects
@@ -263,8 +263,8 @@ between \ex{ and }, but the character code is constrained as follows:
8-bit UTF-8 mode less than 0x10ffff and a valid codepoint
16-bit non-UTF mode less than 0x10000
16-bit UTF-16 mode less than 0x10ffff and a valid codepoint
-.sp
-Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-called
+.sp
+Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-called
"surrogate" codepoints).
.P
If characters other than hexadecimal digits appear between \ex{ and }, or if
@@ -307,7 +307,7 @@ parenthesized subpatterns.
Inside a character class, or if the decimal number is greater than 9 and there
have not been that many capturing subpatterns, PCRE re-reads up to three octal
digits following the backslash, and uses them to generate a data character. Any
-subsequent digits stand for themselves. The value of the character is
+subsequent digits stand for themselves. The value of the character is
constrained in the same way as characters specified in hexadecimal.
For example:
.sp
@@ -499,8 +499,8 @@ The vertical space characters are:
U+2028 Line separator
U+2029 Paragraph separator
.sp
-In 8-bit, non-UTF-8 mode, only the characters with codepoints less than 256 are
-relevant.
+In 8-bit, non-UTF-8 mode, only the characters with codepoints less than 256 are
+relevant.
.
.
.\" HTML <a name="newlineseq"></a>
@@ -974,7 +974,7 @@ end of the subject in both modes, and if all branches of a pattern start with
.sp
Outside a character class, a dot in the pattern matches any one character in
the subject string except (by default) a character that signifies the end of a
-line.
+line.
.P
When a line ending is defined as a single character, dot never matches that
character; when the two-character sequence CRLF is used, dot does not match CR
@@ -1104,7 +1104,7 @@ followed by two other characters. The octal or hexadecimal representation of
"]" can also be used to end a range.
.P
Ranges operate in the collating sequence of character values. They can also be
-used for characters specified numerically, for example [\e000-\e037]. Ranges
+used for characters specified numerically, for example [\e000-\e037]. Ranges
can include any characters that are valid for the current mode.
.P
If a range that includes letters is used when caseless matching is set, it
@@ -1305,8 +1305,8 @@ match "cataract", "erpillar" or an empty string.
.sp
2. It sets up the subpattern as a capturing subpattern. This means that, when
the whole pattern matches, that portion of the subject string that matched the
-subpattern is passed back to the caller via the \fIovector\fP argument of the
-matching function. (This applies only to the traditional matching functions;
+subpattern is passed back to the caller via the \fIovector\fP argument of the
+matching function. (This applies only to the traditional matching functions;
the DFA matching functions do not support capturing.)
.P
Opening parentheses are counted from left to right (starting from 1) to obtain
@@ -2538,7 +2538,7 @@ same pair of parentheses when there is a repetition.
.P
PCRE provides a similar feature, but of course it cannot obey arbitrary Perl
code. The feature is called "callout". The caller of PCRE provides an external
-function by putting its entry point in the global variable \fIpcre_callout\fP
+function by putting its entry point in the global variable \fIpcre_callout\fP
(8-bit library) or \fIpcre16_callout\fP (16-bit library). By default, this
variable contains NULL, which disables all calling out.
.P
diff --git a/doc/pcreposix.3 b/doc/pcreposix.3
index 567cd89..7f08a1d 100644
--- a/doc/pcreposix.3
+++ b/doc/pcreposix.3
@@ -30,7 +30,7 @@ expression 8-bit library. See the
\fBpcreapi\fP
.\"
documentation for a description of PCRE's native API, which contains much
-additional functionality. There is no POSIX-style wrapper for PCRE's 16-bit
+additional functionality. There is no POSIX-style wrapper for PCRE's 16-bit
library.
.P
The functions described here are just wrapper functions that ultimately call
diff --git a/doc/pcreprecompile.3 b/doc/pcreprecompile.3
index 716527f..aafb723 100644
--- a/doc/pcreprecompile.3
+++ b/doc/pcreprecompile.3
@@ -111,7 +111,7 @@ in the
documentation.
.P
If you did not provide custom character tables when the pattern was compiled,
-the pointer in the compiled pattern is NULL, which causes the matching
+the pointer in the compiled pattern is NULL, which causes the matching
functions to use PCRE's internal tables. Thus, you do not need to take any
special action at run time in this case.
.P
diff --git a/doc/pcrestack.3 b/doc/pcrestack.3
index 37a6fe4..12e5cbd 100644
--- a/doc/pcrestack.3
+++ b/doc/pcrestack.3
@@ -140,24 +140,24 @@ limits.
.sp
The actual amount of stack used per recursion can vary quite a lot, depending
on the compiler that was used to build PCRE and the optimization or debugging
-options that were set for it. The rule of thumb value of 500 bytes mentioned
-above may be larger or smaller than what is actually needed. A better
+options that were set for it. The rule of thumb value of 500 bytes mentioned
+above may be larger or smaller than what is actually needed. A better
approximation can be obtained by running this command:
.sp
pcretest -m -C
.sp
-The \fB-C\fP option causes \fBpcretest\fP to output information about the
-options with which PCRE was compiled. When \fB-m\fP is also given (before
+The \fB-C\fP option causes \fBpcretest\fP to output information about the
+options with which PCRE was compiled. When \fB-m\fP is also given (before
\fB-C\fP), information about stack use is given in a line like this:
.sp
Match recursion uses stack: approximate frame size = 640 bytes
-.sp
-The value is approximate because some recursions need a bit more (up to perhaps
+.sp
+The value is approximate because some recursions need a bit more (up to perhaps
16 more bytes).
.P
-If the above command is given when PCRE is compiled to use the heap instead of
-the stack for recursion, the value that is output is the size of each block
-that is obtained from the heap.
+If the above command is given when PCRE is compiled to use the heap instead of
+the stack for recursion, the value that is output is the size of each block
+that is obtained from the heap.
.
.
.SS "Changing stack size in Unix-like systems"
diff --git a/doc/pcresyntax.3 b/doc/pcresyntax.3
index f722892..43ba1db 100644
--- a/doc/pcresyntax.3
+++ b/doc/pcresyntax.3
@@ -420,12 +420,12 @@ pattern is not anchored.
.sp
(*COMMIT) overall failure, no advance of starting point
(*PRUNE) advance to next starting character
- (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE)
+ (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE)
(*SKIP) advance to current matching position
(*SKIP:NAME) advance to position corresponding to an earlier
- (*MARK:NAME); if not found, the (*SKIP) is ignored
+ (*MARK:NAME); if not found, the (*SKIP) is ignored
(*THEN) local failure, backtrack to next alternation
- (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN)
+ (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN)
.
.
.SH "NEWLINE CONVENTIONS"
diff --git a/doc/pcretest.1 b/doc/pcretest.1
index 1be87c1..beb9d17 100644
--- a/doc/pcretest.1
+++ b/doc/pcretest.1
@@ -31,17 +31,17 @@ options and exactly what is output.
.SH "PCRE's 8-BIT and 16-BIT LIBRARIES"
.rs
.sp
-From release 8.30, two separate PCRE libraries can be built. The original one
-supports 8-bit character strings, whereas the newer 16-bit library supports
-character strings encoded in 16-bit units. The \fBpcretest\fP program can be
+From release 8.30, two separate PCRE libraries can be built. The original one
+supports 8-bit character strings, whereas the newer 16-bit library supports
+character strings encoded in 16-bit units. The \fBpcretest\fP program can be
used to test both libraries. However, it is itself still an 8-bit program,
reading 8-bit input and writing 8-bit output. When testing the 16-bit library,
the patterns and data strings are converted to 16-bit format before being
-passed to the PCRE library functions. Results are converted to 8-bit for
+passed to the PCRE library functions. Results are converted to 8-bit for
output.
.P
-References to functions and structures of the form \fBpcre[16]_xx\fP below
-mean "\fBpcre_xx\fP when using the 8-bit library or \fBpcre16_xx\fP when using
+References to functions and structures of the form \fBpcre[16]_xx\fP below
+mean "\fBpcre_xx\fP when using the 8-bit library or \fBpcre16_xx\fP when using
the 16-bit library".
.
.
@@ -49,9 +49,9 @@ the 16-bit library".
.rs
.TP 10
\fB-16\fP
-If both the 8-bit and the 16-bit libraries have been built, this option causes
-the 16-bit library to be used. If only the 16-bit library has been built, this
-is the default (so has no effect). If only the 8-bit library has been built,
+If both the 8-bit and the 16-bit libraries have been built, this option causes
+the 16-bit library to be used. If only the 16-bit library has been built, this
+is the default (so has no effect). If only the 8-bit library has been built,
this option causes an error.
.TP 10
\fB-b\fP
@@ -60,24 +60,24 @@ internal form is output after compilation.
.TP 10
\fB-C\fP
Output the version number of the PCRE library, and all available information
-about the optional features that are included, and then exit. All other options
+about the optional features that are included, and then exit. All other options
are ignored.
.TP 10
\fB-C\fP \fIoption\fP
-Output information about a specific build-time option, then exit. This
-functionality is intended for use in scripts such as \fBRunTest\fP. The
+Output information about a specific build-time option, then exit. This
+functionality is intended for use in scripts such as \fBRunTest\fP. The
following options output the value indicated:
.sp
linksize the internal link size (2, 3, or 4)
- newline the default newline setting:
- CR, LF, CRLF, ANYCRLF, or ANY
+ newline the default newline setting:
+ CR, LF, CRLF, ANYCRLF, or ANY
.sp
The following options output 1 for true or zero for false:
-.sp
+.sp
jit just-in-time support is available
pcre16 the 16-bit library was built
pcre8 the 8-bit library was built
- ucp Unicode property support is available
+ ucp Unicode property support is available
utf UTF-8 and/or UTF-16 support is available
.TP 10
\fB-d\fP
@@ -104,7 +104,7 @@ calling \fBpcre[16]_exec()\fP repeatedly with different limits.
.TP 10
\fB-m\fP
Output the size of each compiled pattern after it has been compiled. This is
-equivalent to adding \fB/M\fP to each regular expression. The size is given in
+equivalent to adding \fB/M\fP to each regular expression. The size is given in
bytes for both libraries.
.TP 10
\fB-o\fP \fIosize\fP
@@ -137,7 +137,7 @@ result of studying is not included when studying is caused only by \fB-s\fP and
neither \fB-i\fP nor \fB-d\fP is present on the command line. This behaviour
means that the output from tests that are run with and without \fB-s\fP should
be identical, except when options that output information about the actual
-running of a match are set.
+running of a match are set.
.sp
The \fB-M\fP, \fB-t\fP, and \fB-tm\fP options, which give information about
resources used, are likely to produce different output with and without
@@ -237,12 +237,12 @@ effect as they do in Perl. For example:
The following table shows additional modifiers for setting PCRE compile-time
options that do not correspond to anything in Perl:
.sp
- \fB/8\fP PCRE_UTF8 ) when using the 8-bit
+ \fB/8\fP PCRE_UTF8 ) when using the 8-bit
\fB/?\fP PCRE_NO_UTF8_CHECK ) library
-.sp
+.sp
\fB/8\fP PCRE_UTF16 ) when using the 16-bit
\fB/?\fP PCRE_NO_UTF16_CHECK ) library
-.sp
+.sp
\fB/A\fP PCRE_ANCHORED
\fB/C\fP PCRE_AUTO_CALLOUT
\fB/E\fP PCRE_DOLLAR_ENDONLY
@@ -270,7 +270,7 @@ This example sets multiline matching with CRLF as the line ending sequence:
.sp
As well as turning on the PCRE_UTF8/16 option, the \fB/8\fP modifier causes
all non-printing characters in output strings to be printed using the
-\ex{hh...} notation. Otherwise, those less than 0x100 are output in hex without
+\ex{hh...} notation. Otherwise, those less than 0x100 are output in hex without
the curly brackets.
.P
Full details of the PCRE options are given in the
@@ -663,7 +663,7 @@ substring is shown as "<unset>", as for the second data line.
2: b
.sp
If the strings contain any non-printing characters, they are output as \exhh
-escapes if the value is less than 256 and UTF mode is not set. Otherwise they
+escapes if the value is less than 256 and UTF mode is not set. Otherwise they
are output as \ex{hh...} escapes. See below for the definition of non-printing
characters. If the pattern has the \fB/+\fP modifier, the output for substring
0 is followed by the rest of the subject string, identified by "0+" like
@@ -890,15 +890,15 @@ been loaded, \fBpcretest\fP proceeds to read data lines in the usual way.
You can copy a file written by \fBpcretest\fP to a different host and reload it
there, even if the new host has opposite endianness to the one on which the
pattern was compiled. For example, you can compile on an i86 machine and run on
-a SPARC machine. When a pattern is reloaded on a host with different
+a SPARC machine. When a pattern is reloaded on a host with different
endianness, the confirmation message is changed to:
.sp
Compiled pattern (byte-inverted) loaded from /some/file
.sp
-The test suite contains some saved pre-compiled patterns with different
-endianness. These are reloaded using "<!" instead of just "<". This suppresses
-the "(byte-inverted)" text so that the output is the same on all hosts. It also
-forces debugging output once the pattern has been reloaded.
+The test suite contains some saved pre-compiled patterns with different
+endianness. These are reloaded using "<!" instead of just "<". This suppresses
+the "(byte-inverted)" text so that the output is the same on all hosts. It also
+forces debugging output once the pattern has been reloaded.
.P
File names for saving and reloading can be absolute or relative, but note that
the shell facility of expanding a file name that starts with a tilde (~) is not
diff --git a/doc/pcreunicode.3 b/doc/pcreunicode.3
index eab65b4..0f51d03 100644
--- a/doc/pcreunicode.3
+++ b/doc/pcreunicode.3
@@ -5,7 +5,7 @@ PCRE - Perl-compatible regular expressions
.rs
.sp
From Release 8.30, in addition to its previous UTF-8 support, PCRE also
-supports UTF-16 by means of a separate 16-bit library. This can be built as
+supports UTF-16 by means of a separate 16-bit library. This can be built as
well as, or instead of, the 8-bit library.
.
.
@@ -77,7 +77,7 @@ releases of PCRE followed the rules of RFC 2279, which allows the full range of
range U+0 to U+10FFFF, excluding U+D800 to U+DFFF.
.P
The excluded code points are the "Surrogate Area" of Unicode. They are reserved
-for use by UTF-16, where they are used in pairs to encode codepoints with
+for use by UTF-16, where they are used in pairs to encode codepoints with
values greater than 0xFFFF. The code points that are encoded by UTF-16 pairs
are available independently in the UTF-8 encoding. (In other words, the whole
surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8.)
@@ -148,7 +148,7 @@ two-byte characters for values greater than \e177.
3. Repeat quantifiers apply to complete UTF characters, not to individual
data units, for example: \ex{100}{3}.
.P
-4. The dot metacharacter matches one UTF character instead of a single data
+4. The dot metacharacter matches one UTF character instead of a single data
unit.
.P
5. The escape sequence \eC can be used to match a single byte in UTF-8 mode, or
@@ -166,7 +166,7 @@ be carried out by the normal interpretive function.
.P
6. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly
test characters of any code value, but, by default, the characters that PCRE
-recognizes as digits, spaces, or word characters remain the same set as in
+recognizes as digits, spaces, or word characters remain the same set as in
non-UTF mode, all with values less than 256. This remains true even when PCRE
is built to include Unicode property support, because to do otherwise would
slow down PCRE in many common cases. Note in particular that this applies to
diff --git a/doc/perltest.txt b/doc/perltest.txt
index 37e0012..bb1a52a 100644
--- a/doc/perltest.txt
+++ b/doc/perltest.txt
@@ -14,7 +14,7 @@ other pcretest modifiers that are either handled or ignored:
/W ignored
/S ignored
/SS ignored
- /Y ignored
+ /Y ignored
The pcretest \Y escape in data lines is removed before matching. The data lines
are processed as Perl double-quoted strings, so if they contain " $ or @
@@ -29,7 +29,7 @@ The perltest.pl script can also test UTF-8 features. It recognizes the special
modifier /8 that pcretest uses to invoke UTF-8 functionality. The testinput4
and testinput6 files can be fed to perltest to run compatible UTF-8 tests.
However, it is necessary to add "use utf8; require Encode" to the script to
-make this work correctly. I have not managed to find a way to handle this
+make this work correctly. I have not managed to find a way to handle this
automatically.
The other testinput files are not suitable for feeding to perltest.pl, since
diff --git a/pcre-config.in b/pcre-config.in
index f557f8c..595e5d1 100644
--- a/pcre-config.in
+++ b/pcre-config.in
@@ -10,7 +10,7 @@ if test @enable_cpp@ = yes ; then
libs="[--libs-cpp]"
else
libs=
-fi
+fi
if test @enable_pcre16@ = yes ; then
libs="[--libs16] $libs"
@@ -18,7 +18,7 @@ fi
if test @enable_pcre8@ = yes ; then
libs="[--libs] [--libs-posix] $libs"
- cflags="$cflags [--cflags-posix]"
+ cflags="$cflags [--cflags-posix]"
fi
usage="Usage: pcre-config [--prefix] [--exec-prefix] [--version] $libs $cflags"
diff --git a/pcre_compile.c b/pcre_compile.c
index 5f95ac7..dbb5419 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -488,7 +488,7 @@ static const char error_texts[] =
"\\N is not supported in a class\0"
"too many forward references\0"
"disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
- "invalid UTF-16 string\0"
+ "invalid UTF-16 string\0"
;
/* Table to identify digits and hex digits. This is used when compiling
@@ -998,9 +998,9 @@ else
c -= CHAR_0;
while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
c = c * 8 + *(++ptr) - CHAR_0;
-#ifdef COMPILE_PCRE8
+#ifdef COMPILE_PCRE8
if (!utf && c > 0xff) *errorcodeptr = ERR51;
-#endif
+#endif
break;
/* \x is complicated. \x{ddd} is a character number which can be greater
@@ -7709,11 +7709,11 @@ not used here. */
if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
(errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
{
-#ifdef COMPILE_PCRE8
+#ifdef COMPILE_PCRE8
errorcode = ERR44;
-#else
+#else
errorcode = ERR74;
-#endif
+#endif
goto PCRE_EARLY_ERROR_RETURN2;
}
#else
diff --git a/pcre_exec.c b/pcre_exec.c
index 9fdda7a..dcab5aa 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -487,13 +487,13 @@ int condcode;
/* When recursion is not being used, all "local" variables that have to be
preserved over calls to RMATCH() are part of a "frame". We set up the top-level
frame on the stack here; subsequent instantiations are obtained from the heap
-whenever RMATCH() does a "recursion". See the macro definitions above. Putting
-the top-level on the stack rather than malloc-ing them all gives a performance
+whenever RMATCH() does a "recursion". See the macro definitions above. Putting
+the top-level on the stack rather than malloc-ing them all gives a performance
boost in many cases where there is not much "recursion". */
#ifdef NO_RECURSE
-heapframe frame_zero;
-heapframe *frame = &frame_zero;
+heapframe frame_zero;
+heapframe *frame = &frame_zero;
frame->Xprevframe = NULL; /* Marks the top level */
/* Copy in the original argument variables */
@@ -616,7 +616,7 @@ int stacksave[REC_STACK_SAVE_MAX];
eptrblock newptrb;
-/* There is a special fudge for calling match() in a way that causes it to
+/* There is a special fudge for calling match() in a way that causes it to
measure the size of its basic stack frame when the stack is being used for
recursion. The second argument (ecode) being NULL triggers this behaviour. It
cannot normally ever be NULL. The return is the negated value of the frame
@@ -631,7 +631,7 @@ if (ecode == NULL)
int len = (char *)&rdepth - (char *)eptr;
return (len > 0)? -len : len;
}
- }
+ }
#endif /* NO_RECURSE */
/* To save space on the stack and in the heap frame, I have doubled up on some
@@ -838,7 +838,7 @@ for (;;)
case OP_ONCE_NC:
prev = ecode;
saved_eptr = eptr;
- save_mark = md->mark;
+ save_mark = md->mark;
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
@@ -857,7 +857,7 @@ for (;;)
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode,1);
- md->mark = save_mark;
+ md->mark = save_mark;
}
while (*ecode == OP_ALT);
@@ -937,7 +937,7 @@ for (;;)
save_offset2 = md->offset_vector[offset+1];
save_offset3 = md->offset_vector[md->offset_end - number];
save_capture_last = md->capture_last;
- save_mark = md->mark;
+ save_mark = md->mark;
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
md->offset_vector[md->offset_end - number] =
@@ -1043,7 +1043,7 @@ for (;;)
save_mark = md->mark;
RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
RM2);
-
+
/* See comment in the code for capturing groups above about handling
THEN. */
@@ -1070,7 +1070,7 @@ for (;;)
RRETURN(rrc);
}
ecode += GET(ecode, 1);
- md->mark = save_mark;
+ md->mark = save_mark;
if (*ecode != OP_ALT) break;
}
@@ -1549,7 +1549,7 @@ for (;;)
case OP_ASSERT:
case OP_ASSERTBACK:
- save_mark = md->mark;
+ save_mark = md->mark;
if (md->match_function_type == MATCH_CONDASSERT)
{
condassert = TRUE;
@@ -1571,7 +1571,7 @@ for (;;)
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
ecode += GET(ecode, 1);
- md->mark = save_mark;
+ md->mark = save_mark;
}
while (*ecode == OP_ALT);
@@ -1595,7 +1595,7 @@ for (;;)
case OP_ASSERT_NOT:
case OP_ASSERTBACK_NOT:
- save_mark = md->mark;
+ save_mark = md->mark;
if (md->match_function_type == MATCH_CONDASSERT)
{
condassert = TRUE;
@@ -1606,7 +1606,7 @@ for (;;)
do
{
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
- md->mark = save_mark;
+ md->mark = save_mark;
if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
{
@@ -6207,21 +6207,21 @@ PCRE_PUCHAR req_char_ptr = start_match - 1;
const pcre_study_data *study;
const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
-/* Check for the special magic call that measures the size of the stack used
+/* Check for the special magic call that measures the size of the stack used
per recursive call of match(). */
if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
start_offset == -999)
#ifdef NO_RECURSE
return -sizeof(heapframe);
-#else
+#else
return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
-#endif
+#endif
/* Plausibility checks */
if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
-if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
+if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
diff --git a/pcreposix.c b/pcreposix.c
index 06cdd01..808c9da 100644
--- a/pcreposix.c
+++ b/pcreposix.c
@@ -158,7 +158,7 @@ static const int eint[] = {
REG_BADPAT, /* \N is not supported in a class */
REG_BADPAT, /* too many forward references */
REG_BADPAT, /* disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) */
- REG_BADPAT /* invalid UTF-16 string (should not occur) */
+ REG_BADPAT /* invalid UTF-16 string (should not occur) */
};
/* Table of texts corresponding to POSIX error codes */
diff --git a/pcretest.c b/pcretest.c
index dcb46d6..6b733bd 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -2412,9 +2412,9 @@ are set, either both UTFs are supported or both are not supported. */
if (rc)
{
const char *arch;
- (void)PCRE_CONFIG(PCRE_CONFIG_JITTARGET, &arch);
+ (void)PCRE_CONFIG(PCRE_CONFIG_JITTARGET, &arch);
printf(" Just-in-time compiler support: %s\n", arch);
- }
+ }
else
printf(" No just-in-time compiler support\n");
(void)PCRE_CONFIG(PCRE_CONFIG_NEWLINE, &rc);
@@ -2438,11 +2438,11 @@ are set, either both UTFs are supported or both are not supported. */
(void)PCRE_CONFIG(PCRE_CONFIG_STACKRECURSE, &rc);
printf(" Match recursion uses %s", rc? "stack" : "heap");
if (showstore)
- {
+ {
PCRE_EXEC(stack_size, NULL, NULL, NULL, -999, -999, 0, NULL, 0);
- printf(": %sframe size = %d bytes", rc? "approximate " : "", -stack_size);
+ printf(": %sframe size = %d bytes", rc? "approximate " : "", -stack_size);
}
- printf("\n");
+ printf("\n");
goto EXIT;
}
else if (strcmp(argv[op], "-help") == 0 ||
@@ -3385,10 +3385,10 @@ while (!done)
cn16ptr = copynames;
gn16ptr = getnames;
#endif
-#ifdef SUPPORT_PCRE8
+#ifdef SUPPORT_PCRE8
cn8ptr = copynames8;
gn8ptr = getnames8;
-#endif
+#endif
SET_PCRE_CALLOUT(callout);
first_callout = 1;
@@ -3483,9 +3483,9 @@ while (!done)
{
if (++i == 9)
fprintf(outfile, "** Too many hex digits in \\x{...} item; "
- "using only the first eight.\n");
+ "using only the first eight.\n");
else c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'a' - 10);
- }
+ }
if (*pt == '}')
{
p = pt + 1;
diff --git a/perltest.pl b/perltest.pl
index d44e6c5..ca32cd7 100755
--- a/perltest.pl
+++ b/perltest.pl
@@ -23,7 +23,7 @@ if ($utf8)
foreach $c (@p)
{
if ($c >= 32 && $c < 127) { $t .= chr $c; }
- else { $t .= sprintf("\\x{%02x}", $c);
+ else { $t .= sprintf("\\x{%02x}", $c);
}
}
}
@@ -216,16 +216,16 @@ for (;;)
}
splice(@subs, 0, 18);
}
-
+
# It seems that $REGMARK is not marked as UTF-8 even when use utf8 is
# set and the input pattern was a UTF-8 string. We can, however, force
- # it to be so marked.
-
+ # it to be so marked.
+
if (defined $REGMARK && $REGMARK != 1)
{
- $xx = $REGMARK;
- $xx = Encode::decode_utf8($xx) if $utf8;
- printf $outfile ("MK: %s\n", &pchars($xx));
+ $xx = $REGMARK;
+ $xx = Encode::decode_utf8($xx) if $utf8;
+ printf $outfile ("MK: %s\n", &pchars($xx));
}
}
}