diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-01-21 16:37:17 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-01-21 16:37:17 +0000 |
commit | ec7abfc6da4749a0deda01d514b353a43bdf39db (patch) | |
tree | 6236bb15f857322a56762945d3e9b84f6b393274 | |
parent | 2e9472220441a6c61e9ed14f3fe3d33686e241b1 (diff) | |
download | pcre-ec7abfc6da4749a0deda01d514b353a43bdf39db.tar.gz |
Source file tidies for 8.30-RC1 release; fix Makefile.am bugs for building
symbolic links to man pages.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@903 2f5784b3-3f2a-0410-8824-cb99058d5e15
60 files changed, 742 insertions, 623 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index c0ff97e..a253546 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -393,7 +393,7 @@ SET(PCRE_SOURCES pcre_newline.c pcre_ord2utf8.c pcre_refcount.c - pcre_string_utils.c + pcre_string_utils.c pcre_study.c pcre_tables.c pcre_ucd.c @@ -32,33 +32,33 @@ Version 8.30 8. Ovector size of 2 is also supported by JIT based pcre_exec (the ovector size rounding is not applied in this particular case). - + 9. The invalid Unicode surrogate codepoints U+D800 to U+DFFF are now rejected if they appear, or are escaped, in patterns. - -10. Get rid of a number of -Wunused-but-set-variable warnings. - -11. The pattern /(?=(*:x))(q|)/ matches an empty string, and returns the mark - "x". The similar pattern /(?=(*:x))((*:y)q|)/ did not return a mark at all. - Oddly, Perl behaves the same way. PCRE has been fixed so that this pattern - also returns the mark "x". This bug applied to capturing parentheses, - non-capturing parentheses, and atomic parentheses. It also applied to some + +10. Get rid of a number of -Wunused-but-set-variable warnings. + +11. The pattern /(?=(*:x))(q|)/ matches an empty string, and returns the mark + "x". The similar pattern /(?=(*:x))((*:y)q|)/ did not return a mark at all. + Oddly, Perl behaves the same way. PCRE has been fixed so that this pattern + also returns the mark "x". This bug applied to capturing parentheses, + non-capturing parentheses, and atomic parentheses. It also applied to some assertions. - -12. Stephen Kelly's patch to CMakeLists.txt allows it to parse the version - information out of configure.ac instead of relying on pcre.h.generic, which - is not stored in the repository. - + +12. Stephen Kelly's patch to CMakeLists.txt allows it to parse the version + information out of configure.ac instead of relying on pcre.h.generic, which + is not stored in the repository. + 13. Applied Dmitry V. Levin's patch for a more portable method for linking with -lreadline. - -14. 
ZH added PCRE_CONFIG_JITTARGET; added its output to pcretest -C. -15. Applied Graycode's patch to put the top-level frame on the stack rather - than the heap when not using the stack for recursion. This gives a +14. ZH added PCRE_CONFIG_JITTARGET; added its output to pcretest -C. + +15. Applied Graycode's patch to put the top-level frame on the stack rather + than the heap when not using the stack for recursion. This gives a performance improvement in many cases when recursion is not deep. - -16. Experimental code added to "pcretest -C" to output the stack frame size. + +16. Experimental code added to "pcretest -C" to output the stack frame size. Version 8.21 12-Dec-2011 diff --git a/Makefile.am b/Makefile.am index f71339c..054640a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -520,34 +520,32 @@ dist_man_MANS = \ doc/pcreunicode.3 # Arrange for the per-function man pages to have 16-bit names as well. -install-exec-hook: - pushd $(DESTDIR)($man3dir) - ln -s pcre_assign_jit_stack.3 pcre16_assign_jit_stack.3 - ln -s pcre_compile.3 pcre16_compile.3 - ln -s pcre_compile2.3 pcre16_compile2.3 - ln -s pcre_config.3 pcre16_config.3 - ln -s pcre_copy_named_substring.3 pcre16_copy_named_substring.3 - ln -s pcre_copy_substring.3 pcre16_copy_substring.3 - ln -s pcre_dfa_exec.3 pcre16_dfa_exec.3 - ln -s pcre_exec.3 pcre16_exec.3 - ln -s pcre_free_study.3 pcre16_free_study.3 - ln -s pcre_free_substring.3 pcre16_free_substring.3 - ln -s pcre_free_substring_list.3 pcre16_free_substring_list.3 - ln -s pcre_fullinfo.3 pcre16_fullinfo.3 - ln -s pcre_get_named_substring.3 pcre16_get_named_substring.3 - ln -s pcre_get_stringnumber.3 pcre16_get_stringnumber.3 - ln -s pcre_get_stringtable_entries.3 pcre16_get_stringtable_entries.3 - ln -s pcre_get_substring.3 pcre16_get_substring.3 - ln -s pcre_get_substring_list.3 pcre16_get_substring_list.3 - ln -s pcre_jit_stack_alloc.3 pcre16_jit_stack_alloc.3 - ln -s pcre_jit_stack_free.3 pcre16_jit_stack_free.3 - ln -s pcre_maketables.3 
pcre16_maketables.3 - ln -s pcre_pattern_to_host_byte_order.3 pcre16_pattern_to_host_byte_order.3 - ln -s pcre_refcount.3 pcre16_refcount.3 - ln -s pcre_study.3 pcre16_study.3 - ln -s pcre_utf16_to_host_byte_order.3 pcre16_utf16_to_host_byte_order.3 - ln -s pcre_version.3 pcre16_version.3 - popd +install-data-hook: + ln -s $(DESTDIR)$(man3dir)/pcre_assign_jit_stack.3 $(DESTDIR)$(man3dir)/pcre16_assign_jit_stack.3 + ln -s $(DESTDIR)$(man3dir)/pcre_compile.3 $(DESTDIR)$(man3dir)/pcre16_compile.3 + ln -s $(DESTDIR)$(man3dir)/pcre_compile2.3 $(DESTDIR)$(man3dir)/pcre16_compile2.3 + ln -s $(DESTDIR)$(man3dir)/pcre_config.3 $(DESTDIR)$(man3dir)/pcre16_config.3 + ln -s $(DESTDIR)$(man3dir)/pcre_copy_named_substring.3 $(DESTDIR)$(man3dir)/pcre16_copy_named_substring.3 + ln -s $(DESTDIR)$(man3dir)/pcre_copy_substring.3 $(DESTDIR)$(man3dir)/pcre16_copy_substring.3 + ln -s $(DESTDIR)$(man3dir)/pcre_dfa_exec.3 $(DESTDIR)$(man3dir)/pcre16_dfa_exec.3 + ln -s $(DESTDIR)$(man3dir)/pcre_exec.3 $(DESTDIR)$(man3dir)/pcre16_exec.3 + ln -s $(DESTDIR)$(man3dir)/pcre_free_study.3 $(DESTDIR)$(man3dir)/pcre16_free_study.3 + ln -s $(DESTDIR)$(man3dir)/pcre_free_substring.3 $(DESTDIR)$(man3dir)/pcre16_free_substring.3 + ln -s $(DESTDIR)$(man3dir)/pcre_free_substring_list.3 $(DESTDIR)$(man3dir)/pcre16_free_substring_list.3 + ln -s $(DESTDIR)$(man3dir)/pcre_fullinfo.3 $(DESTDIR)$(man3dir)/pcre16_fullinfo.3 + ln -s $(DESTDIR)$(man3dir)/pcre_get_named_substring.3 $(DESTDIR)$(man3dir)/pcre16_get_named_substring.3 + ln -s $(DESTDIR)$(man3dir)/pcre_get_stringnumber.3 $(DESTDIR)$(man3dir)/pcre16_get_stringnumber.3 + ln -s $(DESTDIR)$(man3dir)/pcre_get_stringtable_entries.3 $(DESTDIR)$(man3dir)/pcre16_get_stringtable_entries.3 + ln -s $(DESTDIR)$(man3dir)/pcre_get_substring.3 $(DESTDIR)$(man3dir)/pcre16_get_substring.3 + ln -s $(DESTDIR)$(man3dir)/pcre_get_substring_list.3 $(DESTDIR)$(man3dir)/pcre16_get_substring_list.3 + ln -s $(DESTDIR)$(man3dir)/pcre_jit_stack_alloc.3 
$(DESTDIR)$(man3dir)/pcre16_jit_stack_alloc.3 + ln -s $(DESTDIR)$(man3dir)/pcre_jit_stack_free.3 $(DESTDIR)$(man3dir)/pcre16_jit_stack_free.3 + ln -s $(DESTDIR)$(man3dir)/pcre_maketables.3 $(DESTDIR)$(man3dir)/pcre16_maketables.3 + ln -s $(DESTDIR)$(man3dir)/pcre_pattern_to_host_byte_order.3 $(DESTDIR)$(man3dir)/pcre16_pattern_to_host_byte_order.3 + ln -s $(DESTDIR)$(man3dir)/pcre_refcount.3 $(DESTDIR)$(man3dir)/pcre16_refcount.3 + ln -s $(DESTDIR)$(man3dir)/pcre_study.3 $(DESTDIR)$(man3dir)/pcre16_study.3 + ln -s $(DESTDIR)$(man3dir)/pcre_utf16_to_host_byte_order.3 $(DESTDIR)$(man3dir)/pcre16_utf16_to_host_byte_order.3 + ln -s $(DESTDIR)$(man3dir)/pcre_version.3 $(DESTDIR)$(man3dir)/pcre16_version.3 pcrecpp_man = doc/pcrecpp.3 EXTRA_DIST += $(pcrecpp_man) @@ -5,20 +5,20 @@ Release 8.30 ------------ Release 8.30 introduces a major new feature: support for 16-bit character -strings, compiled as a separate library. There are a few changes to the +strings, compiled as a separate library. There are a few changes to the 8-bit library, in addition to some bug fixes. . The pcre_info() function, which has been obsolete for over 10 years, has been removed. . When a compiled pattern was saved to a file and later reloaded on a host - with different endianness, PCRE used automatically to swap the bytes in some + with different endianness, PCRE used automatically to swap the bytes in some of the data fields. With the advent of the 16-bit library, where more of this swapping is needed, it is no longer done automatically. Instead, the bad endianness is detected and a specific error is given. The user can then call a new function called pcre_pattern_to_host_byte_order() (or an equivalent 16-bit function) to do the swap. - + . In UTF-8 mode, the values 0xd800 to 0xdfff are not legal Unicode code points and are now faulted. (They are the so-called "surrogates" that are reserved for coding high values in UTF-16.) @@ -201,7 +201,7 @@ library. 
They are also documented in the pcrebuild man page. platforms. It is not possible to use both --enable-utf and --enable-ebcdic at the same time. -. There are no separate options for enabling UTF-8 and UTF-16 independently +. There are no separate options for enabling UTF-8 and UTF-16 independently because that would allow ridiculous settings such as requesting UTF-16 support while building only the 8-bit library. However, the option --enable-utf8 is retained for backwards compatibility with earlier releases @@ -669,7 +669,7 @@ general cases, UTF-8/16 support, and Unicode property support, respectively. The twentieth test is run only in 16-bit mode. It tests some specific 16-bit features of the DFA matching engine. -The twenty-first and twenty-second tests are run only in 16-bit mode, when the +The twenty-first and twenty-second tests are run only in 16-bit mode, when the link size is set to 2. They test reloading pre-compiled patterns. @@ -275,7 +275,7 @@ if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no -a \ do19=yes do20=yes do21=yes - do22=yes + do22=yes fi # Show which release and which test data diff --git a/configure.ac b/configure.ac index ce78147..5db475c 100644 --- a/configure.ac +++ b/configure.ac @@ -11,7 +11,7 @@ dnl be defined as -RC2, for example. For real releases, it should be empty. m4_define(pcre_major, [8]) m4_define(pcre_minor, [30]) m4_define(pcre_prerelease, [-RC1]) -m4_define(pcre_date, [2012-01-20]) +m4_define(pcre_date, [2012-01-21]) # NOTE: The CMakeLists.txt file searches for the above variables in the first # 50 lines of this file. Please update that if the variables above are moved. @@ -774,8 +774,9 @@ AC_SUBST(EXTRA_LIBPCRE16_LDFLAGS) AC_SUBST(EXTRA_LIBPCREPOSIX_LDFLAGS) AC_SUBST(EXTRA_LIBPCRECPP_LDFLAGS) -# When we run 'make distcheck', use these arguments. -DISTCHECK_CONFIGURE_FLAGS="--enable-pcre16 --enable-jit --enable-cpp --enable-unicode-properties" +# When we run 'make distcheck', use these arguments. 
Turning off compiler +# optimization makes it run faster. +DISTCHECK_CONFIGURE_FLAGS="CFLAGS='' CXXFLAGS='' --enable-pcre16 --enable-jit --enable-cpp --enable-unicode-properties" AC_SUBST(DISTCHECK_CONFIGURE_FLAGS) # Check that, if --enable-pcregrep-libz or --enable-pcregrep-libbz2 is diff --git a/doc/html/index.html b/doc/html/index.html index 20720df..b87c2a9 100644 --- a/doc/html/index.html +++ b/doc/html/index.html @@ -1,10 +1,10 @@ <html> -<!-- This is a manually maintained file that is the root of the HTML version of - the PCRE documentation. When the HTML documents are built from the man - page versions, the entire doc/html directory is emptied, this file is then - copied into doc/html/index.html, and the remaining files therein are +<!-- This is a manually maintained file that is the root of the HTML version of + the PCRE documentation. When the HTML documents are built from the man + page versions, the entire doc/html directory is emptied, this file is then + copied into doc/html/index.html, and the remaining files therein are created by the 132html script. ---> +--> <head> <title>PCRE specification</title> </head> @@ -86,11 +86,11 @@ The HTML documentation for PCRE comprises the following pages: </table> <p> -There are also individual pages that summarize the interface for each function +There are also individual pages that summarize the interface for each function in the library. There is a single page for each pair of 8-bit/16-bit functions. </p> -<table> +<table> <tr><td><a href="pcre_assign_jit_stack.html">pcre_assign_jit_stack</a></td> <td> Assign stack for JIT matching</td></tr> @@ -153,7 +153,7 @@ in the library. There is a single page for each pair of 8-bit/16-bit functions. 
<tr><td><a href="pcre_maketables.html">pcre_maketables</a></td> <td> Build character tables in current locale</td></tr> - + <tr><td><a href="pcre_pattern_to_host_byte_order.html">pcre_pattern_to_host_byte_order</a></td> <td> Convert compiled pattern to host byte order if necessary</td></tr> diff --git a/doc/html/pcre-config.html b/doc/html/pcre-config.html index 141b805..87c874d 100644 --- a/doc/html/pcre-config.html +++ b/doc/html/pcre-config.html @@ -23,15 +23,15 @@ man page, in case the conversion went wrong. <br><a name="SEC1" href="#TOC1">SYNOPSIS</a><br> <P> <b>pcre-config [--prefix] [--exec-prefix] [--version] [--libs]</b> -<b>[--libs16] [--libs-cpp] [--libs-posix] [--cflags] </b> +<b>[--libs16] [--libs-cpp] [--libs-posix] [--cflags]</b> <b>[--cflags-posix]</b> </P> <br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br> <P> <b>pcre-config</b> returns the configuration of the installed PCRE -libraries and the options required to compile a program to use them. Some of -the options apply only to the 8-bit or 16-bit libraries, respectively, and are -not available if only one of those libraries has been built. If an unavailable +libraries and the options required to compile a program to use them. Some of +the options apply only to the 8-bit or 16-bit libraries, respectively, and are +not available if only one of those libraries has been built. If an unavailable option is encountered, the "usage" information is output. </P> <br><a name="SEC3" href="#TOC1">OPTIONS</a><br> diff --git a/doc/html/pcre.html b/doc/html/pcre.html index 52afb47..ff5202f 100644 --- a/doc/html/pcre.html +++ b/doc/html/pcre.html @@ -28,11 +28,11 @@ support for one or two .NET and Oniguruma syntax items, and there is an option for requesting some minor changes that give better JavaScript compatibility. 
</P> <P> -Starting with release 8.30, it is possible to compile two separate PCRE +Starting with release 8.30, it is possible to compile two separate PCRE libraries: the original, which supports 8-bit character strings (including UTF-8 strings), and a second library that supports 16-bit character strings (including UTF-16 strings). The build process allows either one or both to be -built. The majority of the work to make this possible was done by Zoltan +built. The majority of the work to make this possible was done by Zoltan Herczeg. </P> <P> @@ -42,8 +42,8 @@ over-complication and reduce the documentation maintenance load, most of the documentation describes the 8-bit library, with the differences for the 16-bit library described separately in the <a href="pcre16.html"><b>pcre16</b></a> -page. References to functions or structures of the form <i>pcre[16]_xxx</i> -should be read as meaning "<i>pcre_xxx</i> when using the 8-bit library and +page. References to functions or structures of the form <i>pcre[16]_xxx</i> +should be read as meaning "<i>pcre_xxx</i> when using the 8-bit library and <i>pcre16_xxx</i> when using the 16-bit library". </P> <P> @@ -109,7 +109,7 @@ all the sections, except the <b>pcredemo</b> section, are concatenated, for ease of searching. The sections are as follows: <pre> pcre this document - pcre16 details of the 16-bit library + pcre16 details of the 16-bit library pcre-config show PCRE installation configuration information pcreapi details of PCRE's native C API pcrebuild options for building PCRE diff --git a/doc/html/pcre16.html b/doc/html/pcre16.html index 66e89cd..126ff75 100644 --- a/doc/html/pcre16.html +++ b/doc/html/pcre16.html @@ -160,7 +160,7 @@ man page, in case the conversion went wrong. 
<br><a name="SEC5" href="#TOC1">PCRE 16-BIT API 16-BIT-ONLY FUNCTION</a><br> <P> <b>int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *<i>output</i>,</b> -<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>byte_order</i>, </b> +<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>byte_order</i>,</b> <b>int <i>keep_boms</i>);</b> </P> <br><a name="SEC6" href="#TOC1">THE PCRE 16-BIT LIBRARY</a><br> @@ -177,8 +177,8 @@ to the 16-bit library. This page describes what is different when you use the 16-bit library. </P> <P> -WARNING: A single application can be linked with both libraries, but you must -take care when processing any particular pattern to use functions from just one +WARNING: A single application can be linked with both libraries, but you must +take care when processing any particular pattern to use functions from just one library. For example, if you want to study a pattern that was compiled with <b>pcre16_compile()</b>, you must do so with <b>pcre16_study()</b>, not <b>pcre_study()</b>, and you must free the study data with @@ -186,52 +186,52 @@ library. For example, if you want to study a pattern that was compiled with </P> <br><a name="SEC7" href="#TOC1">THE HEADER FILE</a><br> <P> -There is only one header file, <b>pcre.h</b>. It contains prototypes for all the +There is only one header file, <b>pcre.h</b>. It contains prototypes for all the functions in both libraries, as well as definitions of flags, structures, error codes, etc. </P> <br><a name="SEC8" href="#TOC1">THE LIBRARY NAME</a><br> <P> -In Unix-like systems, the 16-bit library is called <b>libpcre16</b>, and can -normally be accesss by adding <b>-lpcre16</b> to the command for linking an +In Unix-like systems, the 16-bit library is called <b>libpcre16</b>, and can +normally be accesss by adding <b>-lpcre16</b> to the command for linking an application that uses PCRE. 
</P> <br><a name="SEC9" href="#TOC1">STRING TYPES</a><br> <P> -In the 8-bit library, strings are passed to PCRE library functions as vectors -of bytes with the C type "char *". In the 16-bit library, strings are passed as -vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an -appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In -very many environments, "short int" is a 16-bit data type. When PCRE is built, -it defines PCRE_UCHAR16 as "short int", but checks that it really is a 16-bit -data type. If it is not, the build fails with an error message telling the +In the 8-bit library, strings are passed to PCRE library functions as vectors +of bytes with the C type "char *". In the 16-bit library, strings are passed as +vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an +appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In +very many environments, "short int" is a 16-bit data type. When PCRE is built, +it defines PCRE_UCHAR16 as "short int", but checks that it really is a 16-bit +data type. If it is not, the build fails with an error message telling the maintainer to modify the definition appropriately. </P> <br><a name="SEC10" href="#TOC1">STRUCTURE TYPES</a><br> <P> -The types of the opaque structures that are used for compiled 16-bit patterns -and JIT stacks are <b>pcre16</b> and <b>pcre16_jit_stack</b> respectively. The -type of the user-accessible structure that is returned by <b>pcre16_study()</b> +The types of the opaque structures that are used for compiled 16-bit patterns +and JIT stacks are <b>pcre16</b> and <b>pcre16_jit_stack</b> respectively. The +type of the user-accessible structure that is returned by <b>pcre16_study()</b> is <b>pcre16_extra</b>, and the type of the structure that is used for passing -data to a callout function is <b>pcre16_callout_block</b>. These structures -contain the same fields, with the same names, as their 8-bit counterparts. 
The -only difference is that pointers to character strings are 16-bit instead of +data to a callout function is <b>pcre16_callout_block</b>. These structures +contain the same fields, with the same names, as their 8-bit counterparts. The +only difference is that pointers to character strings are 16-bit instead of 8-bit types. </P> <br><a name="SEC11" href="#TOC1">16-BIT FUNCTIONS</a><br> <P> For every function in the 8-bit library there is a corresponding function in -the 16-bit library with a name that starts with <b>pcre16_</b> instead of +the 16-bit library with a name that starts with <b>pcre16_</b> instead of <b>pcre_</b>. The prototypes are listed above. In addition, there is one extra -function, <b>pcre16_utf16_to_host_byte_order()</b>. This is a utility function -that converts a UTF-16 character string to host byte order if necessary. The -other 16-bit functions expect the strings they are passed to be in host byte -order. +function, <b>pcre16_utf16_to_host_byte_order()</b>. This is a utility function +that converts a UTF-16 character string to host byte order if necessary. The +other 16-bit functions expect the strings they are passed to be in host byte +order. </P> <P> The <i>input</i> and <i>output</i> arguments of -<b>pcre16_utf16_to_host_byte_order()</b> may point to the same address, that is, -conversion in place is supported. The output buffer must be at least as long as +<b>pcre16_utf16_to_host_byte_order()</b> may point to the same address, that is, +conversion in place is supported. The output buffer must be at least as long as the input. </P> <P> @@ -239,18 +239,18 @@ The <i>length</i> argument specifies the number of 16-bit data units in the input string; a negative value specifies a zero-terminated string. </P> <P> -If <i>byte_order</i> is NULL, it is assumed that the string starts off in host +If <i>byte_order</i> is NULL, it is assumed that the string starts off in host byte order. 
This may be changed by byte-order marks (BOMs) anywhere in the string (commonly as the first character). </P> <P> -If <i>byte_order</i> is not NULL, a non-zero value of the integer to which it -points means that the input starts off in host byte order, otherwise the -opposite order is assumed. Again, BOMs in the string can change this. The final -byte order is passed back at the end of processing. +If <i>byte_order</i> is not NULL, a non-zero value of the integer to which it +points means that the input starts off in host byte order, otherwise the +opposite order is assumed. Again, BOMs in the string can change this. The final +byte order is passed back at the end of processing. </P> <P> -If <i>keep_boms</i> is not zero, byte-order mark characters (0xfeff) are copied +If <i>keep_boms</i> is not zero, byte-order mark characters (0xfeff) are copied into the output string. Otherwise they are discarded. </P> <P> @@ -259,14 +259,14 @@ buffer, including the zero terminator if the string was zero-terminated. </P> <br><a name="SEC12" href="#TOC1">SUBJECT STRING OFFSETS</a><br> <P> -The offsets within subject strings that are returned by the matching functions +The offsets within subject strings that are returned by the matching functions are in 16-bit units rather than bytes. </P> <br><a name="SEC13" href="#TOC1">NAMED SUBPATTERNS</a><br> <P> -The name-to-number translation table that is maintained for named subpatterns -uses 16-bit characters. The <b>pcre16_get_stringtable_entries()</b> function -returns the length of each entry in the table as the number of 16-bit data +The name-to-number translation table that is maintained for named subpatterns +uses 16-bit characters. The <b>pcre16_get_stringtable_entries()</b> function +returns the length of each entry in the table as the number of 16-bit data units. </P> <br><a name="SEC14" href="#TOC1">OPTION NAMES</a><br> @@ -276,27 +276,27 @@ which correspond to PCRE_UTF8 and PCRE_NO_UTF8_CHECK in the 8-bit library. 
In fact, these new options define the same bits in the options word. </P> <P> -For the <b>pcre16_config()</b> function there is an option PCRE_CONFIG_UTF16 +For the <b>pcre16_config()</b> function there is an option PCRE_CONFIG_UTF16 that returns 1 if UTF-16 support is configured, otherwise 0. If this option is given to <b>pcre_config()</b>, or if the PCRE_CONFIG_UTF8 option is given to <b>pcre16_config()</b>, the result is the PCRE_ERROR_BADOPTION error. </P> <br><a name="SEC15" href="#TOC1">CHARACTER CODES</a><br> <P> -In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the -same way as in 8-bit, non UTF-8 mode, except, of course, that they can range -from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than -0xff can therefore be influenced by the locale in the same way as before. -Characters greater than 0xff have only one case, and no "type" (such as letter +In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the +same way as in 8-bit, non UTF-8 mode, except, of course, that they can range +from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than +0xff can therefore be influenced by the locale in the same way as before. +Characters greater than 0xff have only one case, and no "type" (such as letter or digit). </P> <P> -In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with -the exception of values in the range 0xd800 to 0xdfff because those are +In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with +the exception of values in the range 0xd800 to 0xdfff because those are "surrogate" values that are used in pairs to encode values greater than 0xffff. </P> <P> -A UTF-16 string can indicate its endianness by special code knows as a +A UTF-16 string can indicate its endianness by special code knows as a byte-order mark (BOM). The PCRE functions do not handle this, expecting strings to be in host byte order. 
A utility function called <b>pcre16_utf16_to_host_byte_order()</b> is provided to help with this (see @@ -304,18 +304,18 @@ above). </P> <br><a name="SEC16" href="#TOC1">ERROR NAMES</a><br> <P> -The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to +The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to their 8-bit counterparts. The error PCRE_ERROR_BADMODE is given when a compiled pattern is passed to a function that processes patterns in the other -mode, for example, if a pattern compiled with <b>pcre_compile()</b> is passed to +mode, for example, if a pattern compiled with <b>pcre_compile()</b> is passed to <b>pcre16_exec()</b>. </P> <P> There are new error codes whose names begin with PCRE_UTF16_ERR for invalid -UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that +UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that are described in the section entitled <a href="pcreapi.html#badutf8reasons">"Reason codes for invalid UTF-8 strings"</a> -in the main +in the main <a href="pcreapi.html"><b>pcreapi</b></a> page. The UTF-16 errors are: <pre> @@ -327,8 +327,8 @@ page. The UTF-16 errors are: </P> <br><a name="SEC17" href="#TOC1">ERROR TEXTS</a><br> <P> -If there is an error while compiling a pattern, the error text that is passed -back by <b>pcre16_compile()</b> or <b>pcre16_compile2()</b> is still an 8-bit +If there is an error while compiling a pattern, the error text that is passed +back by <b>pcre16_compile()</b> or <b>pcre16_compile2()</b> is still an 8-bit character string, zero-terminated. </P> <br><a name="SEC18" href="#TOC1">CALLOUTS</a><br> @@ -338,23 +338,23 @@ a callout function point to 16-bit vectors. </P> <br><a name="SEC19" href="#TOC1">TESTING</a><br> <P> -The <b>pcretest</b> program continues to operate with 8-bit input and output -files, but it can be used for testing the 16-bit library. 
If it is run with the -command line option <b>-16</b>, patterns and subject strings are converted from -8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions -are used instead of the 8-bit ones. Returned 16-bit strings are converted to +The <b>pcretest</b> program continues to operate with 8-bit input and output +files, but it can be used for testing the 16-bit library. If it is run with the +command line option <b>-16</b>, patterns and subject strings are converted from +8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions +are used instead of the 8-bit ones. Returned 16-bit strings are converted to 8-bit for output. If the 8-bit library was not compiled, <b>pcretest</b> defaults to 16-bit and the <b>-16</b> option is ignored. </P> <P> -When PCRE is being built, the <b>RunTest</b> script that is called by "make +When PCRE is being built, the <b>RunTest</b> script that is called by "make check" uses the <b>pcretest</b> <b>-C</b> option to discover which of the 8-bit and 16-bit libraries has been built, and runs the tests appropriately. </P> <br><a name="SEC20" href="#TOC1">NOT SUPPORTED IN 16-BIT MODE</a><br> <P> -Not all the features of the 8-bit library are available with the 16-bit -library. The C++ and POSIX wrapper functions support only the 8-bit library, +Not all the features of the 8-bit library are available with the 16-bit +library. The C++ and POSIX wrapper functions support only the 8-bit library, and the <b>pcregrep</b> program is at present 8-bit only. </P> <br><a name="SEC21" href="#TOC1">AUTHOR</a><br> diff --git a/doc/html/pcre_config.html b/doc/html/pcre_config.html index 31747b0..dcfb831 100644 --- a/doc/html/pcre_config.html +++ b/doc/html/pcre_config.html @@ -41,6 +41,9 @@ point to an unsigned long integer. 
The available codes are: <pre> PCRE_CONFIG_JIT Availability of just-in-time compiler support (1=yes 0=no) + PCRE_CONFIG_JITTARGET String containing information about the + target architecture for the JIT compiler, + or NULL if there is no JIT support PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4 PCRE_CONFIG_MATCH_LIMIT Internal resource limit PCRE_CONFIG_MATCH_LIMIT_RECURSION @@ -66,7 +69,7 @@ point to an unsigned long integer. The available codes are: Availability of Unicode property support (1=yes 0=no) </pre> -The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise. That error +The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise. That error is also given if PCRE_CONFIG_UTF16 is passed to <b>pcre_config()</b> or if PCRE_CONFIG_UTF8 is passed to <b>pcre16_config()</b>. </P> diff --git a/doc/html/pcre_fullinfo.html b/doc/html/pcre_fullinfo.html index 49186dd..edb6eb7 100644 --- a/doc/html/pcre_fullinfo.html +++ b/doc/html/pcre_fullinfo.html @@ -50,7 +50,7 @@ The following information is available: PCRE_INFO_HASCRORLF Return 1 if explicit CR or LF matches exist PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used PCRE_INFO_JIT Return 1 after successful JIT compilation - PCRE_INFO_JITSIZE Size of JIT compiled code + PCRE_INFO_JITSIZE Size of JIT compiled code PCRE_INFO_LASTLITERAL Literal last data unit required PCRE_INFO_MINLENGTH Lower bound length of matching strings PCRE_INFO_NAMECOUNT Number of named subpatterns diff --git a/doc/html/pcre_jit_stack_alloc.html b/doc/html/pcre_jit_stack_alloc.html index 72d4b4e..2daac72 100644 --- a/doc/html/pcre_jit_stack_alloc.html +++ b/doc/html/pcre_jit_stack_alloc.html @@ -23,7 +23,7 @@ SYNOPSIS <b>int <i>maxsize</i>);</b> </P> <P> -<b>pcre16_jit_stack *pcre16_jit_stack_alloc(int <i>startsize</i>, </b> +<b>pcre16_jit_stack *pcre16_jit_stack_alloc(int <i>startsize</i>,</b> <b>int <i>maxsize</i>);</b> </P> <br><b> diff --git a/doc/html/pcre_pattern_to_host_byte_order.html 
b/doc/html/pcre_pattern_to_host_byte_order.html index 3c311de..2fb7f10 100644 --- a/doc/html/pcre_pattern_to_host_byte_order.html +++ b/doc/html/pcre_pattern_to_host_byte_order.html @@ -20,7 +20,7 @@ SYNOPSIS </P> <P> <b>int pcre_pattern_to_host_byte_order(pcre *<i>code</i>,</b> -<b>pcre_extra *<i>extra</i>, const unsigned char *<i>tables</i>); </b> +<b>pcre_extra *<i>extra</i>, const unsigned char *<i>tables</i>);</b> </P> <P> <b>int pcre16_pattern_to_host_byte_order(pcre16 *<i>code</i>,</b> @@ -31,8 +31,8 @@ DESCRIPTION </b><br> <P> This function ensures that the bytes in 2-byte and 4-byte values in a compiled -pattern are in the correct order for the current host. It is useful when a -pattern that has been compiled on one host is transferred to another that might +pattern are in the correct order for the current host. It is useful when a +pattern that has been compiled on one host is transferred to another that might have different endianness. The arguments are: <pre> <i>code</i> A compiled regular expression diff --git a/doc/html/pcre_utf16_to_host_byte_order.html b/doc/html/pcre_utf16_to_host_byte_order.html index 5434554..164e236 100644 --- a/doc/html/pcre_utf16_to_host_byte_order.html +++ b/doc/html/pcre_utf16_to_host_byte_order.html @@ -20,15 +20,15 @@ SYNOPSIS </P> <P> <b>int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *<i>output</i>,</b> -<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>host_byte_order</i>, </b> +<b>PCRE_SPTR16 <i>input</i>, int <i>length</i>, int *<i>host_byte_order</i>,</b> <b>int <i>keep_boms</i>);</b> </P> <br><b> DESCRIPTION </b><br> <P> -This function, which exists only in the 16-bit library, converts a UTF-16 -string to the correct order for the current host, taking account of any byte +This function, which exists only in the 16-bit library, converts a UTF-16 +string to the correct order for the current host, taking account of any byte order marks (BOMs) within the string. 
Its arguments are: <pre> <i>output</i> pointer to output buffer, may be the same as <i>input</i> diff --git a/doc/html/pcreapi.html b/doc/html/pcreapi.html index e4566a3..c5b58ff 100644 --- a/doc/html/pcreapi.html +++ b/doc/html/pcreapi.html @@ -34,10 +34,11 @@ man page, in case the conversion went wrong. <li><a name="TOC19" href="#SEC19">EXTRACTING CAPTURED SUBSTRINGS BY NAME</a> <li><a name="TOC20" href="#SEC20">DUPLICATE SUBPATTERN NAMES</a> <li><a name="TOC21" href="#SEC21">FINDING ALL POSSIBLE MATCHES</a> -<li><a name="TOC22" href="#SEC22">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a> -<li><a name="TOC23" href="#SEC23">SEE ALSO</a> -<li><a name="TOC24" href="#SEC24">AUTHOR</a> -<li><a name="TOC25" href="#SEC25">REVISION</a> +<li><a name="TOC22" href="#SEC22">OBTAINING AN ESTIMATE OF STACK USAGE</a> +<li><a name="TOC23" href="#SEC23">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a> +<li><a name="TOC24" href="#SEC24">SEE ALSO</a> +<li><a name="TOC25" href="#SEC25">AUTHOR</a> +<li><a name="TOC26" href="#SEC26">REVISION</a> </ul> <P> <b>#include <pcre.h></b> @@ -174,7 +175,7 @@ just use different data types for their arguments and results, and their names start with <b>pcre16_</b> instead of <b>pcre_</b>. For every option that has UTF8 in its name (for example, PCRE_UTF8), there is a corresponding 16-bit name with UTF8 replaced by UTF16. This facility is in fact just cosmetic; the 16-bit -option names define the same bit values. +option names define the same bit values. </P> <P> References to bytes and UTF-8 in this document should be read as references to @@ -182,7 +183,7 @@ References to bytes and UTF-8 in this document should be read as references to specified otherwise. More details of the specific differences for the 16-bit library are given in the <a href="pcre16.html"><b>pcre16</b></a> -page. +page. </P> <br><a name="SEC6" href="#TOC1">PCRE API OVERVIEW</a><br> <P> @@ -397,7 +398,7 @@ not recognized. 
The following information is available: PCRE_CONFIG_UTF8 </pre> The output is an integer that is set to one if UTF-8 support is available; -otherwise it is set to zero. If this option is given to the 16-bit version of +otherwise it is set to zero. If this option is given to the 16-bit version of this function, <b>pcre16_config()</b>, the result is PCRE_ERROR_BADOPTION. <pre> PCRE_CONFIG_UTF16 @@ -417,6 +418,13 @@ properties is available; otherwise it is set to zero. The output is an integer that is set to one if support for just-in-time compiling is available; otherwise it is set to zero. <pre> + PCRE_CONFIG_JITTARGET +</pre> +The output is a pointer to a zero-terminated "const char *" string. If JIT +support is available, the string contains the name of the architecture for +which the JIT compiler is configured, for example "x86 32bit (little endian + +unaligned)". If JIT support is not available, the result is NULL. +<pre> PCRE_CONFIG_NEWLINE </pre> The output is an integer whose value specifies the default character sequence @@ -738,7 +746,7 @@ preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies that any Unicode newline sequence should be recognized. The Unicode newline sequences are the three just mentioned, plus the single characters VT (vertical tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line -separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit +separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit library, the last two are recognized only in UTF-8 mode. </P> <P> @@ -808,7 +816,7 @@ page. <pre> PCRE_NO_UTF8_CHECK </pre> -When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 +When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is automatically checked. There is a discussion about the <a href="pcreunicode.html#utf8strings">validity of UTF-8 strings</a> in the @@ -825,7 +833,7 @@ validity checking of subject strings. 
<P> The following table lists the error codes that may be returned by <b>pcre_compile2()</b>, along with the error messages that may be returned by -both compiling functions. Note that error messages are always 8-bit ASCII +both compiling functions. Note that error messages are always 8-bit ASCII strings, even in 16-bit mode. As PCRE has developed, some error codes have fallen out of use. To avoid confusion, they have not been re-used. <pre> @@ -899,14 +907,14 @@ fallen out of use. To avoid confusion, they have not been re-used. 65 different names for subpatterns of the same number are not allowed 66 (*MARK) must have an argument - 67 this version of PCRE is not compiled with Unicode property + 67 this version of PCRE is not compiled with Unicode property support 68 \c must be followed by an ASCII character 69 \k is not followed by a braced, angle-bracketed, or quoted name 70 internal error: unknown opcode in find_fixedlength() 71 \N is not supported in a class 72 too many forward references - 73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff) + 73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff) 74 invalid UTF-16 string (specifically UTF-16) </pre> The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may @@ -1101,12 +1109,12 @@ the following negative numbers: PCRE_ERROR_NULL the argument <i>code</i> was NULL the argument <i>where</i> was NULL PCRE_ERROR_BADMAGIC the "magic number" was not found - PCRE_ERROR_BADENDIANNESS the pattern was compiled with different + PCRE_ERROR_BADENDIANNESS the pattern was compiled with different endianness PCRE_ERROR_BADOPTION the value of <i>what</i> was invalid </pre> The "magic number" is placed at the start of each compiled pattern as a simple -check against passing an arbitrary memory pointer. The endianness error can +check against passing an arbitrary memory pointer. The endianness error can occur if a compiled pattern is saved and reloaded on a different host.
Here is a typical call of <b>pcre_fullinfo()</b>, to obtain the length of the compiled pattern: @@ -1150,8 +1158,8 @@ variable. </P> <P> If there is a fixed first value, for example, the letter "c" from a pattern -such as (cat|cow|coyote), its value is returned. In the 8-bit library, the -value is always less than 256; in the 16-bit library the value can be up to +such as (cat|cow|coyote), its value is returned. In the 8-bit library, the +value is always less than 256; in the 16-bit library the value can be up to 0xffff. </P> <P> @@ -1427,7 +1435,7 @@ fields (not necessarily in this order): const unsigned char *<i>tables</i>; unsigned char **<i>mark</i>; </pre> -In the 16-bit version of this structure, the <i>mark</i> field has type +In the 16-bit version of this structure, the <i>mark</i> field has type "PCRE_UCHAR16 **". </P> <P> @@ -2067,14 +2075,14 @@ documentation for more details. <pre> PCRE_ERROR_BADMODE (-28) </pre> -This error is given if a pattern that was compiled by the 8-bit library is +This error is given if a pattern that was compiled by the 8-bit library is passed to a 16-bit library function, or vice versa. <pre> PCRE_ERROR_BADENDIANNESS (-29) </pre> -This error is given if a pattern that was compiled and saved is reloaded on a -host with different endianness. The utility function -<b>pcre_pattern_to_host_byte_order()</b> can be used to convert such a pattern +This error is given if a pattern that was compiled and saved is reloaded on a +host with different endianness. The utility function +<b>pcre_pattern_to_host_byte_order()</b> can be used to convert such a pattern so that it runs on the new host. </P> <P> @@ -2084,7 +2092,7 @@ Error numbers -16 to -20 and -22 are not used by <b>pcre_exec()</b>. Reason codes for invalid UTF-8 strings </b><br> <P> -This section applies only to the 8-bit library. The corresponding information +This section applies only to the 8-bit library. 
The corresponding information for the 16-bit library is given in the <a href="pcre16.html"><b>pcre16</b></a> page. @@ -2374,8 +2382,32 @@ When your callout function is called, extract and save the current matched substring. Then return 1, which forces <b>pcre_exec()</b> to backtrack and try other alternatives. Ultimately, when it runs out of matches, <b>pcre_exec()</b> will yield PCRE_ERROR_NOMATCH. +</P> +<br><a name="SEC22" href="#TOC1">OBTAINING AN ESTIMATE OF STACK USAGE</a><br> +<P> +Matching certain patterns using <b>pcre_exec()</b> can use a lot of process +stack, which in certain environments can be rather limited in size. Some users +find it helpful to have an estimate of the amount of stack that is used by +<b>pcre_exec()</b>, to help them set recursion limits, as described in the +<a href="pcrestack.html"><b>pcrestack</b></a> +documentation. The estimate that is output by <b>pcretest</b> when called with +the <b>-m</b> and <b>-C</b> options is obtained by calling <b>pcre_exec</b> with +the values NULL, NULL, NULL, -999, and -999 for its first five arguments. +</P> +<P> +Normally, if its first argument is NULL, <b>pcre_exec()</b> immediately returns +the negative error code PCRE_ERROR_NULL, but with this special combination of +arguments, it returns instead a negative number whose absolute value is the +approximate stack frame size in bytes. (A negative number is used so that it is +clear that no match has happened.) The value is approximate because in some +cases, recursive calls to <b>pcre_exec()</b> occur when there are one or two +additional variables on the stack. +</P> +<P> +If PCRE has been compiled to use the heap instead of the stack for recursion, +the value returned is the size of each block that is obtained from the heap. 
<a name="dfamatch"></a></P> -<br><a name="SEC22" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br> +<br><a name="SEC23" href="#TOC1">MATCHING A PATTERN: THE ALTERNATIVE FUNCTION</a><br> <P> <b>int pcre_dfa_exec(const pcre *<i>code</i>, const pcre_extra *<i>extra</i>,</b> <b>const char *<i>subject</i>, int <i>length</i>, int <i>startoffset</i>,</b> @@ -2550,13 +2582,13 @@ recursively, using private vectors for <i>ovector</i> and <i>workspace</i>. This error is given if the output vector is not large enough. This should be extremely rare, as a vector of size 1000 is used. </P> -<br><a name="SEC23" href="#TOC1">SEE ALSO</a><br> +<br><a name="SEC24" href="#TOC1">SEE ALSO</a><br> <P> <b>pcre16</b>(3), <b>pcrebuild</b>(3), <b>pcrecallout</b>(3), <b>pcrecpp(3)</b>(3), <b>pcrematching</b>(3), <b>pcrepartial</b>(3), <b>pcreposix</b>(3), <b>pcreprecompile</b>(3), <b>pcresample</b>(3), <b>pcrestack</b>(3). </P> -<br><a name="SEC24" href="#TOC1">AUTHOR</a><br> +<br><a name="SEC25" href="#TOC1">AUTHOR</a><br> <P> Philip Hazel <br> @@ -2565,9 +2597,9 @@ University Computing Service Cambridge CB2 3QH, England. <br> </P> -<br><a name="SEC25" href="#TOC1">REVISION</a><br> +<br><a name="SEC26" href="#TOC1">REVISION</a><br> <P> -Last updated: 07 January 2012 +Last updated: 21 January 2012 <br> Copyright © 1997-2012 University of Cambridge. <br> diff --git a/doc/html/pcrebuild.html b/doc/html/pcrebuild.html index eee9a74..8faed44 100644 --- a/doc/html/pcrebuild.html +++ b/doc/html/pcrebuild.html @@ -66,11 +66,11 @@ exists as well, but as it specifies the default, it is not described. 
</P> <br><a name="SEC2" href="#TOC1">BUILDING 8-BIT and 16-BIT LIBRARIES</a><br> <P> -By default, a library called <b>libpcre</b> is built, containing functions that -take string arguments contained in vectors of bytes, either as single-byte +By default, a library called <b>libpcre</b> is built, containing functions that +take string arguments contained in vectors of bytes, either as single-byte characters, or interpreted as UTF-8 strings. You can also build a separate -library, called <b>libpcre16</b>, in which strings are contained in vectors of -16-bit data units and interpreted either as single-unit characters or UTF-16 +library, called <b>libpcre16</b>, in which strings are contained in vectors of +16-bit data units and interpreted either as single-unit characters or UTF-16 strings, by adding <pre> --enable-pcre16 @@ -97,7 +97,7 @@ to the <b>configure</b> command, as required. <P> By default, if the 8-bit library is being built, the <b>configure</b> script will search for a C++ compiler and C++ header files. If it finds them, it -automatically builds the C++ wrapper library (which supports only 8-bit +automatically builds the C++ wrapper library (which supports only 8-bit strings). You can disable this by adding <pre> --disable-cpp @@ -122,7 +122,7 @@ configuration. (For backwards compatibility, --enable-utf8 is a synonym of <P> Of itself, this setting does not make PCRE treat strings as UTF-8 or UTF-16. As well as compiling PCRE with this option, you also have to set the -PCRE_UTF8 or PCRE_UTF16 option when you call one of the pattern compiling +PCRE_UTF8 or PCRE_UTF16 option when you call one of the pattern compiling functions. </P> <P> diff --git a/doc/html/pcrecallout.html b/doc/html/pcrecallout.html index 75bda4b..8076cee 100644 --- a/doc/html/pcrecallout.html +++ b/doc/html/pcrecallout.html @@ -31,7 +31,7 @@ man page, in case the conversion went wrong.
PCRE provides a feature called "callout", which is a means of temporarily passing control to the caller of PCRE in the middle of pattern matching. The caller of PCRE provides an external function by putting its entry point in the -global variable <i>pcre_callout</i> (<i>pcre16_callout</i> for the 16-bit +global variable <i>pcre_callout</i> (<i>pcre16_callout</i> for the 16-bit library). By default, this variable contains NULL, which disables all calling out. </P> @@ -105,7 +105,7 @@ These structures contain the following fields: int <i>callout_number</i>; int *<i>offset_vector</i>; const char *<i>subject</i>; (8-bit version) - PCRE_SPTR16 <i>subject</i>; (16-bit version) + PCRE_SPTR16 <i>subject</i>; (16-bit version) int <i>subject_length</i>; int <i>start_match</i>; int <i>current_position</i>; @@ -129,7 +129,7 @@ automatically generated callouts). </P> <P> The <i>offset_vector</i> field is a pointer to the vector of offsets that was -passed by the caller to the matching function. When <b>pcre_exec()</b> or +passed by the caller to the matching function. When <b>pcre_exec()</b> or <b>pcre16_exec()</b> is used, the contents can be inspected, in order to extract substrings that have been matched so far, in the same way as for extracting substrings after a match has completed. For the DFA matching functions, this diff --git a/doc/html/pcrecpp.html b/doc/html/pcrecpp.html index 2c5879a..06518da 100644 --- a/doc/html/pcrecpp.html +++ b/doc/html/pcrecpp.html @@ -35,7 +35,7 @@ man page, in case the conversion went wrong. The C++ wrapper for PCRE was provided by Google Inc. Some additional functionality was added by Giuseppe Maxia. This brief man page was constructed from the notes in the <i>pcrecpp.h</i> file, which should be consulted for -further details. Note that the C++ wrapper supports only the original 8-bit +further details. Note that the C++ wrapper supports only the original 8-bit PCRE library. There is no 16-bit support at present.
</P> <br><a name="SEC3" href="#TOC1">MATCHING INTERFACE</a><br> diff --git a/doc/html/pcrejit.html b/doc/html/pcrejit.html index a604157..7b23edb 100644 --- a/doc/html/pcrejit.html +++ b/doc/html/pcrejit.html @@ -45,10 +45,10 @@ this support was written by Zoltan Herczeg. </P> <br><a name="SEC2" href="#TOC1">8-BIT and 16-BIT SUPPORT</a><br> <P> -JIT support is available for both the 8-bit and 16-bit PCRE libraries. To keep -this documentation simple, only the 8-bit interface is described in what -follows. If you are using the 16-bit library, substitute the 16-bit functions -and 16-bit structures (for example, <i>pcre16_jit_stack</i> instead of +JIT support is available for both the 8-bit and 16-bit PCRE libraries. To keep +this documentation simple, only the 8-bit interface is described in what +follows. If you are using the 16-bit library, substitute the 16-bit functions +and 16-bit structures (for example, <i>pcre16_jit_stack</i> instead of <i>pcre_jit_stack</i>). </P> <br><a name="SEC3" href="#TOC1">AVAILABILITY OF JIT SUPPORT</a><br> diff --git a/doc/html/pcrematching.html b/doc/html/pcrematching.html index 5cd41ae..6abd17e 100644 --- a/doc/html/pcrematching.html +++ b/doc/html/pcrematching.html @@ -28,13 +28,13 @@ This document describes the two different algorithms that are available in PCRE for matching a compiled regular expression against a given subject string. The "standard" algorithm is the one provided by the <b>pcre_exec()</b> and <b>pcre16_exec()</b> functions. These work in the same way as Perl's matching -function, and provide a Perl-compatible matching operation. The just-in-time +function, and provide a Perl-compatible matching operation. The just-in-time (JIT) optimization that is described in the <a href="pcrejit.html"><b>pcrejit</b></a> documentation is compatible with these functions.
</P> <P> -An alternative algorithm is provided by the <b>pcre_dfa_exec()</b> and +An alternative algorithm is provided by the <b>pcre_dfa_exec()</b> and <b>pcre16_dfa_exec()</b> functions; they operate in a different way, and are not Perl-compatible. This alternative has advantages and disadvantages compared with the standard algorithm, and these are described below. diff --git a/doc/html/pcrepartial.html b/doc/html/pcrepartial.html index 40cdf97..989ce38 100644 --- a/doc/html/pcrepartial.html +++ b/doc/html/pcrepartial.html @@ -50,7 +50,7 @@ long and is not all available at once. </P> <P> PCRE supports partial matching by means of the PCRE_PARTIAL_SOFT and -PCRE_PARTIAL_HARD options, which can be set when calling any of the matching +PCRE_PARTIAL_HARD options, which can be set when calling any of the matching functions. For backwards compatibility, PCRE_PARTIAL is a synonym for PCRE_PARTIAL_SOFT. The essential difference between the two options is whether or not a partial match is preferred to an alternative complete match, though @@ -70,7 +70,7 @@ strings. This optimization is also disabled for partial matching. </P> <br><a name="SEC2" href="#TOC1">PARTIAL MATCHING USING pcre_exec() OR pcre16_exec()</a><br> <P> -A partial match occurs during a call to <b>pcre_exec()</b> or +A partial match occurs during a call to <b>pcre_exec()</b> or <b>pcre16_exec()</b> when the end of the subject string is reached successfully, but matching cannot continue because more characters are needed. However, at least one character in the subject must have been inspected. This character @@ -144,7 +144,8 @@ because it prefers an earlier partial match over a later complete match. For this reason, the assumption is made that the end of the supplied subject string may not be the true end of the available data, and so, if \z, \Z, \b, \B, or $ are encountered at the end of the subject, the result is -PCRE_ERROR_PARTIAL. 
+PCRE_ERROR_PARTIAL, provided that at least one character in the subject has +been inspected. </P> <P> Setting PCRE_PARTIAL_HARD also affects the way UTF-8 and UTF-16 @@ -294,7 +295,7 @@ program to do that if it needs to. <P> You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with PCRE_DFA_RESTART to continue partial matching over multiple segments. This -facility can be used to pass very long subject strings to the DFA matching +facility can be used to pass very long subject strings to the DFA matching functions. </P> <br><a name="SEC8" href="#TOC1">MULTI-SEGMENT MATCHING WITH pcre_exec() OR pcre16_exec()</a><br> @@ -434,7 +435,7 @@ Cambridge CB2 3QH, England. </P> <br><a name="SEC11" href="#TOC1">REVISION</a><br> <P> -Last updated: 08 January 2012 +Last updated: 21 January 2012 <br> Copyright © 1997-2012 University of Cambridge. <br> diff --git a/doc/html/pcrepattern.html b/doc/html/pcrepattern.html index 26c23f2..1dca37a 100644 --- a/doc/html/pcrepattern.html +++ b/doc/html/pcrepattern.html @@ -65,11 +65,11 @@ there is now also support for UTF-8 strings in the original library, and a second library that supports 16-bit and UTF-16 character strings. To use these features, PCRE must be built to include appropriate support. When using UTF strings you must either call the compiling function with the PCRE_UTF8 or -PCRE_UTF16 option, or the pattern must start with one of these special +PCRE_UTF16 option, or the pattern must start with one of these special sequences: <pre> (*UTF8) - (*UTF16) + (*UTF16) </pre> Starting a pattern with such a sequence is equivalent to setting the relevant option. This feature is not Perl-compatible. 
How setting a UTF mode affects @@ -292,7 +292,7 @@ between \x{ and }, but the character code is constrained as follows: 16-bit non-UTF mode less than 0x10000 16-bit UTF-16 mode less than 0x10ffff and a valid codepoint </pre> -Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-called +Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-called "surrogate" codepoints). </P> <P> @@ -335,7 +335,7 @@ following the discussion of Inside a character class, or if the decimal number is greater than 9 and there have not been that many capturing subpatterns, PCRE re-reads up to three octal digits following the backslash, and uses them to generate a data character. Any -subsequent digits stand for themselves. The value of the character is +subsequent digits stand for themselves. The value of the character is constrained in the same way as characters specified in hexadecimal. For example: <pre> @@ -503,8 +503,8 @@ The vertical space characters are: U+2028 Line separator U+2029 Paragraph separator </pre> -In 8-bit, non-UTF-8 mode, only the characters with codepoints less than 256 are -relevant. +In 8-bit, non-UTF-8 mode, only the characters with codepoints less than 256 are +relevant. <a name="newlineseq"></a></P> <br><b> Newline sequences @@ -970,7 +970,7 @@ end of the subject in both modes, and if all branches of a pattern start with <P> Outside a character class, a dot in the pattern matches any one character in the subject string except (by default) a character that signifies the end of a -line. +line. </P> <P> When a line ending is defined as a single character, dot never matches that @@ -1103,7 +1103,7 @@ followed by two other characters. The octal or hexadecimal representation of </P> <P> Ranges operate in the collating sequence of character values. They can also be -used for characters specified numerically, for example [\000-\037]. Ranges +used for characters specified numerically, for example [\000-\037]. 
Ranges can include any characters that are valid for the current mode. </P> <P> @@ -1298,8 +1298,8 @@ match "cataract", "erpillar" or an empty string. <br> 2. It sets up the subpattern as a capturing subpattern. This means that, when the whole pattern matches, that portion of the subject string that matched the -subpattern is passed back to the caller via the <i>ovector</i> argument of the -matching function. (This applies only to the traditional matching functions; +subpattern is passed back to the caller via the <i>ovector</i> argument of the +matching function. (This applies only to the traditional matching functions; the DFA matching functions do not support capturing.) </P> <P> @@ -2505,7 +2505,7 @@ same pair of parentheses when there is a repetition. <P> PCRE provides a similar feature, but of course it cannot obey arbitrary Perl code. The feature is called "callout". The caller of PCRE provides an external -function by putting its entry point in the global variable <i>pcre_callout</i> +function by putting its entry point in the global variable <i>pcre_callout</i> (8-bit library) or <i>pcre16_callout</i> (16-bit library). By default, this variable contains NULL, which disables all calling out. </P> diff --git a/doc/html/pcreposix.html b/doc/html/pcreposix.html index 637305d..9aa699a 100644 --- a/doc/html/pcreposix.html +++ b/doc/html/pcreposix.html @@ -48,7 +48,7 @@ This set of functions provides a POSIX-style API for the PCRE regular expression 8-bit library. See the <a href="pcreapi.html"><b>pcreapi</b></a> documentation for a description of PCRE's native API, which contains much -additional functionality. There is no POSIX-style wrapper for PCRE's 16-bit +additional functionality. There is no POSIX-style wrapper for PCRE's 16-bit library. </P> <P> diff --git a/doc/html/pcreprecompile.html b/doc/html/pcreprecompile.html index cbc2812..8361b7a 100644 --- a/doc/html/pcreprecompile.html +++ b/doc/html/pcreprecompile.html @@ -120,7 +120,7 @@ documentation. 
</P> <P> If you did not provide custom character tables when the pattern was compiled, -the pointer in the compiled pattern is NULL, which causes the matching +the pointer in the compiled pattern is NULL, which causes the matching functions to use PCRE's internal tables. Thus, you do not need to take any special action at run time in this case. </P> diff --git a/doc/html/pcrestack.html b/doc/html/pcrestack.html index 8bf8c92..76101b3 100644 --- a/doc/html/pcrestack.html +++ b/doc/html/pcrestack.html @@ -130,9 +130,9 @@ documentation. </P> <P> As a very rough rule of thumb, you should reckon on about 500 bytes per -recursion. Thus, if you want to limit your stack usage to 8Mb, you -should set the limit at 16000 recursions. A 64Mb stack, on the other hand, can -support around 128000 recursions. +recursion. Thus, if you want to limit your stack usage to 8Mb, you should set +the limit at 16000 recursions. A 64Mb stack, on the other hand, can support +around 128000 recursions. </P> <P> In Unix-like environments, the <b>pcretest</b> test program has a command line @@ -143,6 +143,32 @@ string. This is done by calling <b>pcre[16]_exec()</b> repeatedly with different limits. </P> <br><b> +Obtaining an estimate of stack usage +</b><br> +<P> +The actual amount of stack used per recursion can vary quite a lot, depending +on the compiler that was used to build PCRE and the optimization or debugging +options that were set for it. The rule of thumb value of 500 bytes mentioned +above may be larger or smaller than what is actually needed. A better +approximation can be obtained by running this command: +<pre> + pcretest -m -C +</pre> +The <b>-C</b> option causes <b>pcretest</b> to output information about the +options with which PCRE was compiled. 
When <b>-m</b> is also given (before +<b>-C</b>), information about stack use is given in a line like this: +<pre> + Match recursion uses stack: approximate frame size = 640 bytes +</pre> +The value is approximate because some recursions need a bit more (up to perhaps +16 more bytes). +</P> +<P> +If the above command is given when PCRE is compiled to use the heap instead of +the stack for recursion, the value that is output is the size of each block +that is obtained from the heap. +</P> +<br><b> Changing stack size in Unix-like systems </b><br> <P> @@ -190,7 +216,7 @@ Cambridge CB2 3QH, England. REVISION </b><br> <P> -Last updated: 10 January 2012 +Last updated: 21 January 2012 <br> Copyright © 1997-2012 University of Cambridge. <br> diff --git a/doc/html/pcresyntax.html b/doc/html/pcresyntax.html index 0e7d364..5181d7a 100644 --- a/doc/html/pcresyntax.html +++ b/doc/html/pcresyntax.html @@ -448,12 +448,12 @@ pattern is not anchored. <pre> (*COMMIT) overall failure, no advance of starting point (*PRUNE) advance to next starting character - (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE) + (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE) (*SKIP) advance to current matching position (*SKIP:NAME) advance to position corresponding to an earlier - (*MARK:NAME); if not found, the (*SKIP) is ignored + (*MARK:NAME); if not found, the (*SKIP) is ignored (*THEN) local failure, backtrack to next alternation - (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN) + (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN) </PRE> </P> <br><a name="SEC22" href="#TOC1">NEWLINE CONVENTIONS</a><br> diff --git a/doc/html/pcretest.html b/doc/html/pcretest.html index 6915115..a88dbd0 100644 --- a/doc/html/pcretest.html +++ b/doc/html/pcretest.html @@ -52,26 +52,26 @@ options and exactly what is output. </P> <br><a name="SEC2" href="#TOC1">PCRE's 8-BIT and 16-BIT LIBRARIES</a><br> <P> -From release 8.30, two separate PCRE libraries can be built. 
The original one -supports 8-bit character strings, whereas the newer 16-bit library supports -character strings encoded in 16-bit units. The <b>pcretest</b> program can be +From release 8.30, two separate PCRE libraries can be built. The original one +supports 8-bit character strings, whereas the newer 16-bit library supports +character strings encoded in 16-bit units. The <b>pcretest</b> program can be used to test both libraries. However, it is itself still an 8-bit program, reading 8-bit input and writing 8-bit output. When testing the 16-bit library, the patterns and data strings are converted to 16-bit format before being -passed to the PCRE library functions. Results are converted to 8-bit for +passed to the PCRE library functions. Results are converted to 8-bit for output. </P> <P> -References to functions and structures of the form <b>pcre[16]_xx</b> below -mean "<b>pcre_xx</b> when using the 8-bit library or <b>pcre16_xx</b> when using +References to functions and structures of the form <b>pcre[16]_xx</b> below +mean "<b>pcre_xx</b> when using the 8-bit library or <b>pcre16_xx</b> when using the 16-bit library". </P> <br><a name="SEC3" href="#TOC1">COMMAND LINE OPTIONS</a><br> <P> <b>-16</b> -If both the 8-bit and the 16-bit libraries have been built, this option causes -the 16-bit library to be used. If only the 16-bit library has been built, this -is the default (so has no effect). If only the 8-bit library has been built, +If both the 8-bit and the 16-bit libraries have been built, this option causes +the 16-bit library to be used. If only the 16-bit library has been built, this +is the default (so has no effect). If only the 8-bit library has been built, this option causes an error. </P> <P> @@ -82,25 +82,25 @@ internal form is output after compilation. <P> <b>-C</b> Output the version number of the PCRE library, and all available information -about the optional features that are included, and then exit. 
All other options +about the optional features that are included, and then exit. All other options are ignored. </P> <P> <b>-C</b> <i>option</i> -Output information about a specific build-time option, then exit. This -functionality is intended for use in scripts such as <b>RunTest</b>. The +Output information about a specific build-time option, then exit. This +functionality is intended for use in scripts such as <b>RunTest</b>. The following options output the value indicated: <pre> linksize the internal link size (2, 3, or 4) - newline the default newline setting: - CR, LF, CRLF, ANYCRLF, or ANY + newline the default newline setting: + CR, LF, CRLF, ANYCRLF, or ANY </pre> The following options output 1 for true or zero for false: <pre> jit just-in-time support is available pcre16 the 16-bit library was built pcre8 the 8-bit library was built - ucp Unicode property support is available + ucp Unicode property support is available utf UTF-8 and/or UTF-16 support is available </PRE> </P> @@ -134,7 +134,7 @@ calling <b>pcre[16]_exec()</b> repeatedly with different limits. <P> <b>-m</b> Output the size of each compiled pattern after it has been compiled. This is -equivalent to adding <b>/M</b> to each regular expression. The size is given in +equivalent to adding <b>/M</b> to each regular expression. The size is given in bytes for both libraries. </P> <P> @@ -172,7 +172,7 @@ result of studying is not included when studying is caused only by <b>-s</b> and neither <b>-i</b> nor <b>-d</b> is present on the command line. This behaviour means that the output from tests that are run with and without <b>-s</b> should be identical, except when options that output information about the actual -running of a match are set. +running of a match are set. <br> <br> The <b>-M</b>, <b>-t</b>, and <b>-tm</b> options, which give information about @@ -276,7 +276,7 @@ effect as they do in Perl. 
For example: The following table shows additional modifiers for setting PCRE compile-time options that do not correspond to anything in Perl: <pre> - <b>/8</b> PCRE_UTF8 ) when using the 8-bit + <b>/8</b> PCRE_UTF8 ) when using the 8-bit <b>/?</b> PCRE_NO_UTF8_CHECK ) library <b>/8</b> PCRE_UTF16 ) when using the 16-bit @@ -309,7 +309,7 @@ This example sets multiline matching with CRLF as the line ending sequence: </pre> As well as turning on the PCRE_UTF8/16 option, the <b>/8</b> modifier causes all non-printing characters in output strings to be printed using the -\x{hh...} notation. Otherwise, those less than 0x100 are output in hex without +\x{hh...} notation. Otherwise, those less than 0x100 are output in hex without the curly brackets. </P> <P> @@ -661,7 +661,7 @@ substring is shown as "<unset>", as for the second data line. 2: b </pre> If the strings contain any non-printing characters, they are output as \xhh -escapes if the value is less than 256 and UTF mode is not set. Otherwise they +escapes if the value is less than 256 and UTF mode is not set. Otherwise they are output as \x{hh...} escapes. See below for the definition of non-printing characters. If the pattern has the <b>/+</b> modifier, the output for substring 0 is followed by the rest of the subject string, identified by "0+" like @@ -881,15 +881,15 @@ been loaded, <b>pcretest</b> proceeds to read data lines in the usual way. You can copy a file written by <b>pcretest</b> to a different host and reload it there, even if the new host has opposite endianness to the one on which the pattern was compiled. For example, you can compile on an i86 machine and run on -a SPARC machine. 
When a pattern is reloaded on a host with different endianness, the confirmation message is changed to: <pre> Compiled pattern (byte-inverted) loaded from /some/file </pre> -The test suite contains some saved pre-compiled patterns with different -endianness. These are reloaded using "<!" instead of just "<". This suppresses -the "(byte-inverted)" text so that the output is the same on all hosts. It also -forces debugging output once the pattern has been reloaded. +The test suite contains some saved pre-compiled patterns with different +endianness. These are reloaded using "<!" instead of just "<". This suppresses +the "(byte-inverted)" text so that the output is the same on all hosts. It also +forces debugging output once the pattern has been reloaded. </P> <P> File names for saving and reloading can be absolute or relative, but note that diff --git a/doc/html/pcreunicode.html b/doc/html/pcreunicode.html index bacde25..e3c6d58 100644 --- a/doc/html/pcreunicode.html +++ b/doc/html/pcreunicode.html @@ -17,7 +17,7 @@ UTF-8, UTF-16, AND UNICODE PROPERTY SUPPORT </b><br> <P> From Release 8.30, in addition to its previous UTF-8 support, PCRE also -supports UTF-16 by means of a separate 16-bit library. This can be built as +supports UTF-16 by means of a separate 16-bit library. This can be built as well as, or instead of, the 8-bit library. </P> <br><b> @@ -82,7 +82,7 @@ range U+0 to U+10FFFF, excluding U+D800 to U+DFFF. </P> <P> The excluded code points are the "Surrogate Area" of Unicode. They are reserved -for use by UTF-16, where they are used in pairs to encode codepoints with +for use by UTF-16, where they are used in pairs to encode codepoints with values greater than 0xFFFF. The code points that are encoded by UTF-16 pairs are available independently in the UTF-8 encoding. (In other words, the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8.) @@ -161,7 +161,7 @@ two-byte characters for values greater than \177. 
data units, for example: \x{100}{3}. </P> <P> -4. The dot metacharacter matches one UTF character instead of a single data +4. The dot metacharacter matches one UTF character instead of a single data unit. </P> <P> @@ -179,7 +179,7 @@ be carried out by the normal interpretive function. <P> 6. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test characters of any code value, but, by default, the characters that PCRE -recognizes as digits, spaces, or word characters remain the same set as in +recognizes as digits, spaces, or word characters remain the same set as in non-UTF mode, all with values less than 256. This remains true even when PCRE is built to include Unicode property support, because to do otherwise would slow down PCRE in many common cases. Note in particular that this applies to diff --git a/doc/pcre-config.1 b/doc/pcre-config.1 index 445fe18..6f2c48f 100644 --- a/doc/pcre-config.1 +++ b/doc/pcre-config.1 @@ -6,7 +6,7 @@ pcre-config - program to return PCRE configuration .sp .B pcre-config [--prefix] [--exec-prefix] [--version] [--libs] .ti +5n -.B [--libs16] [--libs-cpp] [--libs-posix] [--cflags] +.B [--libs16] [--libs-cpp] [--libs-posix] [--cflags] .ti +5n .B [--cflags-posix] . @@ -15,9 +15,9 @@ pcre-config - program to return PCRE configuration .rs .sp \fBpcre-config\fP returns the configuration of the installed PCRE -libraries and the options required to compile a program to use them. Some of -the options apply only to the 8-bit or 16-bit libraries, respectively, and are -not available if only one of those libraries has been built. If an unavailable +libraries and the options required to compile a program to use them. Some of +the options apply only to the 8-bit or 16-bit libraries, respectively, and are +not available if only one of those libraries has been built. If an unavailable option is encountered, the "usage" information is output. . . 
@@ -11,11 +11,11 @@ appeared in Perl are also available using the Python syntax, there is some support for one or two .NET and Oniguruma syntax items, and there is an option for requesting some minor changes that give better JavaScript compatibility. .P -Starting with release 8.30, it is possible to compile two separate PCRE +Starting with release 8.30, it is possible to compile two separate PCRE libraries: the original, which supports 8-bit character strings (including UTF-8 strings), and a second library that supports 16-bit character strings (including UTF-16 strings). The build process allows either one or both to be -built. The majority of the work to make this possible was done by Zoltan +built. The majority of the work to make this possible was done by Zoltan Herczeg. .P The two libraries contain identical sets of functions, except that the names in @@ -26,8 +26,8 @@ library described separately in the .\" HREF \fBpcre16\fP .\" -page. References to functions or structures of the form \fIpcre[16]_xxx\fP -should be read as meaning "\fIpcre_xxx\fP when using the 8-bit library and +page. References to functions or structures of the form \fIpcre[16]_xxx\fP +should be read as meaning "\fIpcre_xxx\fP when using the 8-bit library and \fIpcre16_xxx\fP when using the 16-bit library". .P The current implementation of PCRE corresponds approximately with Perl 5.12, @@ -106,7 +106,7 @@ all the sections, except the \fBpcredemo\fP section, are concatenated, for ease of searching. The sections are as follows: .sp pcre this document - pcre16 details of the 16-bit library + pcre16 details of the 16-bit library pcre-config show PCRE installation configuration information pcreapi details of PCRE's native C API pcrebuild options for building PCRE diff --git a/doc/pcre.txt b/doc/pcre.txt index 6740394..c9a7b2e 100644 --- a/doc/pcre.txt +++ b/doc/pcre.txt @@ -138,8 +138,8 @@ REVISION Last updated: 10 January 2012 Copyright (c) 1997-2012 University of Cambridge. 
------------------------------------------------------------------------------ - - + + PCRE(3) PCRE(3) @@ -463,8 +463,8 @@ REVISION Last updated: 08 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCREBUILD(3) PCREBUILD(3) @@ -859,8 +859,8 @@ REVISION Last updated: 07 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCREMATCHING(3) PCREMATCHING(3) @@ -1066,8 +1066,8 @@ REVISION Last updated: 08 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCREAPI(3) PCREAPI(3) @@ -1405,6 +1405,14 @@ CHECKING BUILD-TIME OPTIONS The output is an integer that is set to one if support for just-in-time compiling is available; otherwise it is set to zero. + PCRE_CONFIG_JITTARGET + + The output is a pointer to a zero-terminated "const char *" string. If + JIT support is available, the string contains the name of the architec- + ture for which the JIT compiler is configured, for example "x86 32bit + (little endian + unaligned)". If JIT support is not available, the + result is NULL. + PCRE_CONFIG_NEWLINE The output is an integer whose value specifies the default character @@ -3255,6 +3263,31 @@ FINDING ALL POSSIBLE MATCHES matches, pcre_exec() will yield PCRE_ERROR_NOMATCH. +OBTAINING AN ESTIMATE OF STACK USAGE + + Matching certain patterns using pcre_exec() can use a lot of process + stack, which in certain environments can be rather limited in size. + Some users find it helpful to have an estimate of the amount of stack + that is used by pcre_exec(), to help them set recursion limits, as + described in the pcrestack documentation. 
The estimate that is output + by pcretest when called with the -m and -C options is obtained by call- + ing pcre_exec with the values NULL, NULL, NULL, -999, and -999 for its + first five arguments. + + Normally, if its first argument is NULL, pcre_exec() immediately + returns the negative error code PCRE_ERROR_NULL, but with this special + combination of arguments, it returns instead a negative number whose + absolute value is the approximate stack frame size in bytes. (A nega- + tive number is used so that it is clear that no match has happened.) + The value is approximate because in some cases, recursive calls to + pcre_exec() occur when there are one or two additional variables on the + stack. + + If PCRE has been compiled to use the heap instead of the stack for + recursion, the value returned is the size of each block that is + obtained from the heap. + + MATCHING A PATTERN: THE ALTERNATIVE FUNCTION int pcre_dfa_exec(const pcre *code, const pcre_extra *extra, @@ -3436,11 +3469,11 @@ AUTHOR REVISION - Last updated: 07 January 2012 + Last updated: 21 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCRECALLOUT(3) PCRECALLOUT(3) @@ -3638,8 +3671,8 @@ REVISION Last updated: 08 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCRECOMPAT(3) PCRECOMPAT(3) @@ -3813,8 +3846,8 @@ REVISION Last updated: 08 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCREPATTERN(3) PCREPATTERN(3) @@ -6418,8 +6451,8 @@ REVISION Last updated: 09 January 2012 Copyright (c) 1997-2012 University of Cambridge. 
------------------------------------------------------------------------------ - - + + PCRESYNTAX(3) PCRESYNTAX(3) @@ -6794,8 +6827,8 @@ REVISION Last updated: 10 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCREUNICODE(3) PCREUNICODE(3) @@ -6992,8 +7025,8 @@ REVISION Last updated: 13 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCREJIT(3) PCREJIT(3) @@ -7348,8 +7381,8 @@ REVISION Last updated: 08 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCREPARTIAL(3) PCREPARTIAL(3) @@ -7469,111 +7502,112 @@ PARTIAL MATCHING USING pcre_exec() OR pcre16_exec() plete match. For this reason, the assumption is made that the end of the supplied subject string may not be the true end of the available data, and so, if \z, \Z, \b, \B, or $ are encountered at the end of the - subject, the result is PCRE_ERROR_PARTIAL. + subject, the result is PCRE_ERROR_PARTIAL, provided that at least one + character in the subject has been inspected. Setting PCRE_PARTIAL_HARD also affects the way UTF-8 and UTF-16 subject - strings are checked for validity. Normally, an invalid sequence causes - the error PCRE_ERROR_BADUTF8 or PCRE_ERROR_BADUTF16. However, in the - special case of a truncated character at the end of the subject, - PCRE_ERROR_SHORTUTF8 or PCRE_ERROR_SHORTUTF16 is returned when + strings are checked for validity. Normally, an invalid sequence causes + the error PCRE_ERROR_BADUTF8 or PCRE_ERROR_BADUTF16. However, in the + special case of a truncated character at the end of the subject, + PCRE_ERROR_SHORTUTF8 or PCRE_ERROR_SHORTUTF16 is returned when PCRE_PARTIAL_HARD is set. 
Comparing hard and soft partial matching - The difference between the two partial matching options can be illus- + The difference between the two partial matching options can be illus- trated by a pattern such as: /dog(sbody)?/ - This matches either "dog" or "dogsbody", greedily (that is, it prefers - the longer string if possible). If it is matched against the string - "dog" with PCRE_PARTIAL_SOFT, it yields a complete match for "dog". + This matches either "dog" or "dogsbody", greedily (that is, it prefers + the longer string if possible). If it is matched against the string + "dog" with PCRE_PARTIAL_SOFT, it yields a complete match for "dog". However, if PCRE_PARTIAL_HARD is set, the result is PCRE_ERROR_PARTIAL. - On the other hand, if the pattern is made ungreedy the result is dif- + On the other hand, if the pattern is made ungreedy the result is dif- ferent: /dog(sbody)??/ - In this case the result is always a complete match because that is - found first, and matching never continues after finding a complete + In this case the result is always a complete match because that is + found first, and matching never continues after finding a complete match. It might be easier to follow this explanation by thinking of the two patterns like this: /dog(sbody)?/ is the same as /dogsbody|dog/ /dog(sbody)??/ is the same as /dog|dogsbody/ - The second pattern will never match "dogsbody", because it will always + The second pattern will never match "dogsbody", because it will always find the shorter match first. PARTIAL MATCHING USING pcre_dfa_exec() OR pcre16_dfa_exec() The DFA functions move along the subject string character by character, - without backtracking, searching for all possible matches simultane- - ously. If the end of the subject is reached before the end of the pat- - tern, there is the possibility of a partial match, again provided that + without backtracking, searching for all possible matches simultane- + ously. 
If the end of the subject is reached before the end of the pat- + tern, there is the possibility of a partial match, again provided that at least one character has been inspected. - When PCRE_PARTIAL_SOFT is set, PCRE_ERROR_PARTIAL is returned only if - there have been no complete matches. Otherwise, the complete matches - are returned. However, if PCRE_PARTIAL_HARD is set, a partial match - takes precedence over any complete matches. The portion of the string - that was inspected when the longest partial match was found is set as + When PCRE_PARTIAL_SOFT is set, PCRE_ERROR_PARTIAL is returned only if + there have been no complete matches. Otherwise, the complete matches + are returned. However, if PCRE_PARTIAL_HARD is set, a partial match + takes precedence over any complete matches. The portion of the string + that was inspected when the longest partial match was found is set as the first matching string, provided there are at least two slots in the offsets vector. - Because the DFA functions always search for all possible matches, and - there is no difference between greedy and ungreedy repetition, their - behaviour is different from the standard functions when PCRE_PAR- - TIAL_HARD is set. Consider the string "dog" matched against the + Because the DFA functions always search for all possible matches, and + there is no difference between greedy and ungreedy repetition, their + behaviour is different from the standard functions when PCRE_PAR- + TIAL_HARD is set. Consider the string "dog" matched against the ungreedy pattern shown above: /dog(sbody)??/ - Whereas the standard functions stop as soon as they find the complete - match for "dog", the DFA functions also find the partial match for + Whereas the standard functions stop as soon as they find the complete + match for "dog", the DFA functions also find the partial match for "dogsbody", and so return that when PCRE_PARTIAL_HARD is set. 
PARTIAL MATCHING AND WORD BOUNDARIES - If a pattern ends with one of sequences \b or \B, which test for word - boundaries, partial matching with PCRE_PARTIAL_SOFT can give counter- + If a pattern ends with one of sequences \b or \B, which test for word + boundaries, partial matching with PCRE_PARTIAL_SOFT can give counter- intuitive results. Consider this pattern: /\bcat\b/ This matches "cat", provided there is a word boundary at either end. If the subject string is "the cat", the comparison of the final "t" with a - following character cannot take place, so a partial match is found. - However, normal matching carries on, and \b matches at the end of the - subject when the last character is a letter, so a complete match is - found. The result, therefore, is not PCRE_ERROR_PARTIAL. Using - PCRE_PARTIAL_HARD in this case does yield PCRE_ERROR_PARTIAL, because + following character cannot take place, so a partial match is found. + However, normal matching carries on, and \b matches at the end of the + subject when the last character is a letter, so a complete match is + found. The result, therefore, is not PCRE_ERROR_PARTIAL. Using + PCRE_PARTIAL_HARD in this case does yield PCRE_ERROR_PARTIAL, because then the partial match takes precedence. FORMERLY RESTRICTED PATTERNS For releases of PCRE prior to 8.00, because of the way certain internal - optimizations were implemented in the pcre_exec() function, the - PCRE_PARTIAL option (predecessor of PCRE_PARTIAL_SOFT) could not be - used with all patterns. From release 8.00 onwards, the restrictions no - longer apply, and partial matching with can be requested for any pat- + optimizations were implemented in the pcre_exec() function, the + PCRE_PARTIAL option (predecessor of PCRE_PARTIAL_SOFT) could not be + used with all patterns. From release 8.00 onwards, the restrictions no + longer apply, and partial matching with can be requested for any pat- tern. 
Items that were formerly restricted were repeated single characters and - repeated metasequences. If PCRE_PARTIAL was set for a pattern that did - not conform to the restrictions, pcre_exec() returned the error code - PCRE_ERROR_BADPARTIAL (-13). This error code is no longer in use. The - PCRE_INFO_OKPARTIAL call to pcre_fullinfo() to find out if a compiled + repeated metasequences. If PCRE_PARTIAL was set for a pattern that did + not conform to the restrictions, pcre_exec() returned the error code + PCRE_ERROR_BADPARTIAL (-13). This error code is no longer in use. The + PCRE_INFO_OKPARTIAL call to pcre_fullinfo() to find out if a compiled pattern can be used for partial matching now always returns 1. EXAMPLE OF PARTIAL MATCHING USING PCRETEST - If the escape sequence \P is present in a pcretest data line, the - PCRE_PARTIAL_SOFT option is used for the match. Here is a run of + If the escape sequence \P is present in a pcretest data line, the + PCRE_PARTIAL_SOFT option is used for the match. Here is a run of pcretest that uses the date example quoted above: re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ @@ -7589,24 +7623,24 @@ EXAMPLE OF PARTIAL MATCHING USING PCRETEST data> j\P No match - The first data string is matched completely, so pcretest shows the - matched substrings. The remaining four strings do not match the com- + The first data string is matched completely, so pcretest shows the + matched substrings. The remaining four strings do not match the com- plete pattern, but the first two are partial matches. Similar output is obtained if DFA matching is used. - If the escape sequence \P is present more than once in a pcretest data + If the escape sequence \P is present more than once in a pcretest data line, the PCRE_PARTIAL_HARD option is set for the match. 
MULTI-SEGMENT MATCHING WITH pcre_dfa_exec() OR pcre16_dfa_exec() - When a partial match has been found using a DFA matching function, it - is possible to continue the match by providing additional subject data - and calling the function again with the same compiled regular expres- - sion, this time setting the PCRE_DFA_RESTART option. You must pass the + When a partial match has been found using a DFA matching function, it + is possible to continue the match by providing additional subject data + and calling the function again with the same compiled regular expres- + sion, this time setting the PCRE_DFA_RESTART option. You must pass the same working space as before, because this is where details of the pre- - vious partial match are stored. Here is an example using pcretest, - using the \R escape sequence to set the PCRE_DFA_RESTART option (\D + vious partial match are stored. Here is an example using pcretest, + using the \R escape sequence to set the PCRE_DFA_RESTART option (\D specifies the use of the DFA matching function): re> /^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$/ @@ -7615,47 +7649,47 @@ MULTI-SEGMENT MATCHING WITH pcre_dfa_exec() OR pcre16_dfa_exec() data> n05\R\D 0: n05 - The first call has "23ja" as the subject, and requests partial match- - ing; the second call has "n05" as the subject for the continued - (restarted) match. Notice that when the match is complete, only the - last part is shown; PCRE does not retain the previously partially- - matched string. It is up to the calling program to do that if it needs + The first call has "23ja" as the subject, and requests partial match- + ing; the second call has "n05" as the subject for the continued + (restarted) match. Notice that when the match is complete, only the + last part is shown; PCRE does not retain the previously partially- + matched string. It is up to the calling program to do that if it needs to. 
- You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with - PCRE_DFA_RESTART to continue partial matching over multiple segments. - This facility can be used to pass very long subject strings to the DFA + You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with + PCRE_DFA_RESTART to continue partial matching over multiple segments. + This facility can be used to pass very long subject strings to the DFA matching functions. MULTI-SEGMENT MATCHING WITH pcre_exec() OR pcre16_exec() - From release 8.00, the standard matching functions can also be used to + From release 8.00, the standard matching functions can also be used to do multi-segment matching. Unlike the DFA functions, it is not possible - to restart the previous match with a new segment of data. Instead, new + to restart the previous match with a new segment of data. Instead, new data must be added to the previous subject string, and the entire match - re-run, starting from the point where the partial match occurred. Ear- + re-run, starting from the point where the partial match occurred. Ear- lier data can be discarded. - It is best to use PCRE_PARTIAL_HARD in this situation, because it does - not treat the end of a segment as the end of the subject when matching - \z, \Z, \b, \B, and $. Consider an unanchored pattern that matches + It is best to use PCRE_PARTIAL_HARD in this situation, because it does + not treat the end of a segment as the end of the subject when matching + \z, \Z, \b, \B, and $. Consider an unanchored pattern that matches dates: re> /\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d/ data> The date is 23ja\P\P Partial match: 23ja - At this stage, an application could discard the text preceding "23ja", - add on text from the next segment, and call the matching function - again. 
Unlike the DFA matching functions the entire matching string - must always be available, and the complete matching process occurs for + At this stage, an application could discard the text preceding "23ja", + add on text from the next segment, and call the matching function + again. Unlike the DFA matching functions the entire matching string + must always be available, and the complete matching process occurs for each call, so more memory and more processing time is needed. - Note: If the pattern contains lookbehind assertions, or \K, or starts + Note: If the pattern contains lookbehind assertions, or \K, or starts with \b or \B, the string that is returned for a partial match includes - characters that precede the partially matched string itself, because - these must be retained when adding on more characters for a subsequent + characters that precede the partially matched string itself, because + these must be retained when adding on more characters for a subsequent matching attempt. @@ -7665,28 +7699,28 @@ ISSUES WITH MULTI-SEGMENT MATCHING whichever matching function is used. 1. If the pattern contains a test for the beginning of a line, you need - to pass the PCRE_NOTBOL option when the subject string for any call - does start at the beginning of a line. There is also a PCRE_NOTEOL + to pass the PCRE_NOTBOL option when the subject string for any call + does start at the beginning of a line. There is also a PCRE_NOTEOL option, but in practice when doing multi-segment matching you should be using PCRE_PARTIAL_HARD, which includes the effect of PCRE_NOTEOL. - 2. Lookbehind assertions at the start of a pattern are catered for in - the offsets that are returned for a partial match. However, in theory, - a lookbehind assertion later in the pattern could require even earlier - characters to be inspected, and it might not have been reached when a - partial match occurs. 
This is probably an extremely unlikely case; you - could guard against it to a certain extent by always including extra + 2. Lookbehind assertions at the start of a pattern are catered for in + the offsets that are returned for a partial match. However, in theory, + a lookbehind assertion later in the pattern could require even earlier + characters to be inspected, and it might not have been reached when a + partial match occurs. This is probably an extremely unlikely case; you + could guard against it to a certain extent by always including extra characters at the start. - 3. Matching a subject string that is split into multiple segments may - not always produce exactly the same result as matching over one single - long string, especially when PCRE_PARTIAL_SOFT is used. The section - "Partial Matching and Word Boundaries" above describes an issue that - arises if the pattern ends with \b or \B. Another kind of difference - may occur when there are multiple matching possibilities, because (for - PCRE_PARTIAL_SOFT) a partial match result is given only when there are + 3. Matching a subject string that is split into multiple segments may + not always produce exactly the same result as matching over one single + long string, especially when PCRE_PARTIAL_SOFT is used. The section + "Partial Matching and Word Boundaries" above describes an issue that + arises if the pattern ends with \b or \B. Another kind of difference + may occur when there are multiple matching possibilities, because (for + PCRE_PARTIAL_SOFT) a partial match result is given only when there are no completed matches. This means that as soon as the shortest match has - been found, continuation to a new subject segment is no longer possi- + been found, continuation to a new subject segment is no longer possi- ble. 
Consider again this pcretest example: re> /dog(sbody)?/ @@ -7700,18 +7734,18 @@ ISSUES WITH MULTI-SEGMENT MATCHING 0: dogsbody 1: dog - The first data line passes the string "dogsb" to a standard matching - function, setting the PCRE_PARTIAL_SOFT option. Although the string is - a partial match for "dogsbody", the result is not PCRE_ERROR_PARTIAL, - because the shorter string "dog" is a complete match. Similarly, when - the subject is presented to a DFA matching function in several parts - ("do" and "gsb" being the first two) the match stops when "dog" has - been found, and it is not possible to continue. On the other hand, if - "dogsbody" is presented as a single string, a DFA matching function + The first data line passes the string "dogsb" to a standard matching + function, setting the PCRE_PARTIAL_SOFT option. Although the string is + a partial match for "dogsbody", the result is not PCRE_ERROR_PARTIAL, + because the shorter string "dog" is a complete match. Similarly, when + the subject is presented to a DFA matching function in several parts + ("do" and "gsb" being the first two) the match stops when "dog" has + been found, and it is not possible to continue. On the other hand, if + "dogsbody" is presented as a single string, a DFA matching function finds both matches. - Because of these problems, it is best to use PCRE_PARTIAL_HARD when - matching multi-segment data. The example above then behaves differ- + Because of these problems, it is best to use PCRE_PARTIAL_HARD when + matching multi-segment data. The example above then behaves differ- ently: re> /dog(sbody)?/ @@ -7723,25 +7757,25 @@ ISSUES WITH MULTI-SEGMENT MATCHING Partial match: gsb 4. Patterns that contain alternatives at the top level which do not all - start with the same pattern item may not work as expected when + start with the same pattern item may not work as expected when PCRE_DFA_RESTART is used. 
For example, consider this pattern: 1234|3789 - If the first part of the subject is "ABC123", a partial match of the - first alternative is found at offset 3. There is no partial match for + If the first part of the subject is "ABC123", a partial match of the + first alternative is found at offset 3. There is no partial match for the second alternative, because such a match does not start at the same - point in the subject string. Attempting to continue with the string - "7890" does not yield a match because only those alternatives that - match at one point in the subject are remembered. The problem arises - because the start of the second alternative matches within the first - alternative. There is no problem with anchored patterns or patterns + point in the subject string. Attempting to continue with the string + "7890" does not yield a match because only those alternatives that + match at one point in the subject are remembered. The problem arises + because the start of the second alternative matches within the first + alternative. There is no problem with anchored patterns or patterns such as: 1234|ABCD - where no string can be a partial match for both alternatives. This is - not a problem if a standard matching function is used, because the + where no string can be a partial match for both alternatives. This is + not a problem if a standard matching function is used, because the entire match has to be rerun each time: re> /1234|3789/ @@ -7751,10 +7785,10 @@ ISSUES WITH MULTI-SEGMENT MATCHING 0: 3789 Of course, instead of using PCRE_DFA_RESTART, the same technique of re- - running the entire match can also be used with the DFA matching func- - tions. Another possibility is to work with two buffers. If a partial - match at offset n in the first buffer is followed by "no match" when - PCRE_DFA_RESTART is used on the second buffer, you can then try a new + running the entire match can also be used with the DFA matching func- + tions. 
Another possibility is to work with two buffers. If a partial + match at offset n in the first buffer is followed by "no match" when + PCRE_DFA_RESTART is used on the second buffer, you can then try a new match starting at offset n+1 in the first buffer. @@ -7767,11 +7801,11 @@ AUTHOR REVISION - Last updated: 08 January 2012 + Last updated: 21 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCREPRECOMPILE(3) PCREPRECOMPILE(3) @@ -7905,8 +7939,8 @@ REVISION Last updated: 10 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCREPERFORM(3) PCREPERFORM(3) @@ -8075,8 +8109,8 @@ REVISION Last updated: 09 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCREPOSIX(3) PCREPOSIX(3) @@ -8339,8 +8373,8 @@ REVISION Last updated: 09 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCRECPP(3) PCRECPP(3) @@ -8681,8 +8715,8 @@ REVISION Last updated: 08 January 2012 ------------------------------------------------------------------------------ - - + + PCRESAMPLE(3) PCRESAMPLE(3) @@ -8825,8 +8859,8 @@ REVISION Last updated: 08 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + PCRESTACK(3) PCRESTACK(3) @@ -8944,6 +8978,30 @@ PCRE DISCUSSION OF STACK USAGE subject string. This is done by calling pcre[16]_exec() repeatedly with different limits. + Obtaining an estimate of stack usage + + The actual amount of stack used per recursion can vary quite a lot, + depending on the compiler that was used to build PCRE and the optimiza- + tion or debugging options that were set for it. 
The rule of thumb value + of 500 bytes mentioned above may be larger or smaller than what is + actually needed. A better approximation can be obtained by running this + command: + + pcretest -m -C + + The -C option causes pcretest to output information about the options + with which PCRE was compiled. When -m is also given (before -C), infor- + mation about stack use is given in a line like this: + + Match recursion uses stack: approximate frame size = 640 bytes + + The value is approximate because some recursions need a bit more (up to + perhaps 16 more bytes). + + If the above command is given when PCRE is compiled to use the heap + instead of the stack for recursion, the value that is output is the + size of each block that is obtained from the heap. + Changing stack size in Unix-like systems In Unix-like environments, there is not often a problem with the stack @@ -8983,8 +9041,8 @@ AUTHOR REVISION - Last updated: 10 January 2012 + Last updated: 21 January 2012 Copyright (c) 1997-2012 University of Cambridge. ------------------------------------------------------------------------------ - - + + diff --git a/doc/pcre16.3 b/doc/pcre16.3 index c206e44..726ef90 100644 --- a/doc/pcre16.3 +++ b/doc/pcre16.3 @@ -139,7 +139,7 @@ PCRE - Perl-compatible regular expressions .sp .B int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *\fIoutput\fP, .ti +5n -.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIbyte_order\fP, +.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIbyte_order\fP, .ti +5n .B int \fIkeep_boms\fP); . @@ -158,8 +158,8 @@ PCRE documentation describes the 8-bit library, with only occasional references to the 16-bit library. This page describes what is different when you use the 16-bit library. 
.P -WARNING: A single application can be linked with both libraries, but you must -take care when processing any particular pattern to use functions from just one +WARNING: A single application can be linked with both libraries, but you must +take care when processing any particular pattern to use functions from just one library. For example, if you want to study a pattern that was compiled with \fBpcre16_compile()\fP, you must do so with \fBpcre16_study()\fP, not \fBpcre_study()\fP, and you must free the study data with @@ -169,7 +169,7 @@ library. For example, if you want to study a pattern that was compiled with .SH "THE HEADER FILE" .rs .sp -There is only one header file, \fBpcre.h\fP. It contains prototypes for all the +There is only one header file, \fBpcre.h\fP. It contains prototypes for all the functions in both libraries, as well as definitions of flags, structures, error codes, etc. . @@ -177,34 +177,34 @@ codes, etc. .SH "THE LIBRARY NAME" .rs .sp -In Unix-like systems, the 16-bit library is called \fBlibpcre16\fP, and can -normally be accesss by adding \fB-lpcre16\fP to the command for linking an +In Unix-like systems, the 16-bit library is called \fBlibpcre16\fP, and can +normally be accesss by adding \fB-lpcre16\fP to the command for linking an application that uses PCRE. . . .SH "STRING TYPES" .rs .sp -In the 8-bit library, strings are passed to PCRE library functions as vectors -of bytes with the C type "char *". In the 16-bit library, strings are passed as -vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an -appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In -very many environments, "short int" is a 16-bit data type. When PCRE is built, -it defines PCRE_UCHAR16 as "short int", but checks that it really is a 16-bit -data type. 
If it is not, the build fails with an error message telling the +In the 8-bit library, strings are passed to PCRE library functions as vectors +of bytes with the C type "char *". In the 16-bit library, strings are passed as +vectors of unsigned 16-bit quantities. The macro PCRE_UCHAR16 specifies an +appropriate data type, and PCRE_SPTR16 is defined as "const PCRE_UCHAR16 *". In +very many environments, "short int" is a 16-bit data type. When PCRE is built, +it defines PCRE_UCHAR16 as "short int", but checks that it really is a 16-bit +data type. If it is not, the build fails with an error message telling the maintainer to modify the definition appropriately. . . .SH "STRUCTURE TYPES" .rs .sp -The types of the opaque structures that are used for compiled 16-bit patterns -and JIT stacks are \fBpcre16\fP and \fBpcre16_jit_stack\fP respectively. The -type of the user-accessible structure that is returned by \fBpcre16_study()\fP +The types of the opaque structures that are used for compiled 16-bit patterns +and JIT stacks are \fBpcre16\fP and \fBpcre16_jit_stack\fP respectively. The +type of the user-accessible structure that is returned by \fBpcre16_study()\fP is \fBpcre16_extra\fP, and the type of the structure that is used for passing -data to a callout function is \fBpcre16_callout_block\fP. These structures -contain the same fields, with the same names, as their 8-bit counterparts. The -only difference is that pointers to character strings are 16-bit instead of +data to a callout function is \fBpcre16_callout_block\fP. These structures +contain the same fields, with the same names, as their 8-bit counterparts. The +only difference is that pointers to character strings are 16-bit instead of 8-bit types. . . 
@@ -212,31 +212,31 @@ only difference is that pointers to character strings are 16-bit instead of .rs .sp For every function in the 8-bit library there is a corresponding function in -the 16-bit library with a name that starts with \fBpcre16_\fP instead of +the 16-bit library with a name that starts with \fBpcre16_\fP instead of \fBpcre_\fP. The prototypes are listed above. In addition, there is one extra -function, \fBpcre16_utf16_to_host_byte_order()\fP. This is a utility function -that converts a UTF-16 character string to host byte order if necessary. The -other 16-bit functions expect the strings they are passed to be in host byte -order. +function, \fBpcre16_utf16_to_host_byte_order()\fP. This is a utility function +that converts a UTF-16 character string to host byte order if necessary. The +other 16-bit functions expect the strings they are passed to be in host byte +order. .P The \fIinput\fP and \fIoutput\fP arguments of -\fBpcre16_utf16_to_host_byte_order()\fP may point to the same address, that is, -conversion in place is supported. The output buffer must be at least as long as +\fBpcre16_utf16_to_host_byte_order()\fP may point to the same address, that is, +conversion in place is supported. The output buffer must be at least as long as the input. .P The \fIlength\fP argument specifies the number of 16-bit data units in the input string; a negative value specifies a zero-terminated string. .P -If \fIbyte_order\fP is NULL, it is assumed that the string starts off in host +If \fIbyte_order\fP is NULL, it is assumed that the string starts off in host byte order. This may be changed by byte-order marks (BOMs) anywhere in the string (commonly as the first character). .P -If \fIbyte_order\fP is not NULL, a non-zero value of the integer to which it -points means that the input starts off in host byte order, otherwise the -opposite order is assumed. Again, BOMs in the string can change this. The final -byte order is passed back at the end of processing. 
+If \fIbyte_order\fP is not NULL, a non-zero value of the integer to which it +points means that the input starts off in host byte order, otherwise the +opposite order is assumed. Again, BOMs in the string can change this. The final +byte order is passed back at the end of processing. .P -If \fIkeep_boms\fP is not zero, byte-order mark characters (0xfeff) are copied +If \fIkeep_boms\fP is not zero, byte-order mark characters (0xfeff) are copied into the output string. Otherwise they are discarded. .P The result of the function is the number of 16-bit units placed into the output @@ -246,16 +246,16 @@ buffer, including the zero terminator if the string was zero-terminated. .SH "SUBJECT STRING OFFSETS" .rs .sp -The offsets within subject strings that are returned by the matching functions +The offsets within subject strings that are returned by the matching functions are in 16-bit units rather than bytes. . . .SH "NAMED SUBPATTERNS" .rs .sp -The name-to-number translation table that is maintained for named subpatterns -uses 16-bit characters. The \fBpcre16_get_stringtable_entries()\fP function -returns the length of each entry in the table as the number of 16-bit data +The name-to-number translation table that is maintained for named subpatterns +uses 16-bit characters. The \fBpcre16_get_stringtable_entries()\fP function +returns the length of each entry in the table as the number of 16-bit data units. . . @@ -266,7 +266,7 @@ There are two new general option names, PCRE_UTF16 and PCRE_NO_UTF16_CHECK, which correspond to PCRE_UTF8 and PCRE_NO_UTF8_CHECK in the 8-bit library. In fact, these new options define the same bits in the options word. .P -For the \fBpcre16_config()\fP function there is an option PCRE_CONFIG_UTF16 +For the \fBpcre16_config()\fP function there is an option PCRE_CONFIG_UTF16 that returns 1 if UTF-16 support is configured, otherwise 0. 
If this option is given to \fBpcre_config()\fP, or if the PCRE_CONFIG_UTF8 option is given to \fBpcre16_config()\fP, the result is the PCRE_ERROR_BADOPTION error. @@ -275,18 +275,18 @@ given to \fBpcre_config()\fP, or if the PCRE_CONFIG_UTF8 option is given to .SH "CHARACTER CODES" .rs .sp -In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the -same way as in 8-bit, non UTF-8 mode, except, of course, that they can range -from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than -0xff can therefore be influenced by the locale in the same way as before. -Characters greater than 0xff have only one case, and no "type" (such as letter +In 16-bit mode, when PCRE_UTF16 is not set, character values are treated in the +same way as in 8-bit, non UTF-8 mode, except, of course, that they can range +from 0 to 0xffff instead of 0 to 0xff. Character types for characters less than +0xff can therefore be influenced by the locale in the same way as before. +Characters greater than 0xff have only one case, and no "type" (such as letter or digit). .P -In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with -the exception of values in the range 0xd800 to 0xdfff because those are +In UTF-16 mode, the character code is Unicode, in the range 0 to 0x10ffff, with +the exception of values in the range 0xd800 to 0xdfff because those are "surrogate" values that are used in pairs to encode values greater than 0xffff. .P -A UTF-16 string can indicate its endianness by special code knows as a +A UTF-16 string can indicate its endianness by special code knows as a byte-order mark (BOM). The PCRE functions do not handle this, expecting strings to be in host byte order. A utility function called \fBpcre16_utf16_to_host_byte_order()\fP is provided to help with this (see @@ -296,20 +296,20 @@ above). 
.SH "ERROR NAMES" .rs .sp -The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to +The errors PCRE_ERROR_BADUTF16_OFFSET and PCRE_ERROR_SHORTUTF16 correspond to their 8-bit counterparts. The error PCRE_ERROR_BADMODE is given when a compiled pattern is passed to a function that processes patterns in the other -mode, for example, if a pattern compiled with \fBpcre_compile()\fP is passed to +mode, for example, if a pattern compiled with \fBpcre_compile()\fP is passed to \fBpcre16_exec()\fP. .P There are new error codes whose names begin with PCRE_UTF16_ERR for invalid -UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that +UTF-16 strings, corresponding to the PCRE_UTF8_ERR codes for UTF-8 strings that are described in the section entitled .\" HTML <a href="pcreapi.html#badutf8reasons"> .\" </a> "Reason codes for invalid UTF-8 strings" .\" -in the main +in the main .\" HREF \fBpcreapi\fP .\" @@ -324,8 +324,8 @@ page. The UTF-16 errors are: .SH "ERROR TEXTS" .rs .sp -If there is an error while compiling a pattern, the error text that is passed -back by \fBpcre16_compile()\fP or \fBpcre16_compile2()\fP is still an 8-bit +If there is an error while compiling a pattern, the error text that is passed +back by \fBpcre16_compile()\fP or \fBpcre16_compile2()\fP is still an 8-bit character string, zero-terminated. . . @@ -339,15 +339,15 @@ a callout function point to 16-bit vectors. .SH "TESTING" .rs .sp -The \fBpcretest\fP program continues to operate with 8-bit input and output -files, but it can be used for testing the 16-bit library. If it is run with the -command line option \fB-16\fP, patterns and subject strings are converted from -8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions -are used instead of the 8-bit ones. Returned 16-bit strings are converted to +The \fBpcretest\fP program continues to operate with 8-bit input and output +files, but it can be used for testing the 16-bit library. 
If it is run with the +command line option \fB-16\fP, patterns and subject strings are converted from +8-bit to 16-bit before being passed to PCRE, and the 16-bit library functions +are used instead of the 8-bit ones. Returned 16-bit strings are converted to 8-bit for output. If the 8-bit library was not compiled, \fBpcretest\fP defaults to 16-bit and the \fB-16\fP option is ignored. .P -When PCRE is being built, the \fBRunTest\fP script that is called by "make +When PCRE is being built, the \fBRunTest\fP script that is called by "make check" uses the \fBpcretest\fP \fB-C\fP option to discover which of the 8-bit and 16-bit libraries has been built, and runs the tests appropriately. . @@ -355,8 +355,8 @@ and 16-bit libraries has been built, and runs the tests appropriately. .SH "NOT SUPPORTED IN 16-BIT MODE" .rs .sp -Not all the features of the 8-bit library are available with the 16-bit -library. The C++ and POSIX wrapper functions support only the 8-bit library, +Not all the features of the 8-bit library are available with the 16-bit +library. The C++ and POSIX wrapper functions support only the 8-bit library, and the \fBpcregrep\fP program is at present 8-bit only. . . diff --git a/doc/pcre_config.3 b/doc/pcre_config.3 index 8ebf27f..ac298c2 100644 --- a/doc/pcre_config.3 +++ b/doc/pcre_config.3 @@ -28,7 +28,7 @@ point to an unsigned long integer. The available codes are: PCRE_CONFIG_JIT Availability of just-in-time compiler support (1=yes 0=no) PCRE_CONFIG_JITTARGET String containing information about the - target architecture for the JIT compiler, + target architecture for the JIT compiler, or NULL if there is no JIT support PCRE_CONFIG_LINK_SIZE Internal link size: 2, 3, or 4 PCRE_CONFIG_MATCH_LIMIT Internal resource limit @@ -55,7 +55,7 @@ point to an unsigned long integer. The available codes are: Availability of Unicode property support (1=yes 0=no) .sp -The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise. 
That error +The function yields 0 on success or PCRE_ERROR_BADOPTION otherwise. That error is also given if PCRE_CONFIG_UTF16 is passed to \fBpcre_config()\fP or if PCRE_CONFIG_UTF8 is passed to \fBpcre16_config()\fP. .P diff --git a/doc/pcre_fullinfo.3 b/doc/pcre_fullinfo.3 index c16406b..7ba6532 100644 --- a/doc/pcre_fullinfo.3 +++ b/doc/pcre_fullinfo.3 @@ -38,7 +38,7 @@ The following information is available: PCRE_INFO_HASCRORLF Return 1 if explicit CR or LF matches exist PCRE_INFO_JCHANGED Return 1 if (?J) or (?-J) was used PCRE_INFO_JIT Return 1 after successful JIT compilation - PCRE_INFO_JITSIZE Size of JIT compiled code + PCRE_INFO_JITSIZE Size of JIT compiled code PCRE_INFO_LASTLITERAL Literal last data unit required PCRE_INFO_MINLENGTH Lower bound length of matching strings PCRE_INFO_NAMECOUNT Number of named subpatterns diff --git a/doc/pcre_jit_stack_alloc.3 b/doc/pcre_jit_stack_alloc.3 index 1c97f30..0392839 100644 --- a/doc/pcre_jit_stack_alloc.3 +++ b/doc/pcre_jit_stack_alloc.3 @@ -11,7 +11,7 @@ PCRE - Perl-compatible regular expressions .ti +5n .B int \fImaxsize\fP); .PP -.B pcre16_jit_stack *pcre16_jit_stack_alloc(int \fIstartsize\fP, +.B pcre16_jit_stack *pcre16_jit_stack_alloc(int \fIstartsize\fP, .ti +5n .B int \fImaxsize\fP); . 
diff --git a/doc/pcre_pattern_to_host_byte_order.3 b/doc/pcre_pattern_to_host_byte_order.3 index adb51c0..615cf55 100644 --- a/doc/pcre_pattern_to_host_byte_order.3 +++ b/doc/pcre_pattern_to_host_byte_order.3 @@ -9,7 +9,7 @@ PCRE - Perl-compatible regular expressions .SM .B int pcre_pattern_to_host_byte_order(pcre *\fIcode\fP, .ti +5n -.B pcre_extra *\fIextra\fP, const unsigned char *\fItables\fP); +.B pcre_extra *\fIextra\fP, const unsigned char *\fItables\fP); .PP .B int pcre16_pattern_to_host_byte_order(pcre16 *\fIcode\fP, .ti +5n @@ -20,8 +20,8 @@ PCRE - Perl-compatible regular expressions .rs .sp This function ensures that the bytes in 2-byte and 4-byte values in a compiled -pattern are in the correct order for the current host. It is useful when a -pattern that has been compiled on one host is transferred to another that might +pattern are in the correct order for the current host. It is useful when a +pattern that has been compiled on one host is transferred to another that might have different endianness. The arguments are: .sp \fIcode\fP A compiled regular expression diff --git a/doc/pcre_utf16_to_host_byte_order.3 b/doc/pcre_utf16_to_host_byte_order.3 index 557d208..f08ce1e 100644 --- a/doc/pcre_utf16_to_host_byte_order.3 +++ b/doc/pcre_utf16_to_host_byte_order.3 @@ -9,7 +9,7 @@ PCRE - Perl-compatible regular expressions .SM .B int pcre16_utf16_to_host_byte_order(PCRE_UCHAR16 *\fIoutput\fP, .ti +5n -.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIhost_byte_order\fP, +.B PCRE_SPTR16 \fIinput\fP, int \fIlength\fP, int *\fIhost_byte_order\fP, .ti +5n .B int \fIkeep_boms\fP); . 
@@ -17,8 +17,8 @@ PCRE - Perl-compatible regular expressions .SH DESCRIPTION .rs .sp -This function, which exists only in the 16-bit library, converts a UTF-16 -string to the correct order for the current host, taking account of any byte +This function, which exists only in the 16-bit library, converts a UTF-16 +string to the correct order for the current host, taking account of any byte order marks (BOMs) within the string. Its arguments are: .sp \fIoutput\fP pointer to output buffer, may be the same as \fIinput\fP diff --git a/doc/pcreapi.3 b/doc/pcreapi.3 index b6f2507..6263e7b 100644 --- a/doc/pcreapi.3 +++ b/doc/pcreapi.3 @@ -148,7 +148,7 @@ just use different data types for their arguments and results, and their names start with \fBpcre16_\fP instead of \fBpcre_\fP. For every option that has UTF8 in its name (for example, PCRE_UTF8), there is a corresponding 16-bit name with UTF8 replaced by UTF16. This facility is in fact just cosmetic; the 16-bit -option names define the same bit values. +option names define the same bit values. .P References to bytes and UTF-8 in this document should be read as references to 16-bit data quantities and UTF-16 when using the 16-bit library, unless @@ -157,7 +157,7 @@ library are given in the .\" HREF \fBpcre16\fP .\" -page. +page. . . .SH "PCRE API OVERVIEW" @@ -392,7 +392,7 @@ not recognized. The following information is available: PCRE_CONFIG_UTF8 .sp The output is an integer that is set to one if UTF-8 support is available; -otherwise it is set to zero. If this option is given to the 16-bit version of +otherwise it is set to zero. If this option is given to the 16-bit version of this function, \fBpcre16_config()\fP, the result is PCRE_ERROR_BADOPTION. .sp PCRE_CONFIG_UTF16 @@ -415,8 +415,8 @@ compiling is available; otherwise it is set to zero. PCRE_CONFIG_JITTARGET .sp The output is a pointer to a zero-terminated "const char *" string. 
If JIT -support is available, the string contains the name of the architecture for -which the JIT compiler is configured, for example "x86 32bit (little endian + +support is available, the string contains the name of the architecture for +which the JIT compiler is configured, for example "x86 32bit (little endian + unaligned)". If JIT support is not available, the result is NULL. .sp PCRE_CONFIG_NEWLINE @@ -742,7 +742,7 @@ preceding sequences should be recognized. Setting PCRE_NEWLINE_ANY specifies that any Unicode newline sequence should be recognized. The Unicode newline sequences are the three just mentioned, plus the single characters VT (vertical tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), LS (line -separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit +separator, U+2028), and PS (paragraph separator, U+2029). For the 8-bit library, the last two are recognized only in UTF-8 mode. .P The newline setting in the options word uses three bits that are treated @@ -819,11 +819,11 @@ page. .sp PCRE_NO_UTF8_CHECK .sp -When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 +When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is automatically checked. There is a discussion about the .\" HTML <a href="pcreunicode.html#utf8strings"> .\" </a> -validity of UTF-8 strings +validity of UTF-8 strings .\" in the .\" HREF @@ -843,7 +843,7 @@ validity checking of subject strings. .sp The following table lists the error codes than may be returned by \fBpcre_compile2()\fP, along with the error messages that may be returned by -both compiling functions. Note that error messages are always 8-bit ASCII +both compiling functions. Note that error messages are always 8-bit ASCII strings, even in 16-bit mode. As PCRE has developed, some error codes have fallen out of use. To avoid confusion, they have not been re-used. .sp @@ -917,14 +917,14 @@ fallen out of use. To avoid confusion, they have not been re-used. 
65 different names for subpatterns of the same number are not allowed 66 (*MARK) must have an argument - 67 this version of PCRE is not compiled with Unicode property + 67 this version of PCRE is not compiled with Unicode property support 68 \ec must be followed by an ASCII character 69 \ek is not followed by a braced, angle-bracketed, or quoted name 70 internal error: unknown opcode in find_fixedlength() 71 \eN is not supported in a class 72 too many forward references - 73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff) + 73 disallowed Unicode code point (>= 0xd800 && <= 0xdfff) 74 invalid UTF-16 string (specifically UTF-16) .sp The numbers 32 and 10000 in errors 48 and 49 are defaults; different values may @@ -1120,12 +1120,12 @@ the following negative numbers: PCRE_ERROR_NULL the argument \fIcode\fP was NULL the argument \fIwhere\fP was NULL PCRE_ERROR_BADMAGIC the "magic number" was not found - PCRE_ERROR_BADENDIANNESS the pattern was compiled with different + PCRE_ERROR_BADENDIANNESS the pattern was compiled with different endianness PCRE_ERROR_BADOPTION the value of \fIwhat\fP was invalid .sp The "magic number" is placed at the start of each compiled pattern as an simple -check against passing an arbitrary memory pointer. The endianness error can +check against passing an arbitrary memory pointer. The endianness error can occur if a compiled pattern is saved and reloaded on a different host. Here is a typical call of \fBpcre_fullinfo()\fP, to obtain the length of the compiled pattern: @@ -1168,8 +1168,8 @@ where data units are bytes.) The fourth argument should point to an \fBint\fP variable. .P If there is a fixed first value, for example, the letter "c" from a pattern -such as (cat|cow|coyote), its value is returned. In the 8-bit library, the -value is always less than 256; in the 16-bit library the value can be up to +such as (cat|cow|coyote), its value is returned. 
In the 8-bit library, the +value is always less than 256; in the 16-bit library the value can be up to 0xffff. .P If there is no fixed first value, and if either @@ -1459,7 +1459,7 @@ fields (not necessarily in this order): const unsigned char *\fItables\fP; unsigned char **\fImark\fP; .sp -In the 16-bit version of this structure, the \fImark\fP field has type +In the 16-bit version of this structure, the \fImark\fP field has type "PCRE_UCHAR16 **". .P The \fIflags\fP field is a bitmap that specifies which of the other fields @@ -2092,14 +2092,14 @@ documentation for more details. .sp PCRE_ERROR_BADMODE (-28) .sp -This error is given if a pattern that was compiled by the 8-bit library is +This error is given if a pattern that was compiled by the 8-bit library is passed to a 16-bit library function, or vice versa. .sp PCRE_ERROR_BADENDIANNESS (-29) -.sp -This error is given if a pattern that was compiled and saved is reloaded on a -host with different endianness. The utility function -\fBpcre_pattern_to_host_byte_order()\fP can be used to convert such a pattern +.sp +This error is given if a pattern that was compiled and saved is reloaded on a +host with different endianness. The utility function +\fBpcre_pattern_to_host_byte_order()\fP can be used to convert such a pattern so that it runs on the new host. .P Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP. @@ -2109,7 +2109,7 @@ Error numbers -16 to -20 and -22 are not used by \fBpcre_exec()\fP. .SS "Reason codes for invalid UTF-8 strings" .rs .sp -This section applies only to the 8-bit library. The corresponding information +This section applies only to the 8-bit library. The corresponding information for the 16-bit library is given in the .\" HREF \fBpcre16\fP @@ -2417,14 +2417,14 @@ will yield PCRE_ERROR_NOMATCH. .rs .sp Matching certain patterns using \fBpcre_exec()\fP can use a lot of process -stack, which in certain environments can be rather limited in size. 
Some users -find it helpful to have an estimate of the amount of stack that is used by +stack, which in certain environments can be rather limited in size. Some users +find it helpful to have an estimate of the amount of stack that is used by \fBpcre_exec()\fP, to help them set recursion limits, as described in the .\" HREF \fBpcrestack\fP .\" -documentation. The estimate that is output by \fBpcretest\fP when called with -the \fB-m\fP and \fB-C\fP options is obtained by calling \fBpcre_exec\fP with +documentation. The estimate that is output by \fBpcretest\fP when called with +the \fB-m\fP and \fB-C\fP options is obtained by calling \fBpcre_exec\fP with the values NULL, NULL, NULL, -999, and -999 for its first five arguments. .P Normally, if its first argument is NULL, \fBpcre_exec()\fP immediately returns @@ -2432,10 +2432,10 @@ the negative error code PCRE_ERROR_NULL, but with this special combination of arguments, it returns instead a negative number whose absolute value is the approximate stack frame size in bytes. (A negative number is used so that it is clear that no match has happened.) The value is approximate because in some -cases, recursive calls to \fBpcre_exec()\fP occur when there are one or two +cases, recursive calls to \fBpcre_exec()\fP occur when there are one or two additional variables on the stack. .P -If PCRE has been compiled to use the heap instead of the stack for recursion, +If PCRE has been compiled to use the heap instead of the stack for recursion, the value returned is the size of each block that is obtained from the heap. . . diff --git a/doc/pcrebuild.3 b/doc/pcrebuild.3 index 11efdc2..aea25b4 100644 --- a/doc/pcrebuild.3 +++ b/doc/pcrebuild.3 @@ -35,11 +35,11 @@ exists as well, but as it specifies the default, it is not described. 
.SH "BUILDING 8-BIT and 16-BIT LIBRARIES" .rs .sp -By default, a library called \fBlibpcre\fP is built, containing functions that -take string arguments contained in vectors of bytes, either as single-byte +By default, a library called \fBlibpcre\fP is built, containing functions that +take string arguments contained in vectors of bytes, either as single-byte characters, or interpreted as UTF-8 strings. You can also build a separate -library, called \fBlibpcre16\fP, in which strings are contained in vectors of -16-bit data units and interpreted either as single-unit characters or UTF-16 +library, called \fBlibpcre16\fP, in which strings are contained in vectors of +16-bit data units and interpreted either as single-unit characters or UTF-16 strings, by adding .sp --enable-pcre16 @@ -70,7 +70,7 @@ to the \fBconfigure\fP command, as required. .sp By default, if the 8-bit library is being built, the \fBconfigure\fP script will search for a C++ compiler and C++ header files. If it finds them, it -automatically builds the C++ wrapper library (which supports only 8-bit +automatically builds the C++ wrapper library (which supports only 8-bit strings). You can disable this by adding .sp --disable-cpp @@ -96,7 +96,7 @@ configuration. (For backwards compatibility, --enable-utf8 is a synonym of .P Of itself, this setting does not make PCRE treat strings as UTF-8 or UTF-16. As well as compiling PCRE with this option, you also have have to set the -PCRE_UTF8 or PCRE_UTF16 option when you call one of the pattern compiling +PCRE_UTF8 or PCRE_UTF16 option when you call one of the pattern compiling functions. 
.P If you set --enable-utf when compiling in an EBCDIC environment, PCRE expects diff --git a/doc/pcrecallout.3 b/doc/pcrecallout.3 index 575eab9..7421d54 100644 --- a/doc/pcrecallout.3 +++ b/doc/pcrecallout.3 @@ -11,7 +11,7 @@ PCRE - Perl-compatible regular expressions PCRE provides a feature called "callout", which is a means of temporarily passing control to the caller of PCRE in the middle of pattern matching. The caller of PCRE provides an external function by putting its entry point in the -global variable \fIpcre_callout\fP (\fIpcre16_callout\fP for the 16-bit +global variable \fIpcre_callout\fP (\fIpcre16_callout\fP for the 16-bit library). By default, this variable contains NULL, which disables all calling out. .P @@ -85,7 +85,7 @@ These structures contains the following fields: int \fIcallout_number\fP; int *\fIoffset_vector\fP; const char *\fIsubject\fP; (8-bit version) - PCRE_SPTR16 \fIsubject\fP; (16-bit version) + PCRE_SPTR16 \fIsubject\fP; (16-bit version) int \fIsubject_length\fP; int \fIstart_match\fP; int \fIcurrent_position\fP; @@ -107,7 +107,7 @@ into the pattern (that is, the number after ?C for manual callouts, and 255 for automatically generated callouts). .P The \fIoffset_vector\fP field is a pointer to the vector of offsets that was -passed by the caller to the matching function. When \fBpcre_exec()\fP or +passed by the caller to the matching function. When \fBpcre_exec()\fP or \fBpcre16_exec()\fP is used, the contents can be inspected, in order to extract substrings that have been matched so far, in the same way as for extracting substrings after a match has completed. For the DFA matching functions, this diff --git a/doc/pcrecpp.3 b/doc/pcrecpp.3 index 146d222..772ce92 100644 --- a/doc/pcrecpp.3 +++ b/doc/pcrecpp.3 @@ -12,7 +12,7 @@ PCRE - Perl-compatible regular expressions. The C++ wrapper for PCRE was provided by Google Inc. Some additional functionality was added by Giuseppe Maxia. 
This brief man page was constructed from the notes in the \fIpcrecpp.h\fP file, which should be consulted for -further details. Note that the C++ wrapper supports only the original 8-bit +further details. Note that the C++ wrapper supports only the original 8-bit PCRE library. There is no 16-bit support at present. . . diff --git a/doc/pcrejit.3 b/doc/pcrejit.3 index 64764b1..0a32a11 100644 --- a/doc/pcrejit.3 +++ b/doc/pcrejit.3 @@ -21,10 +21,10 @@ this support was written by Zoltan Herczeg. .SH "8-BIT and 16-BIT SUPPORT" .rs .sp -JIT support is available for both the 8-bit and 16-bit PCRE libraries. To keep -this documentation simple, only the 8-bit interface is described in what -follows. If you are using the 16-bit library, substitute the 16-bit functions -and 16-bit structures (for example, \fIpcre16_jit_stack\fP instead of +JIT support is available for both the 8-bit and 16-bit PCRE libraries. To keep +this documentation simple, only the 8-bit interface is described in what +follows. If you are using the 16-bit library, substitute the 16-bit functions +and 16-bit structures (for example, \fIpcre16_jit_stack\fP instead of \fIpcre_jit_stack\fP). . . diff --git a/doc/pcrematching.3 b/doc/pcrematching.3 index 09b20c8..7ec2f5b 100644 --- a/doc/pcrematching.3 +++ b/doc/pcrematching.3 @@ -8,14 +8,14 @@ This document describes the two different algorithms that are available in PCRE for matching a compiled regular expression against a given subject string. The "standard" algorithm is the one provided by the \fBpcre_exec()\fP and \fBpcre16_exec()\fP functions. These work in the same way as Perl's matching -function, and provide a Perl-compatible matching operation. The just-in-time +function, and provide a Perl-compatible matching operation. The just-in-time (JIT) optimization that is described in the .\" HREF \fBpcrejit\fP .\" documentation is compatible with these functions. 
.P -An alternative algorithm is provided by the \fBpcre_dfa_exec()\fP and +An alternative algorithm is provided by the \fBpcre_dfa_exec()\fP and \fBpcre16_dfa_exec()\fP functions; they operate in a different way, and are not Perl-compatible. This alternative has advantages and disadvantages compared with the standard algorithm, and these are described below. diff --git a/doc/pcrepartial.3 b/doc/pcrepartial.3 index 1706a62..356fd61 100644 --- a/doc/pcrepartial.3 +++ b/doc/pcrepartial.3 @@ -25,7 +25,7 @@ entered. Partial matching can also be useful when the subject string is very long and is not all available at once. .P PCRE supports partial matching by means of the PCRE_PARTIAL_SOFT and -PCRE_PARTIAL_HARD options, which can be set when calling any of the matching +PCRE_PARTIAL_HARD options, which can be set when calling any of the matching functions. For backwards compatibility, PCRE_PARTIAL is a synonym for PCRE_PARTIAL_SOFT. The essential difference between the two options is whether or not a partial match is preferred to an alternative complete match, though @@ -46,7 +46,7 @@ strings. This optimization is also disabled for partial matching. .SH "PARTIAL MATCHING USING pcre_exec() OR pcre16_exec()" .rs .sp -A partial match occurs during a call to \fBpcre_exec()\fP or +A partial match occurs during a call to \fBpcre_exec()\fP or \fBpcre16_exec()\fP when the end of the subject string is reached successfully, but matching cannot continue because more characters are needed. However, at least one character in the subject must have been inspected. This character @@ -115,7 +115,7 @@ because it prefers an earlier partial match over a later complete match. 
For this reason, the assumption is made that the end of the supplied subject string may not be the true end of the available data, and so, if \ez, \eZ, \eb, \eB, or $ are encountered at the end of the subject, the result is -PCRE_ERROR_PARTIAL, provided that at least one character in the subject has +PCRE_ERROR_PARTIAL, provided that at least one character in the subject has been inspected. .P Setting PCRE_PARTIAL_HARD also affects the way UTF-8 and UTF-16 @@ -270,7 +270,7 @@ program to do that if it needs to. .P You can set the PCRE_PARTIAL_SOFT or PCRE_PARTIAL_HARD options with PCRE_DFA_RESTART to continue partial matching over multiple segments. This -facility can be used to pass very long subject strings to the DFA matching +facility can be used to pass very long subject strings to the DFA matching functions. . . diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3 index 49cfcd5..5ffadb7 100644 --- a/doc/pcrepattern.3 +++ b/doc/pcrepattern.3 @@ -25,11 +25,11 @@ there is now also support for UTF-8 strings in the original library, and a second library that supports 16-bit and UTF-16 character strings. To use these features, PCRE must be built to include appropriate support. When using UTF strings you must either call the compiling function with the PCRE_UTF8 or -PCRE_UTF16 option, or the pattern must start with one of these special +PCRE_UTF16 option, or the pattern must start with one of these special sequences: .sp (*UTF8) - (*UTF16) + (*UTF16) .sp Starting a pattern with such a sequence is equivalent to setting the relevant option. This feature is not Perl-compatible. 
How setting a UTF mode affects @@ -263,8 +263,8 @@ between \ex{ and }, but the character code is constrained as follows: 8-bit UTF-8 mode less than 0x10ffff and a valid codepoint 16-bit non-UTF mode less than 0x10000 16-bit UTF-16 mode less than 0x10ffff and a valid codepoint -.sp -Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-called +.sp +Invalid Unicode codepoints are the range 0xd800 to 0xdfff (the so-called "surrogate" codepoints). .P If characters other than hexadecimal digits appear between \ex{ and }, or if @@ -307,7 +307,7 @@ parenthesized subpatterns. Inside a character class, or if the decimal number is greater than 9 and there have not been that many capturing subpatterns, PCRE re-reads up to three octal digits following the backslash, and uses them to generate a data character. Any -subsequent digits stand for themselves. The value of the character is +subsequent digits stand for themselves. The value of the character is constrained in the same way as characters specified in hexadecimal. For example: .sp @@ -499,8 +499,8 @@ The vertical space characters are: U+2028 Line separator U+2029 Paragraph separator .sp -In 8-bit, non-UTF-8 mode, only the characters with codepoints less than 256 are -relevant. +In 8-bit, non-UTF-8 mode, only the characters with codepoints less than 256 are +relevant. . . .\" HTML <a name="newlineseq"></a> @@ -974,7 +974,7 @@ end of the subject in both modes, and if all branches of a pattern start with .sp Outside a character class, a dot in the pattern matches any one character in the subject string except (by default) a character that signifies the end of a -line. +line. .P When a line ending is defined as a single character, dot never matches that character; when the two-character sequence CRLF is used, dot does not match CR @@ -1104,7 +1104,7 @@ followed by two other characters. The octal or hexadecimal representation of "]" can also be used to end a range. 
.P Ranges operate in the collating sequence of character values. They can also be -used for characters specified numerically, for example [\e000-\e037]. Ranges +used for characters specified numerically, for example [\e000-\e037]. Ranges can include any characters that are valid for the current mode. .P If a range that includes letters is used when caseless matching is set, it @@ -1305,8 +1305,8 @@ match "cataract", "erpillar" or an empty string. .sp 2. It sets up the subpattern as a capturing subpattern. This means that, when the whole pattern matches, that portion of the subject string that matched the -subpattern is passed back to the caller via the \fIovector\fP argument of the -matching function. (This applies only to the traditional matching functions; +subpattern is passed back to the caller via the \fIovector\fP argument of the +matching function. (This applies only to the traditional matching functions; the DFA matching functions do not support capturing.) .P Opening parentheses are counted from left to right (starting from 1) to obtain @@ -2538,7 +2538,7 @@ same pair of parentheses when there is a repetition. .P PCRE provides a similar feature, but of course it cannot obey arbitrary Perl code. The feature is called "callout". The caller of PCRE provides an external -function by putting its entry point in the global variable \fIpcre_callout\fP +function by putting its entry point in the global variable \fIpcre_callout\fP (8-bit library) or \fIpcre16_callout\fP (16-bit library). By default, this variable contains NULL, which disables all calling out. .P diff --git a/doc/pcreposix.3 b/doc/pcreposix.3 index 567cd89..7f08a1d 100644 --- a/doc/pcreposix.3 +++ b/doc/pcreposix.3 @@ -30,7 +30,7 @@ expression 8-bit library. See the \fBpcreapi\fP .\" documentation for a description of PCRE's native API, which contains much -additional functionality. There is no POSIX-style wrapper for PCRE's 16-bit +additional functionality. 
There is no POSIX-style wrapper for PCRE's 16-bit library. .P The functions described here are just wrapper functions that ultimately call diff --git a/doc/pcreprecompile.3 b/doc/pcreprecompile.3 index 716527f..aafb723 100644 --- a/doc/pcreprecompile.3 +++ b/doc/pcreprecompile.3 @@ -111,7 +111,7 @@ in the documentation. .P If you did not provide custom character tables when the pattern was compiled, -the pointer in the compiled pattern is NULL, which causes the matching +the pointer in the compiled pattern is NULL, which causes the matching functions to use PCRE's internal tables. Thus, you do not need to take any special action at run time in this case. .P diff --git a/doc/pcrestack.3 b/doc/pcrestack.3 index 37a6fe4..12e5cbd 100644 --- a/doc/pcrestack.3 +++ b/doc/pcrestack.3 @@ -140,24 +140,24 @@ limits. .sp The actual amount of stack used per recursion can vary quite a lot, depending on the compiler that was used to build PCRE and the optimization or debugging -options that were set for it. The rule of thumb value of 500 bytes mentioned -above may be larger or smaller than what is actually needed. A better +options that were set for it. The rule of thumb value of 500 bytes mentioned +above may be larger or smaller than what is actually needed. A better approximation can be obtained by running this command: .sp pcretest -m -C .sp -The \fB-C\fP option causes \fBpcretest\fP to output information about the -options with which PCRE was compiled. When \fB-m\fP is also given (before +The \fB-C\fP option causes \fBpcretest\fP to output information about the +options with which PCRE was compiled. When \fB-m\fP is also given (before \fB-C\fP), information about stack use is given in a line like this: .sp Match recursion uses stack: approximate frame size = 640 bytes -.sp -The value is approximate because some recursions need a bit more (up to perhaps +.sp +The value is approximate because some recursions need a bit more (up to perhaps 16 more bytes). 
.P -If the above command is given when PCRE is compiled to use the heap instead of -the stack for recursion, the value that is output is the size of each block -that is obtained from the heap. +If the above command is given when PCRE is compiled to use the heap instead of +the stack for recursion, the value that is output is the size of each block +that is obtained from the heap. . . .SS "Changing stack size in Unix-like systems" diff --git a/doc/pcresyntax.3 b/doc/pcresyntax.3 index f722892..43ba1db 100644 --- a/doc/pcresyntax.3 +++ b/doc/pcresyntax.3 @@ -420,12 +420,12 @@ pattern is not anchored. .sp (*COMMIT) overall failure, no advance of starting point (*PRUNE) advance to next starting character - (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE) + (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE) (*SKIP) advance to current matching position (*SKIP:NAME) advance to position corresponding to an earlier - (*MARK:NAME); if not found, the (*SKIP) is ignored + (*MARK:NAME); if not found, the (*SKIP) is ignored (*THEN) local failure, backtrack to next alternation - (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN) + (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN) . . .SH "NEWLINE CONVENTIONS" diff --git a/doc/pcretest.1 b/doc/pcretest.1 index 1be87c1..beb9d17 100644 --- a/doc/pcretest.1 +++ b/doc/pcretest.1 @@ -31,17 +31,17 @@ options and exactly what is output. .SH "PCRE's 8-BIT and 16-BIT LIBRARIES" .rs .sp -From release 8.30, two separate PCRE libraries can be built. The original one -supports 8-bit character strings, whereas the newer 16-bit library supports -character strings encoded in 16-bit units. The \fBpcretest\fP program can be +From release 8.30, two separate PCRE libraries can be built. The original one +supports 8-bit character strings, whereas the newer 16-bit library supports +character strings encoded in 16-bit units. The \fBpcretest\fP program can be used to test both libraries. 
However, it is itself still an 8-bit program, reading 8-bit input and writing 8-bit output. When testing the 16-bit library, the patterns and data strings are converted to 16-bit format before being -passed to the PCRE library functions. Results are converted to 8-bit for +passed to the PCRE library functions. Results are converted to 8-bit for output. .P -References to functions and structures of the form \fBpcre[16]_xx\fP below -mean "\fBpcre_xx\fP when using the 8-bit library or \fBpcre16_xx\fP when using +References to functions and structures of the form \fBpcre[16]_xx\fP below +mean "\fBpcre_xx\fP when using the 8-bit library or \fBpcre16_xx\fP when using the 16-bit library". . . @@ -49,9 +49,9 @@ the 16-bit library". .rs .TP 10 \fB-16\fP -If both the 8-bit and the 16-bit libraries have been built, this option causes -the 16-bit library to be used. If only the 16-bit library has been built, this -is the default (so has no effect). If only the 8-bit library has been built, +If both the 8-bit and the 16-bit libraries have been built, this option causes +the 16-bit library to be used. If only the 16-bit library has been built, this +is the default (so has no effect). If only the 8-bit library has been built, this option causes an error. .TP 10 \fB-b\fP @@ -60,24 +60,24 @@ internal form is output after compilation. .TP 10 \fB-C\fP Output the version number of the PCRE library, and all available information -about the optional features that are included, and then exit. All other options +about the optional features that are included, and then exit. All other options are ignored. .TP 10 \fB-C\fP \fIoption\fP -Output information about a specific build-time option, then exit. This -functionality is intended for use in scripts such as \fBRunTest\fP. The +Output information about a specific build-time option, then exit. This +functionality is intended for use in scripts such as \fBRunTest\fP. 
The following options output the value indicated: .sp linksize the internal link size (2, 3, or 4) - newline the default newline setting: - CR, LF, CRLF, ANYCRLF, or ANY + newline the default newline setting: + CR, LF, CRLF, ANYCRLF, or ANY .sp The following options output 1 for true or zero for false: -.sp +.sp jit just-in-time support is available pcre16 the 16-bit library was built pcre8 the 8-bit library was built - ucp Unicode property support is available + ucp Unicode property support is available utf UTF-8 and/or UTF-16 support is available .TP 10 \fB-d\fP @@ -104,7 +104,7 @@ calling \fBpcre[16]_exec()\fP repeatedly with different limits. .TP 10 \fB-m\fP Output the size of each compiled pattern after it has been compiled. This is -equivalent to adding \fB/M\fP to each regular expression. The size is given in +equivalent to adding \fB/M\fP to each regular expression. The size is given in bytes for both libraries. .TP 10 \fB-o\fP \fIosize\fP @@ -137,7 +137,7 @@ result of studying is not included when studying is caused only by \fB-s\fP and neither \fB-i\fP nor \fB-d\fP is present on the command line. This behaviour means that the output from tests that are run with and without \fB-s\fP should be identical, except when options that output information about the actual -running of a match are set. +running of a match are set. .sp The \fB-M\fP, \fB-t\fP, and \fB-tm\fP options, which give information about resources used, are likely to produce different output with and without @@ -237,12 +237,12 @@ effect as they do in Perl. 
For example: The following table shows additional modifiers for setting PCRE compile-time options that do not correspond to anything in Perl: .sp - \fB/8\fP PCRE_UTF8 ) when using the 8-bit + \fB/8\fP PCRE_UTF8 ) when using the 8-bit \fB/?\fP PCRE_NO_UTF8_CHECK ) library -.sp +.sp \fB/8\fP PCRE_UTF16 ) when using the 16-bit \fB/?\fP PCRE_NO_UTF16_CHECK ) library -.sp +.sp \fB/A\fP PCRE_ANCHORED \fB/C\fP PCRE_AUTO_CALLOUT \fB/E\fP PCRE_DOLLAR_ENDONLY @@ -270,7 +270,7 @@ This example sets multiline matching with CRLF as the line ending sequence: .sp As well as turning on the PCRE_UTF8/16 option, the \fB/8\fP modifier causes all non-printing characters in output strings to be printed using the -\ex{hh...} notation. Otherwise, those less than 0x100 are output in hex without +\ex{hh...} notation. Otherwise, those less than 0x100 are output in hex without the curly brackets. .P Full details of the PCRE options are given in the @@ -663,7 +663,7 @@ substring is shown as "<unset>", as for the second data line. 2: b .sp If the strings contain any non-printing characters, they are output as \exhh -escapes if the value is less than 256 and UTF mode is not set. Otherwise they +escapes if the value is less than 256 and UTF mode is not set. Otherwise they are output as \ex{hh...} escapes. See below for the definition of non-printing characters. If the pattern has the \fB/+\fP modifier, the output for substring 0 is followed by the rest of the subject string, identified by "0+" like @@ -890,15 +890,15 @@ been loaded, \fBpcretest\fP proceeds to read data lines in the usual way. You can copy a file written by \fBpcretest\fP to a different host and reload it there, even if the new host has opposite endianness to the one on which the pattern was compiled. For example, you can compile on an i86 machine and run on -a SPARC machine. 
When a pattern is reloaded on a host with different endianness, the confirmation message is changed to: .sp Compiled pattern (byte-inverted) loaded from /some/file .sp -The test suite contains some saved pre-compiled patterns with different -endianness. These are reloaded using "<!" instead of just "<". This suppresses -the "(byte-inverted)" text so that the output is the same on all hosts. It also -forces debugging output once the pattern has been reloaded. +The test suite contains some saved pre-compiled patterns with different +endianness. These are reloaded using "<!" instead of just "<". This suppresses +the "(byte-inverted)" text so that the output is the same on all hosts. It also +forces debugging output once the pattern has been reloaded. .P File names for saving and reloading can be absolute or relative, but note that the shell facility of expanding a file name that starts with a tilde (~) is not diff --git a/doc/pcreunicode.3 b/doc/pcreunicode.3 index eab65b4..0f51d03 100644 --- a/doc/pcreunicode.3 +++ b/doc/pcreunicode.3 @@ -5,7 +5,7 @@ PCRE - Perl-compatible regular expressions .rs .sp From Release 8.30, in addition to its previous UTF-8 support, PCRE also -supports UTF-16 by means of a separate 16-bit library. This can be built as +supports UTF-16 by means of a separate 16-bit library. This can be built as well as, or instead of, the 8-bit library. . . @@ -77,7 +77,7 @@ releases of PCRE followed the rules of RFC 2279, which allows the full range of range U+0 to U+10FFFF, excluding U+D800 to U+DFFF. .P The excluded code points are the "Surrogate Area" of Unicode. They are reserved -for use by UTF-16, where they are used in pairs to encode codepoints with +for use by UTF-16, where they are used in pairs to encode codepoints with values greater than 0xFFFF. The code points that are encoded by UTF-16 pairs are available independently in the UTF-8 encoding. 
(In other words, the whole surrogate thing is a fudge for UTF-16 which unfortunately messes up UTF-8.) @@ -148,7 +148,7 @@ two-byte characters for values greater than \e177. 3. Repeat quantifiers apply to complete UTF characters, not to individual data units, for example: \ex{100}{3}. .P -4. The dot metacharacter matches one UTF character instead of a single data +4. The dot metacharacter matches one UTF character instead of a single data unit. .P 5. The escape sequence \eC can be used to match a single byte in UTF-8 mode, or @@ -166,7 +166,7 @@ be carried out by the normal interpretive function. .P 6. The character escapes \eb, \eB, \ed, \eD, \es, \eS, \ew, and \eW correctly test characters of any code value, but, by default, the characters that PCRE -recognizes as digits, spaces, or word characters remain the same set as in +recognizes as digits, spaces, or word characters remain the same set as in non-UTF mode, all with values less than 256. This remains true even when PCRE is built to include Unicode property support, because to do otherwise would slow down PCRE in many common cases. Note in particular that this applies to diff --git a/doc/perltest.txt b/doc/perltest.txt index 37e0012..bb1a52a 100644 --- a/doc/perltest.txt +++ b/doc/perltest.txt @@ -14,7 +14,7 @@ other pcretest modifiers that are either handled or ignored: /W ignored /S ignored /SS ignored - /Y ignored + /Y ignored The pcretest \Y escape in data lines is removed before matching. The data lines are processed as Perl double-quoted strings, so if they contain " $ or @ @@ -29,7 +29,7 @@ The perltest.pl script can also test UTF-8 features. It recognizes the special modifier /8 that pcretest uses to invoke UTF-8 functionality. The testinput4 and testinput6 files can be fed to perltest to run compatible UTF-8 tests. However, it is necessary to add "use utf8; require Encode" to the script to -make this work correctly. I have not managed to find a way to handle this +make this work correctly. 
I have not managed to find a way to handle this automatically. The other testinput files are not suitable for feeding to perltest.pl, since diff --git a/pcre-config.in b/pcre-config.in index f557f8c..595e5d1 100644 --- a/pcre-config.in +++ b/pcre-config.in @@ -10,7 +10,7 @@ if test @enable_cpp@ = yes ; then libs="[--libs-cpp]" else libs= -fi +fi if test @enable_pcre16@ = yes ; then libs="[--libs16] $libs" @@ -18,7 +18,7 @@ fi if test @enable_pcre8@ = yes ; then libs="[--libs] [--libs-posix] $libs" - cflags="$cflags [--cflags-posix]" + cflags="$cflags [--cflags-posix]" fi usage="Usage: pcre-config [--prefix] [--exec-prefix] [--version] $libs $cflags" diff --git a/pcre_compile.c b/pcre_compile.c index 5f95ac7..dbb5419 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -488,7 +488,7 @@ static const char error_texts[] = "\\N is not supported in a class\0" "too many forward references\0" "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0" - "invalid UTF-16 string\0" + "invalid UTF-16 string\0" ; /* Table to identify digits and hex digits. This is used when compiling @@ -998,9 +998,9 @@ else c -= CHAR_0; while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) c = c * 8 + *(++ptr) - CHAR_0; -#ifdef COMPILE_PCRE8 +#ifdef COMPILE_PCRE8 if (!utf && c > 0xff) *errorcodeptr = ERR51; -#endif +#endif break; /* \x is complicated. \x{ddd} is a character number which can be greater @@ -7709,11 +7709,11 @@ not used here. */ if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 && (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0) { -#ifdef COMPILE_PCRE8 +#ifdef COMPILE_PCRE8 errorcode = ERR44; -#else +#else errorcode = ERR74; -#endif +#endif goto PCRE_EARLY_ERROR_RETURN2; } #else diff --git a/pcre_exec.c b/pcre_exec.c index 9fdda7a..dcab5aa 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -487,13 +487,13 @@ int condcode; /* When recursion is not being used, all "local" variables that have to be preserved over calls to RMATCH() are part of a "frame". 
We set up the top-level frame on the stack here; subsequent instantiations are obtained from the heap -whenever RMATCH() does a "recursion". See the macro definitions above. Putting -the top-level on the stack rather than malloc-ing them all gives a performance +whenever RMATCH() does a "recursion". See the macro definitions above. Putting +the top-level on the stack rather than malloc-ing them all gives a performance boost in many cases where there is not much "recursion". */ #ifdef NO_RECURSE -heapframe frame_zero; -heapframe *frame = &frame_zero; +heapframe frame_zero; +heapframe *frame = &frame_zero; frame->Xprevframe = NULL; /* Marks the top level */ /* Copy in the original argument variables */ @@ -616,7 +616,7 @@ int stacksave[REC_STACK_SAVE_MAX]; eptrblock newptrb; -/* There is a special fudge for calling match() in a way that causes it to +/* There is a special fudge for calling match() in a way that causes it to measure the size of its basic stack frame when the stack is being used for recursion. The second argument (ecode) being NULL triggers this behaviour. It cannot normally ever be NULL. The return is the negated value of the frame @@ -631,7 +631,7 @@ if (ecode == NULL) int len = (char *)&rdepth - (char *)eptr; return (len > 0)? 
-len : len; } - } + } #endif /* NO_RECURSE */ /* To save space on the stack and in the heap frame, I have doubled up on some @@ -838,7 +838,7 @@ for (;;) case OP_ONCE_NC: prev = ecode; saved_eptr = eptr; - save_mark = md->mark; + save_mark = md->mark; do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64); @@ -857,7 +857,7 @@ for (;;) if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += GET(ecode,1); - md->mark = save_mark; + md->mark = save_mark; } while (*ecode == OP_ALT); @@ -937,7 +937,7 @@ for (;;) save_offset2 = md->offset_vector[offset+1]; save_offset3 = md->offset_vector[md->offset_end - number]; save_capture_last = md->capture_last; - save_mark = md->mark; + save_mark = md->mark; DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); md->offset_vector[md->offset_end - number] = @@ -1043,7 +1043,7 @@ for (;;) save_mark = md->mark; RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb, RM2); - + /* See comment in the code for capturing groups above about handling THEN. 
*/ @@ -1070,7 +1070,7 @@ for (;;) RRETURN(rrc); } ecode += GET(ecode, 1); - md->mark = save_mark; + md->mark = save_mark; if (*ecode != OP_ALT) break; } @@ -1549,7 +1549,7 @@ for (;;) case OP_ASSERT: case OP_ASSERTBACK: - save_mark = md->mark; + save_mark = md->mark; if (md->match_function_type == MATCH_CONDASSERT) { condassert = TRUE; @@ -1571,7 +1571,7 @@ for (;;) if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode, 1); - md->mark = save_mark; + md->mark = save_mark; } while (*ecode == OP_ALT); @@ -1595,7 +1595,7 @@ for (;;) case OP_ASSERT_NOT: case OP_ASSERTBACK_NOT: - save_mark = md->mark; + save_mark = md->mark; if (md->match_function_type == MATCH_CONDASSERT) { condassert = TRUE; @@ -1606,7 +1606,7 @@ for (;;) do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5); - md->mark = save_mark; + md->mark = save_mark; if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH); if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT) { @@ -6207,21 +6207,21 @@ PCRE_PUCHAR req_char_ptr = start_match - 1; const pcre_study_data *study; const REAL_PCRE *re = (const REAL_PCRE *)argument_re; -/* Check for the special magic call that measures the size of the stack used +/* Check for the special magic call that measures the size of the stack used per recursive call of match(). 
*/ if (re == NULL && extra_data == NULL && subject == NULL && length == -999 && start_offset == -999) #ifdef NO_RECURSE return -sizeof(heapframe); -#else +#else return match(NULL, NULL, NULL, 0, NULL, NULL, 0); -#endif +#endif /* Plausibility checks */ if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; -if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) +if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; diff --git a/pcreposix.c b/pcreposix.c index 06cdd01..808c9da 100644 --- a/pcreposix.c +++ b/pcreposix.c @@ -158,7 +158,7 @@ static const int eint[] = { REG_BADPAT, /* \N is not supported in a class */ REG_BADPAT, /* too many forward references */ REG_BADPAT, /* disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) */ - REG_BADPAT /* invalid UTF-16 string (should not occur) */ + REG_BADPAT /* invalid UTF-16 string (should not occur) */ }; /* Table of texts corresponding to POSIX error codes */ @@ -2412,9 +2412,9 @@ are set, either both UTFs are supported or both are not supported. */ if (rc) { const char *arch; - (void)PCRE_CONFIG(PCRE_CONFIG_JITTARGET, &arch); + (void)PCRE_CONFIG(PCRE_CONFIG_JITTARGET, &arch); printf(" Just-in-time compiler support: %s\n", arch); - } + } else printf(" No just-in-time compiler support\n"); (void)PCRE_CONFIG(PCRE_CONFIG_NEWLINE, &rc); @@ -2438,11 +2438,11 @@ are set, either both UTFs are supported or both are not supported. */ (void)PCRE_CONFIG(PCRE_CONFIG_STACKRECURSE, &rc); printf(" Match recursion uses %s", rc? "stack" : "heap"); if (showstore) - { + { PCRE_EXEC(stack_size, NULL, NULL, NULL, -999, -999, 0, NULL, 0); - printf(": %sframe size = %d bytes", rc? "approximate " : "", -stack_size); + printf(": %sframe size = %d bytes", rc? 
"approximate " : "", -stack_size); } - printf("\n"); + printf("\n"); goto EXIT; } else if (strcmp(argv[op], "-help") == 0 || @@ -3385,10 +3385,10 @@ while (!done) cn16ptr = copynames; gn16ptr = getnames; #endif -#ifdef SUPPORT_PCRE8 +#ifdef SUPPORT_PCRE8 cn8ptr = copynames8; gn8ptr = getnames8; -#endif +#endif SET_PCRE_CALLOUT(callout); first_callout = 1; @@ -3483,9 +3483,9 @@ while (!done) { if (++i == 9) fprintf(outfile, "** Too many hex digits in \\x{...} item; " - "using only the first eight.\n"); + "using only the first eight.\n"); else c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'a' - 10); - } + } if (*pt == '}') { p = pt + 1; diff --git a/perltest.pl b/perltest.pl index d44e6c5..ca32cd7 100755 --- a/perltest.pl +++ b/perltest.pl @@ -23,7 +23,7 @@ if ($utf8) foreach $c (@p) { if ($c >= 32 && $c < 127) { $t .= chr $c; } - else { $t .= sprintf("\\x{%02x}", $c); + else { $t .= sprintf("\\x{%02x}", $c); } } } @@ -216,16 +216,16 @@ for (;;) } splice(@subs, 0, 18); } - + # It seems that $REGMARK is not marked as UTF-8 even when use utf8 is # set and the input pattern was a UTF-8 string. We can, however, force - # it to be so marked. - + # it to be so marked. + if (defined $REGMARK && $REGMARK != 1) { - $xx = $REGMARK; - $xx = Encode::decode_utf8($xx) if $utf8; - printf $outfile ("MK: %s\n", &pchars($xx)); + $xx = $REGMARK; + $xx = Encode::decode_utf8($xx) if $utf8; + printf $outfile ("MK: %s\n", &pchars($xx)); } } } |