summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authornigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15>2007-02-24 21:39:21 +0000
committernigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15>2007-02-24 21:39:21 +0000
commit09f9da9675b33a31c605d9d1f913bc2b05522be2 (patch)
treee4e2b0bbc47b23f497e3f1b2208a9ac9a9d4ebea
parent1622a3e7058dec7de74889c69595693ac0c64187 (diff)
downloadpcre-09f9da9675b33a31c605d9d1f913bc2b05522be2.tar.gz
Load pcre-3.0 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@43 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r--COPYING32
-rw-r--r--ChangeLog28
-rw-r--r--LICENCE2
-rw-r--r--Makefile.in26
-rw-r--r--NEWS13
-rw-r--r--NON-UNIX-USE20
-rw-r--r--README60
-rw-r--r--config.in30
-rwxr-xr-xconfigure130
-rw-r--r--configure.in55
-rw-r--r--dftables.c12
-rw-r--r--doc/Tech.Notes41
-rw-r--r--doc/pcre.3272
-rw-r--r--doc/pcre.html398
-rw-r--r--doc/pcre.txt346
-rw-r--r--doc/pcreposix.310
-rw-r--r--doc/pcreposix.html10
-rw-r--r--doc/pcreposix.txt14
-rw-r--r--doc/pcretest.txt19
-rw-r--r--internal.h33
-rw-r--r--maketables.c29
-rw-r--r--pcre-config.in59
-rw-r--r--pcre.c322
-rw-r--r--pcre.in (renamed from pcre.h)19
-rw-r--r--pcreposix.c12
-rw-r--r--pcreposix.h8
-rw-r--r--pcretest.c142
-rw-r--r--pgrep.c2
-rw-r--r--study.c10
-rw-r--r--testdata/testinput2115
-rw-r--r--testdata/testoutput12
-rw-r--r--testdata/testoutput2988
-rw-r--r--testdata/testoutput32
-rw-r--r--testdata/testoutput410
34 files changed, 2588 insertions, 683 deletions
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..f305033
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,32 @@
+PCRE LICENCE
+------------
+
+PCRE is a library of functions to support regular expressions whose syntax
+and semantics are as close as possible to those of the Perl 5 language.
+
+Written by: Philip Hazel <ph10@cam.ac.uk>
+
+University of Cambridge Computing Service,
+Cambridge, England. Phone: +44 1223 334714.
+
+Copyright (c) 1997-2000 University of Cambridge
+
+Permission is granted to anyone to use this software for any purpose on any
+computer system, and to redistribute it freely, subject to the following
+restrictions:
+
+1. This software is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+2. The origin of this software must not be misrepresented, either by
+ explicit claim or by omission.
+
+3. Altered versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+
+4. If PCRE is embedded in any software that is released under the GNU
+ General Public Licence (GPL), then the terms of that licence shall
+ supersede any condition above with which it is incompatible.
+
+End
diff --git a/ChangeLog b/ChangeLog
index cd02638..6da0bbd 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,8 +2,8 @@ ChangeLog for PCRE
------------------
-Version 2.09 14-Sep-99
-----------------------
+Version 3.0 01-Feb-00
+---------------------
1. Add support for the /+ modifier to perltest (to output $` like it does in
pcretest).
@@ -23,6 +23,30 @@ captured string vector to pcre_exec(), but (since release 2.00) PCRE has
required a bigger vector, with some working space on the end. This means that
the POSIX wrapper now has to get and free some memory, and copy the results.
+6. Added some simple autoconf support, placing the test data and the
+documentation in separate directories, re-organizing some of the
+information files, and making it build pcre-config (a GNU standard). Also added
+libtool support for building PCRE as a shared library, which is now the
+default.
+
+7. Got rid of the leading zero in the definition of PCRE_MINOR because 08 and
+09 are not valid octal constants. Single digits will be used for minor values
+less than 10.
+
+8. Defined REG_EXTENDED and REG_NOSUB as zero in the POSIX header, so that
+existing programs that set these in the POSIX interface can use PCRE without
+modification.
+
+9. Added a new function, pcre_fullinfo() with an extensible interface. It can
+return all that pcre_info() returns, plus additional data. The pcre_info()
+function is retained for compatibility, but is considered to be obsolete.
+
+10. Added experimental recursion feature (?R) to handle one common case that
+Perl 5.6 will be able to do with (?p{...}).
+
+11. Added support for POSIX character classes like [:alpha:], which Perl is
+adopting.
+
Version 2.08 31-Aug-99
----------------------
diff --git a/LICENCE b/LICENCE
index 246515a..f305033 100644
--- a/LICENCE
+++ b/LICENCE
@@ -9,7 +9,7 @@ Written by: Philip Hazel <ph10@cam.ac.uk>
University of Cambridge Computing Service,
Cambridge, England. Phone: +44 1223 334714.
-Copyright (c) 1997-1999 University of Cambridge
+Copyright (c) 1997-2000 University of Cambridge
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
diff --git a/Makefile.in b/Makefile.in
index ff677c6..958332c 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -27,10 +27,10 @@
prefix = @prefix@
exec_prefix = @exec_prefix@
-BINDIR = $(exec_prefix)/bin
-LIBDIR = $(exec_prefix)/lib
-INCDIR = $(prefix)/include
-MANDIR = $(prefix)/man
+BINDIR = @bindir@
+LIBDIR = @libdir@
+INCDIR = @includedir@
+MANDIR = @mandir@
CC = @CC@
CFLAGS = @CFLAGS@
@@ -45,6 +45,11 @@ RANLIB = @RANLIB@
LIBTOOL = @LIBTOOL@
LIBSUFFIX = @LIBSUFFIX@
+# These are the version numbers for the shared libraries
+
+PCRELIBVERSION = @PCRE_LIB_VERSION@
+PCREPOSIXLIBVERSION = @PCRE_POSIXLIB_VERSION@
+
#---------------------------------------------------------------------------#
# A copy of install-sh is in this distribution and is used by default. #
@@ -105,7 +110,7 @@ libpcre.la: $(OBJ)
@echo '--- Building shared library: libpcre'
@echo ' '
-rm -f libpcre.la
- libtool $(CC) -o libpcre.la -rpath $(LIBDIR) $(LOBJ)
+ libtool $(CC) -version-info '$(PCRELIBVERSION)' -o libpcre.la -rpath $(LIBDIR) $(LOBJ)
libpcreposix.a: pcreposix.o
@echo ' '
@@ -120,7 +125,7 @@ libpcreposix.la: pcreposix.o
@echo '--- Building shared library: libpcreposix'
@echo ' '
-rm -f libpcreposix.la
- libtool $(CC) -o libpcreposix.la -rpath $(LIBDIR) pcreposix.lo
+ libtool $(CC) -version-info '$(PCREPOSIXLIBVERSION)' -o libpcreposix.la -rpath $(LIBDIR) pcreposix.lo
pcre.o: chartables.c pcre.c pcre.h internal.h config.h Makefile
$(LIBTOOL) $(CC) -c $(CFLAGS) pcre.c
@@ -159,11 +164,16 @@ install: all
$(INSTALL_DATA) doc/pcreposix.3 $(MANDIR)/man3/pcre.3
$(INSTALL_DATA) doc/pgrep.1 $(MANDIR)/man1/pgrep.1
@if test "$(LIBTOOL)" = "libtool"; then \
+ echo ' '; \
echo '--- Rebuilding pgrep to use installed shared library ---'; \
echo $(CC) $(CFLAGS) -o pgrep pgrep.o -L$(LIBDIR) -lpcre; \
$(CC) $(CFLAGS) -o pgrep pgrep.o -L$(LIBDIR) -lpcre; \
+ echo '--- Rebuilding pcretest to use installed shared library ---'; \
+ echo $(CC) $(CFLAGS) -o pcretest pcretest.o -L$(LIBDIR) -lpcre -lpcreposix; \
+ $(CC) $(CFLAGS) -o pcretest pcretest.o -L$(LIBDIR) -lpcre -lpcreposix; \
fi
- $(INSTALL) pgrep $(BINDIR)/pgrep \
+ $(INSTALL) pgrep $(BINDIR)/pgrep
+ $(INSTALL) pcre-config $(BINDIR)/pcre-config
# We deliberately omit dftables and chartables.c from 'make clean'; once made
# chartables.c shouldn't change, and if people have edited the tables by hand,
@@ -174,7 +184,7 @@ clean:; -rm -rf *.o *.lo *.a *.la .libs pcretest pgrep testtry
# But "make distclean" should get back to a virgin distribution
distclean: clean
- -rm -f chartables.c libtool \
+ -rm -f chartables.c libtool pcre-config pcre.h \
Makefile config.h config.status config.log config.cache
check: runtest
diff --git a/NEWS b/NEWS
index 4c0c62e..4c80bd6 100644
--- a/NEWS
+++ b/NEWS
@@ -1,8 +1,17 @@
News about PCRE releases
------------------------
-A "configure" script is now used to configure PCRE for Unix systems. It builds
-a Makefile and a config.h file.
+Release 3.0 01-Feb-00
+---------------------
+
+1. A "configure" script is now used to configure PCRE for Unix systems. It
+builds a Makefile, a config.h file, and the pcre-config script.
+
+2. PCRE is built as a shared library by default.
+
+3. There is support for POSIX classes such as [:alpha:].
+
+4. There is an experimental recursion feature.
----------------------------------------------------------------------------
IMPORTANT FOR THOSE UPGRADING FROM VERSIONS BEFORE 2.00
diff --git a/NON-UNIX-USE b/NON-UNIX-USE
index fee5db1..09a7432 100644
--- a/NON-UNIX-USE
+++ b/NON-UNIX-USE
@@ -6,24 +6,30 @@ entirely of code written in Standard C, and so should compile successfully
on any machine with a Standard C compiler and library, using normal compiling
commands to do the following:
-(1) Copy or rename the file config.in as config.h. You should not have to
-change any settings inside it for a Standard C environment.
+(1) Copy or rename the file config.in as config.h, and change the macros that
+define HAVE_STRERROR and HAVE_MEMMOVE to define them as 1 rather than 0.
+Unfortunately, because of the way Unix autoconf works, the default setting has
+to be 0.
-(2) Compile dftables.c as a stand-alone program, and then run it with
+(2) Copy or rename the file pcre.in as pcre.h, and change the macro definitions
+for PCRE_MAJOR, PCRE_MINOR, and PCRE_DATE near its start to the values set in
+configure.in.
+
+(3) Compile dftables.c as a stand-alone program, and then run it with
the standard output sent to chartables.c. This generates a set of standard
character tables.
-(3) Compile maketables.c, get.c, study.c and pcre.c and link them all
+(4) Compile maketables.c, get.c, study.c and pcre.c and link them all
together into an object library in whichever form your system keeps such
libraries. This is the pcre library (chartables.c gets included by means of an
#include directive).
-(4) Similarly, compile pcreposix.c and link it as the pcreposix library.
+(5) Similarly, compile pcreposix.c and link it as the pcreposix library.
-(5) Compile the test program pcretest.c. This needs the functions in the
+(6) Compile the test program pcretest.c. This needs the functions in the
pcre and pcreposix libraries when linking.
-(6) Run pcretest on the testinput files in the testdata directory, and check
+(7) Run pcretest on the testinput files in the testdata directory, and check
that the output matches the corresponding testoutput files. You must use the
-i option when checking testinput2.
diff --git a/README b/README
index aa49877..90aaf4d 100644
--- a/README
+++ b/README
@@ -1,6 +1,10 @@
README file for PCRE (Perl-compatible regular expression library)
-----------------------------------------------------------------
+The latest release of PCRE is always available from
+
+ ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-xxx.tar.gz
+
Please read the NEWS file if you are upgrading from a previous release.
@@ -17,34 +21,51 @@ CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local
specifies that the C compiler should be run with the flags '-O2 -Wall' instead
of the default, and that "make install" should install PCRE under /opt/local
-instead of the default /usr/local. The "configure" script builds two files:
+instead of the default /usr/local. The "configure" script builds three files:
-. Makefile is built by copying Makefile.in and making certain substitutions.
-. config.h is built by copying config.in and making certain substitutions.
+. Makefile is built by copying Makefile.in and making substitutions.
+. config.h is built by copying config.in and making substitutions.
+. pcre-config is built by copying pcre-config.in and making substitutions.
Once "configure" has run, you can run "make". It builds two libraries called
-libpcre.a and libpcreposix.a, a test program called pcretest, and the pgrep
+libpcre and libpcreposix, a test program called pcretest, and the pgrep
command. You can use "make install" to copy these, and the public header file
pcre.h, to appropriate live directories on your system, in the normal way.
+Running "make install" also installs the command pcre-config, which can be used
+to recall information about the PCRE configuration and installation. For
+example,
+
+ pcre-config --version
+
+prints the version number, and
+
+ pcre-config --libs
+
+outputs information about where the library is installed. This command can be
+included in makefiles for programs that use PCRE, saving the programmer from
+having to remember too many details.
+
Shared libraries on Unix systems
--------------------------------
-The default distribution builds static libraries. It is also possible to build
-PCRE as two shared libraries. This support is new and experimental and may not
-work on all systems. It relies on the "libtool" scripts - these are distributed
-with PCRE. To build PCRE using shared libraries you must use --enable-shared
-when configuring it. For example
+The default distribution builds PCRE as two shared libraries. This support is
+new and experimental and may not work on all systems. It relies on the
+"libtool" scripts - these are distributed with PCRE. It should build a
+"libtool" script and use this to compile and link shared libraries, which are
+placed in a subdirectory called .libs. The programs pcretest and pgrep are
+built to use these uninstalled libraries by means of wrapper scripts. When you
+use "make install" to install shared libraries, pgrep and pcretest are
+automatically re-built to use the newly installed libraries. However, only
+pgrep is installed, as pcretest is really just a test program.
+
+To build PCRE using static libraries you must use --disable-shared when
+configuring it. For example
-./configure --prefix=/usr/gnu --enable-shared
+./configure --prefix=/usr/gnu --disable-shared
-Then run "make" in the usual way. It should build a "libtool" script and use
-this to compile and link shared libraries, which are placed in a subdirectory
-called .libs. The programs pcretest and pgrep are built to use these
-uninstalled libraries by means of wrapper scripts. When you use "make install"
-to install shared libraries, pgrep is automatically re-built to use the newly
-installed library before it itself is installed.
+Then run "make" in the usual way.
Building on non-Unix systems
@@ -159,7 +180,8 @@ The distribution should contain the following files:
study.c ) source of
pcre.c ) the functions
pcreposix.c )
- pcre.h header for the external API
+ pcre.in "source" for the header for the external API; pcre.h
+ is built from this by "configure"
pcreposix.h header for the external POSIX wrapper API
internal.h header for internal use
config.in template for config.h, which is built by configure
@@ -170,6 +192,7 @@ The distribution should contain the following files:
ChangeLog log of changes to the code
INSTALL generic installation instructions
LICENCE conditions for the use of PCRE
+ COPYING the same, using GNU's standard name
Makefile.in template for Unix Makefile, which is built by configure
NEWS important changes in this release
NON-UNIX-USE notes on building PCRE on non-Unix systems
@@ -197,6 +220,7 @@ The distribution should contain the following files:
pcretest.c test program
perltest Perl test program
pgrep.c source of a grep utility that uses PCRE
+ pcre-config.in source of script which retains PCRE information
testdata/testinput1 test data, compatible with Perl 5.004 and 5.005
testdata/testinput2 test data for error messages and non-Perl things
testdata/testinput3 test data, compatible with Perl 5.005
@@ -212,4 +236,4 @@ The distribution should contain the following files:
pcre.def
Philip Hazel <ph10@cam.ac.uk>
-January 2000
+February 2000
diff --git a/config.in b/config.in
index c0f4537..7631d46 100644
--- a/config.in
+++ b/config.in
@@ -1,20 +1,28 @@
-/* config.in is converted by configure into config.h. PCRE is written in
-Standard C, but there are a few non-standard things it can cope with, allowing
-it to run on SunOS4 and other "close to standard" systems. The defaults below
-are the correct ones on a Standard C system. On a non-Unix system you can just
-copy this file into config.h. */
+/* On Unix systems config.in is converted by configure into config.h. PCRE is
+written in Standard C, but there are a few non-standard things it can cope
+with, allowing it to run on SunOS4 and other "close to standard" systems.
+
+On a non-Unix system you should just copy this file into config.h and change
+the definitions of HAVE_STRERROR and HAVE_MEMMOVE to 1. Unfortunately, because
+of the way autoconf works, these cannot be made the defaults. */
+
+/* Define to empty if the keyword does not work. */
-/* Define to empty if the keyword does not work. */
#undef const
-/* Define to `unsigned' if <stddef.h> doesn't define size_t. */
+/* Define to `unsigned' if <stddef.h> doesn't define size_t. */
+
#undef size_t
-/* Undefine if you don't have the strerror function. */
-#define HAVE_STRERROR
+/* The following two definitions are mainly for the benefit of SunOS4, which
+doesn't have the strerror() or memmove() functions that should be present in
+all Standard C libraries. The macros should normally be defined with the value
+1 for other systems, but unfortunately we can't make this the default because
+"configure" files generated by autoconf will only change 0 to 1; they won't
+change 1 to 0 if the functions are not found. */
-/* Undefine if you don't have the memmove function. */
-#define HAVE_MEMMOVE
+#define HAVE_STRERROR 0
+#define HAVE_MEMMOVE 0
/* End */
diff --git a/configure b/configure
index 91bba35..738230a 100755
--- a/configure
+++ b/configure
@@ -12,7 +12,7 @@ ac_help=
ac_default_prefix=/usr/local
# Any additions from configure.in:
ac_help="$ac_help
- --enable-shared build PCRE as a shared library (using libtool)"
+ --disable-shared build PCRE as a static library"
# Initialize some variables set by options.
# The variables have the same names as the options, with
@@ -138,9 +138,6 @@ do
# The list generated by autoconf has been trimmed to remove many
# options that are totally irrelevant to PCRE (e.g. relating to X),
# or are not supported by its Makefile.
- # The list generated by autoconf has been trimmed to remove many
- # options that are totally irrelevant to PCRE (e.g. relating to X),
- # or are not supported by its Makefile.
# This message is too long to be a string in the A/UX 3.1 sh.
cat << EOF
Usage: ./configure [options]
@@ -504,10 +501,23 @@ fi
+
+
+
+PCRE_MAJOR=3
+PCRE_MINOR=0
+PCRE_DATE=01-Feb-2000
+PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR}
+
+
+PCRE_LIB_VERSION=0:0:0
+PCRE_POSIXLIB_VERSION=0:0:0
+
+
# Extract the first word of "gcc", so it can be a program name with args.
set dummy gcc; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:531: checking for $ac_word" >&5
+echo "configure:544: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
@@ -537,7 +547,7 @@ if test -z "$CC"; then
# Extract the first word of "cc", so it can be a program name with args.
set dummy cc; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:561: checking for $ac_word" >&5
+echo "configure:574: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
@@ -588,7 +598,7 @@ fi
# Extract the first word of "cl", so it can be a program name with args.
set dummy cl; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:612: checking for $ac_word" >&5
+echo "configure:625: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
@@ -620,7 +630,7 @@ fi
fi
echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6
-echo "configure:644: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5
+echo "configure:657: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5
ac_ext=c
# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
@@ -631,12 +641,12 @@ cross_compiling=$ac_cv_prog_cc_cross
cat > conftest.$ac_ext << EOF
-#line 655 "configure"
+#line 668 "configure"
#include "confdefs.h"
main(){return(0);}
EOF
-if { (eval echo configure:660: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:673: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
ac_cv_prog_cc_works=yes
# If we can't run a trivial program, we are probably using a cross compiler.
if (./conftest; exit) 2>/dev/null; then
@@ -662,12 +672,12 @@ if test $ac_cv_prog_cc_works = no; then
{ echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; }
fi
echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6
-echo "configure:686: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5
+echo "configure:699: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5
echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6
cross_compiling=$ac_cv_prog_cc_cross
echo $ac_n "checking whether we are using GNU C""... $ac_c" 1>&6
-echo "configure:691: checking whether we are using GNU C" >&5
+echo "configure:704: checking whether we are using GNU C" >&5
if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
@@ -676,7 +686,7 @@ else
yes;
#endif
EOF
-if { ac_try='${CC-cc} -E conftest.c'; { (eval echo configure:700: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
+if { ac_try='${CC-cc} -E conftest.c'; { (eval echo configure:713: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
ac_cv_prog_gcc=yes
else
ac_cv_prog_gcc=no
@@ -695,7 +705,7 @@ ac_test_CFLAGS="${CFLAGS+set}"
ac_save_CFLAGS="$CFLAGS"
CFLAGS=
echo $ac_n "checking whether ${CC-cc} accepts -g""... $ac_c" 1>&6
-echo "configure:719: checking whether ${CC-cc} accepts -g" >&5
+echo "configure:732: checking whether ${CC-cc} accepts -g" >&5
if eval "test \"`echo '$''{'ac_cv_prog_cc_g'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
@@ -729,7 +739,7 @@ fi
# Extract the first word of "ranlib", so it can be a program name with args.
set dummy ranlib; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
-echo "configure:753: checking for $ac_word" >&5
+echo "configure:766: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
@@ -757,8 +767,9 @@ else
fi
+
echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6
-echo "configure:782: checking how to run the C preprocessor" >&5
+echo "configure:796: checking how to run the C preprocessor" >&5
# On Suns, sometimes $CPP names a directory.
if test -n "$CPP" && test -d "$CPP"; then
CPP=
@@ -773,13 +784,13 @@ else
# On the NeXT, cc -E runs the code through the compiler's parser,
# not just through cpp.
cat > conftest.$ac_ext <<EOF
-#line 797 "configure"
+#line 811 "configure"
#include "confdefs.h"
#include <assert.h>
Syntax Error
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:803: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:817: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
:
@@ -790,13 +801,13 @@ else
rm -rf conftest*
CPP="${CC-cc} -E -traditional-cpp"
cat > conftest.$ac_ext <<EOF
-#line 814 "configure"
+#line 828 "configure"
#include "confdefs.h"
#include <assert.h>
Syntax Error
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:820: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:834: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
:
@@ -807,13 +818,13 @@ else
rm -rf conftest*
CPP="${CC-cc} -nologo -E"
cat > conftest.$ac_ext <<EOF
-#line 831 "configure"
+#line 845 "configure"
#include "confdefs.h"
#include <assert.h>
Syntax Error
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:837: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:851: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
:
@@ -838,12 +849,12 @@ fi
echo "$ac_t""$CPP" 1>&6
echo $ac_n "checking for ANSI C header files""... $ac_c" 1>&6
-echo "configure:862: checking for ANSI C header files" >&5
+echo "configure:876: checking for ANSI C header files" >&5
if eval "test \"`echo '$''{'ac_cv_header_stdc'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 867 "configure"
+#line 881 "configure"
#include "confdefs.h"
#include <stdlib.h>
#include <stdarg.h>
@@ -851,7 +862,7 @@ else
#include <float.h>
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:875: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:889: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
@@ -868,7 +879,7 @@ rm -f conftest*
if test $ac_cv_header_stdc = yes; then
# SunOS 4.x string.h does not declare mem*, contrary to ANSI.
cat > conftest.$ac_ext <<EOF
-#line 892 "configure"
+#line 906 "configure"
#include "confdefs.h"
#include <string.h>
EOF
@@ -886,7 +897,7 @@ fi
if test $ac_cv_header_stdc = yes; then
# ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
cat > conftest.$ac_ext <<EOF
-#line 910 "configure"
+#line 924 "configure"
#include "confdefs.h"
#include <stdlib.h>
EOF
@@ -907,7 +918,7 @@ if test "$cross_compiling" = yes; then
:
else
cat > conftest.$ac_ext <<EOF
-#line 931 "configure"
+#line 945 "configure"
#include "confdefs.h"
#include <ctype.h>
#define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
@@ -918,7 +929,7 @@ if (XOR (islower (i), ISLOWER (i)) || toupper (i) != TOUPPER (i)) exit(2);
exit (0); }
EOF
-if { (eval echo configure:942: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
+if { (eval echo configure:956: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
then
:
else
@@ -945,17 +956,17 @@ for ac_hdr in limits.h
do
ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
-echo "configure:969: checking for $ac_hdr" >&5
+echo "configure:983: checking for $ac_hdr" >&5
if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 974 "configure"
+#line 988 "configure"
#include "confdefs.h"
#include <$ac_hdr>
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
-{ (eval echo configure:979: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+{ (eval echo configure:993: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
@@ -982,13 +993,14 @@ fi
done
+
echo $ac_n "checking for working const""... $ac_c" 1>&6
-echo "configure:1007: checking for working const" >&5
+echo "configure:1022: checking for working const" >&5
if eval "test \"`echo '$''{'ac_cv_c_const'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 1012 "configure"
+#line 1027 "configure"
#include "confdefs.h"
int main() {
@@ -1037,7 +1049,7 @@ ccp = (char const *const *) p;
; return 0; }
EOF
-if { (eval echo configure:1061: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+if { (eval echo configure:1076: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
ac_cv_c_const=yes
else
@@ -1058,12 +1070,12 @@ EOF
fi
echo $ac_n "checking for size_t""... $ac_c" 1>&6
-echo "configure:1082: checking for size_t" >&5
+echo "configure:1097: checking for size_t" >&5
if eval "test \"`echo '$''{'ac_cv_type_size_t'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 1087 "configure"
+#line 1102 "configure"
#include "confdefs.h"
#include <sys/types.h>
#if STDC_HEADERS
@@ -1091,15 +1103,16 @@ EOF
fi
+
for ac_func in memmove strerror
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
-echo "configure:1118: checking for $ac_func" >&5
+echo "configure:1134: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <<EOF
-#line 1123 "configure"
+#line 1139 "configure"
#include "confdefs.h"
/* System header to define __stub macros and hopefully few prototypes,
which can conflict with char $ac_func(); below. */
@@ -1122,7 +1135,7 @@ $ac_func();
; return 0; }
EOF
-if { (eval echo configure:1146: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+if { (eval echo configure:1162: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
@@ -1148,22 +1161,31 @@ done
-
-LIBTOOL=
-LIBSUFFIX=a
+LIBTOOL=libtool
+LIBSUFFIX=la
# Check whether --enable-shared or --disable-shared was given.
if test "${enable_shared+set}" = set; then
enableval="$enable_shared"
- if test "$enableval" = "yes"; then
- LIBTOOL=libtool
- LIBSUFFIX=la
-fi
+ if test "$enableval" = "no"; then
+ LIBTOOL=
+ LIBSUFFIX=a
+fi
fi
+
+
+
+
+
+
+
+
+
+
trap '' 1 2 15
cat > confcache <<\EOF
# This file is a shell script that caches the results of configure
@@ -1264,7 +1286,7 @@ done
ac_given_srcdir=$srcdir
-trap 'rm -fr `echo "Makefile config.h:config.in" | sed "s/:[^ ]*//g"` conftest*; exit 1' 1 2 15
+trap 'rm -fr `echo "Makefile pcre.h:pcre.in pcre-config config.h:config.in" | sed "s/:[^ ]*//g"` conftest*; exit 1' 1 2 15
EOF
cat >> $CONFIG_STATUS <<EOF
@@ -1299,8 +1321,16 @@ s%@mandir@%$mandir%g
s%@CC@%$CC%g
s%@RANLIB@%$RANLIB%g
s%@CPP@%$CPP%g
+s%@HAVE_MEMMOVE@%$HAVE_MEMMOVE%g
+s%@HAVE_STRERROR@%$HAVE_STRERROR%g
s%@LIBTOOL@%$LIBTOOL%g
s%@LIBSUFFIX@%$LIBSUFFIX%g
+s%@PCRE_MAJOR@%$PCRE_MAJOR%g
+s%@PCRE_MINOR@%$PCRE_MINOR%g
+s%@PCRE_DATE@%$PCRE_DATE%g
+s%@PCRE_VERSION@%$PCRE_VERSION%g
+s%@PCRE_LIB_VERSION@%$PCRE_LIB_VERSION%g
+s%@PCRE_POSIXLIB_VERSION@%$PCRE_POSIXLIB_VERSION%g
CEOF
EOF
@@ -1342,7 +1372,7 @@ EOF
cat >> $CONFIG_STATUS <<EOF
-CONFIG_FILES=\${CONFIG_FILES-"Makefile"}
+CONFIG_FILES=\${CONFIG_FILES-"Makefile pcre.h:pcre.in pcre-config"}
EOF
cat >> $CONFIG_STATUS <<\EOF
for ac_file in .. $CONFIG_FILES; do if test "x$ac_file" != x..; then
@@ -1508,7 +1538,7 @@ cat >> $CONFIG_STATUS <<EOF
EOF
cat >> $CONFIG_STATUS <<\EOF
-
+chmod a+x pcre-config
exit 0
EOF
chmod +x $CONFIG_STATUS
diff --git a/configure.in b/configure.in
index ef94892..507888b 100644
--- a/configure.in
+++ b/configure.in
@@ -5,36 +5,71 @@ dnl it should be seeing, to verify it is in the same directory.
AC_INIT(dftables.c)
+dnl Arrange to build config.h from config.in. Note that pcre.h is
+dnl built differently, as it is just a "substitution" file.
+dnl Manual says this macro should come right after AC_INIT.
+AC_CONFIG_HEADER(config.h:config.in)
+
+dnl Provide the current PCRE version information. Do not use numbers
+dnl with leading zeros for the minor version, as they end up in a C
+dnl macro, and may be treated as octal constants. Stick to single
+dnl digits for minor numbers less than 10. There are unlikely to be
+dnl that many releases anyway.
+
+PCRE_MAJOR=3
+PCRE_MINOR=0
+PCRE_DATE=01-Feb-2000
+PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR}
+
+dnl Provide versioning information for libtool shared libraries that
+dnl are built by default on Unix systems.
+
+PCRE_LIB_VERSION=0:0:0
+PCRE_POSIXLIB_VERSION=0:0:0
+
dnl Checks for programs.
+
AC_PROG_CC
AC_PROG_RANLIB
dnl Checks for header files.
+
AC_HEADER_STDC
AC_CHECK_HEADERS(limits.h)
dnl Checks for typedefs, structures, and compiler characteristics.
+
AC_C_CONST
AC_TYPE_SIZE_T
dnl Checks for library functions.
-AC_CHECK_FUNCS(memmove strerror)
-dnl Arrange to build config.h from config.in
-AC_CONFIG_HEADER(config.h:config.in)
+AC_CHECK_FUNCS(memmove strerror)
dnl Handle --enable-shared-libraries
-LIBTOOL=
-LIBSUFFIX=a
+
+LIBTOOL=libtool
+LIBSUFFIX=la
AC_ARG_ENABLE(shared,
-[ --enable-shared build PCRE as a shared library (using libtool)],
-if test "$enableval" = "yes"; then
- LIBTOOL=libtool
- LIBSUFFIX=la
+[ --disable-shared build PCRE as a static library],
+if test "$enableval" = "no"; then
+ LIBTOOL=
+ LIBSUFFIX=a
fi
)
+
+dnl "Export" these variables
+
+AC_SUBST(HAVE_MEMMOVE)
+AC_SUBST(HAVE_STRERROR)
AC_SUBST(LIBTOOL)
AC_SUBST(LIBSUFFIX)
+AC_SUBST(PCRE_MAJOR)
+AC_SUBST(PCRE_MINOR)
+AC_SUBST(PCRE_DATE)
+AC_SUBST(PCRE_VERSION)
+AC_SUBST(PCRE_LIB_VERSION)
+AC_SUBST(PCRE_POSIXLIB_VERSION)
dnl This must be last; it determines what files are written
-AC_OUTPUT(Makefile)
+AC_OUTPUT(Makefile pcre.h:pcre.in pcre-config,[chmod a+x pcre-config])
diff --git a/dftables.c b/dftables.c
index 7b336e6..d572dfd 100644
--- a/dftables.c
+++ b/dftables.c
@@ -8,7 +8,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by: Philip Hazel <ph10@cam.ac.uk>
- Copyright (c) 1997-1999 University of Cambridge
+ Copyright (c) 1997-2000 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
@@ -89,9 +89,11 @@ for (i = 0; i < 256; i++)
printf(",\n\n");
printf(
- "/* This table contains bit maps for digits, 'word' chars, and white\n"
- "space. Each map is 32 bytes long and the bits run from the least\n"
- "significant end of each byte. */\n\n");
+ "/* This table contains bit maps for various character classes.\n"
+ "Each map is 32 bytes long and the bits run from the least\n"
+ "significant end of each byte. The classes that have their own\n"
+ "maps are: space, xdigit, digit, upper, lower, word, graph\n"
+ "print, punct, and cntrl. Other classes are built from combinations. */\n\n");
printf(" ");
for (i = 0; i < cbit_length; i++)
@@ -104,7 +106,7 @@ for (i = 0; i < cbit_length; i++)
printf("0x%02x", *tables++);
if (i != cbit_length - 1) printf(",");
}
-printf(" ,\n\n");
+printf(",\n\n");
printf(
"/* This table identifies various classes of character by individual bits:\n"
diff --git a/doc/Tech.Notes b/doc/Tech.Notes
index d485a4e..03904db 100644
--- a/doc/Tech.Notes
+++ b/doc/Tech.Notes
@@ -23,18 +23,19 @@ optionally, minimizing in Perl) the amount of the subject that matches
individual wild portions of the pattern. This is an "NFA algorithm" in Friedl's
terminology.
-For this set of functions that forms PCRE, I tried at first to invent an
-algorithm that used an amount of store bounded by a multiple of the number of
-characters in the pattern, to save on compiling time. However, because of the
-greater complexity in Perl regular expressions, I couldn't do this. In any
-case, a first pass through the pattern is needed, in order to find internal
-flag settings like (?i) at top level. So it works by running a very degenerate
-first pass to calculate a maximum store size, and then a second pass to do the
-real compile - which may use a bit less than the predicted amount of store. The
-idea is that this is going to turn out faster because the first pass is
-degenerate and the second can just store stuff straight into the vector. It
-does make the compiling functions bigger, of course, but they have got quite
-big anyway to handle all the Perl stuff.
+For the set of functions that forms PCRE (which are unrelated to those
+mentioned above), I tried at first to invent an algorithm that used an amount
+of store bounded by a multiple of the number of characters in the pattern, to
+save on compiling time. However, because of the greater complexity in Perl
+regular expressions, I couldn't do this. In any case, a first pass through the
+pattern is needed, in order to find internal flag settings like (?i) at top
+level. So PCRE works by running a very degenerate first pass to calculate a
+maximum store size, and then a second pass to do the real compile - which may
+use a bit less than the predicted amount of store. The idea is that this is
+going to turn out faster because the first pass is degenerate and the second
+pass can just store stuff straight into the vector. It does make the compiling
+functions bigger, of course, but they have got quite big anyway to handle all
+the Perl stuff.
The compiled form of a pattern is a vector of bytes, containing items of
variable length. The first byte in an item is an opcode, and the length of the
@@ -61,6 +62,7 @@ These items are all just one byte long
OP_EODN match end of data or \n at end: \Z
OP_EOD match end of data: \z
OP_DOLL $ (end of data, or before \n in multiline)
+ OP_RECURSE match the pattern recursively
Repeating single characters
@@ -125,9 +127,9 @@ positive class, and OP_NOT for a negative one (that is, for something like
repeated, negated, single-character class. The normal ones (OP_STAR etc.) are
used for a repeated positive single-character class.
-OP_CLASS is followed by a 32-byte bit map containing a 1
-bit for every character that is acceptable. The bits are counted from the least
-significant end of each byte.
+OP_CLASS is followed by a 32-byte bit map containing a 1 bit for every
+character that is acceptable. The bits are counted from the least significant
+end of each byte.
Back references
@@ -159,11 +161,12 @@ four bytes of data, comprising the minimum and maximum repeat counts.
Brackets and alternation
------------------------
-A pair of non-identifying (round) brackets is wrapped round each expression at
+A pair of non-capturing (round) brackets is wrapped round each expression at
compile time, so alternation always happens in the context of brackets.
-Non-identifying brackets use the opcode OP_BRA, while identifying brackets use
+Non-capturing brackets use the opcode OP_BRA, while capturing brackets use
OP_BRA+1, OP_BRA+2, etc. [Note for North Americans: "bracket" to some English
-speakers, including myself, can be round, square, or curly. Hence this usage.]
+speakers, including myself, can be round, square, curly, or pointy. Hence this
+usage.]
A bracket opcode is followed by two bytes which give the offset to the next
alternative OP_ALT or, if there aren't any branches, to the matching KET
@@ -236,4 +239,4 @@ the compiled data.
Philip Hazel
-January 1999
+February 2000
diff --git a/doc/pcre.3 b/doc/pcre.3
index 47971b9..bd435e9 100644
--- a/doc/pcre.3
+++ b/doc/pcre.3
@@ -47,6 +47,11 @@ pcre - Perl-compatible regular expressions.
.B const unsigned char *pcre_maketables(void);
.PP
.br
+.B int pcre_fullinfo(const pcre *\fIcode\fR, "const pcre_extra *\fIextra\fR,"
+.ti +5n
+.B int \fIwhat\fR, void *\fIwhere\fR);
+.PP
+.br
.B int pcre_info(const pcre *\fIcode\fR, int *\fIoptptr\fR, int
.B *\fIfirstcharptr\fR);
.PP
@@ -64,16 +69,19 @@ pcre - Perl-compatible regular expressions.
.SH DESCRIPTION
The PCRE library is a set of functions that implement regular expression
pattern matching using the same syntax and semantics as Perl 5, with just a few
-differences (see below). The current implementation corresponds to Perl 5.005.
+differences (see below). The current implementation corresponds to Perl 5.005,
+with some additional features from the Perl development release.
PCRE has its own native API, which is described in this document. There is also
-a set of wrapper functions that correspond to the POSIX API. These are
-described in the \fBpcreposix\fR documentation.
+a set of wrapper functions that correspond to the POSIX regular expression API.
+These are described in the \fBpcreposix\fR documentation.
The native API function prototypes are defined in the header file \fBpcre.h\fR,
and on Unix systems the library itself is called \fBlibpcre.a\fR, so can be
accessed by adding \fB-lpcre\fR to the command for linking an application which
-calls it.
+calls it. The header file defines the macros PCRE_MAJOR and PCRE_MINOR to
+contain the major and minor release numbers for the library. Applications can
+use these to include support for different releases.
The functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and \fBpcre_exec()\fR
are used for compiling and matching regular expressions, while
@@ -83,9 +91,11 @@ captured substrings from a matched subject string. The function
\fBpcre_maketables()\fR is used (optionally) to build a set of character tables
in the current locale for passing to \fBpcre_compile()\fR.
-The function \fBpcre_info()\fR is used to find out information about a compiled
-pattern, while the function \fBpcre_version()\fR returns a pointer to a string
-containing the version of PCRE and its date of release.
+The function \fBpcre_fullinfo()\fR is used to find out information about a
+compiled pattern; \fBpcre_info()\fR is an obsolete version which returns only
+some of the available information, but is retained for backwards compatibility.
+The function \fBpcre_version()\fR returns a pointer to a string containing the
+version of PCRE and its date of release.
The global variables \fBpcre_malloc\fR and \fBpcre_free\fR initially contain
the entry points of the standard \fBmalloc()\fR and \fBfree()\fR functions
@@ -182,12 +192,14 @@ sequence (?( which introduces a conditional subpattern.
PCRE_EXTRA
-This option turns on additional functionality of PCRE that is incompatible with
-Perl. Any backslash in a pattern that is followed by a letter that has no
+This option was invented in order to turn on additional functionality of PCRE
+that is incompatible with Perl, but it is currently of very little use. When
+set, any backslash in a pattern that is followed by a letter that has no
special meaning causes an error, thus reserving these combinations for future
expansion. By default, as in Perl, a backslash followed by a letter with no
special meaning is treated as a literal. There are at present no other features
-controlled by this option.
+controlled by this option. It can also be set by a (?X) option setting within a
+pattern.
PCRE_MULTILINE
@@ -261,25 +273,58 @@ memory containing the tables remains available for as long as it is needed.
.SH INFORMATION ABOUT A PATTERN
-The \fBpcre_info()\fR function returns information about a compiled pattern.
-Its yield is the number of capturing subpatterns, or one of the following
-negative numbers:
+The \fBpcre_fullinfo()\fR function returns information about a compiled
+pattern. It replaces the obsolete \fBpcre_info()\fR function, which is
+nevertheless retained for backwards compatibility (and is documented below).
+
+The first argument for \fBpcre_fullinfo()\fR is a pointer to the compiled
+pattern. The second argument is the result of \fBpcre_study()\fR, or NULL if
+the pattern was not studied. The third argument specifies which piece of
+information is required, while the fourth argument is a pointer to a variable
+to receive the data. The yield of the function is zero for success, or one of
+the following negative numbers:
PCRE_ERROR_NULL the argument \fIcode\fR was NULL
+ the argument \fIwhere\fR was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
+ PCRE_ERROR_BADOPTION the value of \fIwhat\fR was invalid
-If the \fIoptptr\fR argument is not NULL, a copy of the options with which the
-pattern was compiled is placed in the integer it points to. These option bits
+The possible values for the third argument are defined in \fBpcre.h\fR, and are
+as follows:
+
+ PCRE_INFO_OPTIONS
+
+Return a copy of the options with which the pattern was compiled. The fourth
+argument should point to an \fBunsigned long int\fR variable. These option bits
are those specified in the call to \fBpcre_compile()\fR, modified by any
top-level option settings within the pattern itself, and with the PCRE_ANCHORED
-bit set if the form of the pattern implies that it can match only at the start
-of a subject string.
+bit forcibly set if the form of the pattern implies that it can match only at
+the start of a subject string.
-If the pattern is not anchored and the \fIfirstcharptr\fR argument is not NULL,
-it is used to pass back information about the first character of any matched
-string. If there is a fixed first character, e.g. from a pattern such as
-(cat|cow|coyote), then it is returned in the integer pointed to by
-\fIfirstcharptr\fR. Otherwise, if either
+ PCRE_INFO_SIZE
+
+Return the size of the compiled pattern, that is, the value that was passed as
+the argument to \fBpcre_malloc()\fR when PCRE was getting memory in which to
+place the compiled data. The fourth argument should point to a \fBsize_t\fR
+variable.
+
+ PCRE_INFO_CAPTURECOUNT
+
+Return the number of capturing subpatterns in the pattern. The fourth argument
+should point to an \fBint\fR variable.
+
+ PCRE_INFO_BACKREFMAX
+
+Return the number of the highest back reference in the pattern. The fourth
+argument should point to an \fBint\fR variable. Zero is returned if there are
+no back references.
+
+ PCRE_INFO_FIRSTCHAR
+
+Return information about the first character of any matched string, for a
+non-anchored pattern. If there is a fixed first character, e.g. from a pattern
+such as (cat|cow|coyote), then it is returned in the integer pointed to by
+\fIwhere\fR. Otherwise, if either
(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
starts with "^", or
@@ -289,7 +334,40 @@ starts with "^", or
then -1 is returned, indicating that the pattern matches only at the
start of a subject string or after any "\\n" within the string. Otherwise -2 is
-returned.
+returned. For anchored patterns, -2 is returned.
+
+ PCRE_INFO_FIRSTTABLE
+
+If the pattern was studied, and this resulted in the construction of a 256-bit
+table indicating a fixed set of characters for the first character in any
+matching string, a pointer to the table is returned. Otherwise NULL is
+returned. The fourth argument should point to an \fBunsigned char *\fR
+variable.
+
+ PCRE_INFO_LASTLITERAL
+
+For a non-anchored pattern, return the value of the rightmost literal character
+which must exist in any matched string, other than at its start. The fourth
+argument should point to an \fBint\fR variable. If there is no such character,
+or if the pattern is anchored, -1 is returned. For example, for the pattern
+/a\\d+z\\d+/ the returned value is 'z'.
+
+The \fBpcre_info()\fR function is now obsolete because its interface is too
+restrictive to return all the available data about a compiled pattern. New
+programs should use \fBpcre_fullinfo()\fR instead. The yield of
+\fBpcre_info()\fR is the number of capturing subpatterns, or one of the
+following negative numbers:
+
+ PCRE_ERROR_NULL the argument \fIcode\fR was NULL
+ PCRE_ERROR_BADMAGIC the "magic number" was not found
+
+If the \fIoptptr\fR argument is not NULL, a copy of the options with which the
+pattern was compiled is placed in the integer it points to (see
+PCRE_INFO_OPTIONS above).
+
+If the pattern is not anchored and the \fIfirstcharptr\fR argument is not NULL,
+it is used to pass back information about the first character of any matched
+string (see PCRE_INFO_FIRSTCHAR above).
.SH MATCHING A PATTERN
@@ -564,7 +642,9 @@ are not part of its pattern matching engine.
6. The Perl \\G assertion is not supported as it is not relevant to single
pattern matches.
-7. Fairly obviously, PCRE does not support the (?{code}) construction.
+7. Fairly obviously, PCRE does not support the (?{code}) and (?p{code})
+constructions. However, there is some experimental support for recursive
+patterns using the non-Perl item (?R).
8. There are at the time of writing some oddities in Perl 5.005_02 concerned
with the settings of captured strings when part of a pattern is repeated. For
@@ -602,13 +682,16 @@ of the subject.
(f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options for
\fBpcre_exec()\fR have no Perl equivalents.
+(g) The (?R) construct allows for recursive pattern matching (Perl 5.6 can do
+this using the (?p{code}) construct, which PCRE cannot of course support).
+
.SH REGULAR EXPRESSION DETAILS
The syntax and semantics of the regular expressions supported by PCRE are
described below. Regular expressions are also described in the Perl
documentation and in a number of other books, some of which have copious
examples. Jeffrey Friedl's "Mastering Regular Expressions", published by
-O'Reilly (ISBN 1-56592-257-3), covers them in great detail. The description
+O'Reilly (ISBN 1-56592-257-3), covers them in great detail. The description
here is intended as reference documentation.
A regular expression is a pattern that is matched against a subject string from
@@ -906,6 +989,40 @@ terminating ] are non-special in character classes, but it does no harm if they
are escaped.
+.SH POSIX CHARACTER CLASSES
+Perl 5.6 (not yet released at the time of writing) is going to support the
+POSIX notation for character classes, which uses names enclosed by [: and :]
+within the enclosing square brackets. PCRE supports this notation. For example,
+
+ [01[:alpha:]%]
+
+matches "0", "1", any alphabetic character, or "%". The supported class names
+are
+
+ alnum letters and digits
+ alpha letters
+ ascii character codes 0 - 127
+ cntrl control characters
+ digit decimal digits (same as \\d)
+ graph printing characters, excluding space
+ lower lower case letters
+ print printing characters, including space
+ punct printing characters, excluding letters and digits
+ space white space (same as \\s)
+ upper upper case letters
+ word "word" characters (same as \\w)
+ xdigit hexadecimal digits
+
+The names "ascii" and "word" are Perl extensions. Another Perl extension is
+negation, which is indicated by a ^ character after the colon. For example,
+
+ [12[:^digit:]]
+
+matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX
+syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not
+supported, and an error is given if they are encountered.
+
+
.SH VERTICAL BAR
Vertical bar characters are used to separate alternative patterns. For example,
the pattern
@@ -1352,18 +1469,17 @@ pattern such as
abcd$
-when applied to a long string which does not match it. Because matching
-proceeds from left to right, PCRE will look for each "a" in the subject and
-then see if what follows matches the rest of the pattern. If the pattern is
-specified as
+when applied to a long string which does not match. Because matching proceeds
+from left to right, PCRE will look for each "a" in the subject and then see if
+what follows matches the rest of the pattern. If the pattern is specified as
^.*abcd$
-then the initial .* matches the entire string at first, but when this fails, it
-backtracks to match all but the last character, then all but the last two
-characters, and so on. Once again the search for "a" covers the entire string,
-from right to left, so we are no better off. However, if the pattern is written
-as
+then the initial .* matches the entire string at first, but when this fails
+(because there is no following "a"), it backtracks to match all but the last
+character, then all but the last two characters, and so on. Once again the
+search for "a" covers the entire string, from right to left, so we are no
+better off. However, if the pattern is written as
^(?>.*)(?<=abcd)
@@ -1372,6 +1488,31 @@ string. The subsequent lookbehind assertion does a single test on the last four
characters. If it fails, the match fails immediately. For long strings, this
approach makes a significant difference to the processing time.
+When a pattern contains an unlimited repeat inside a subpattern that can itself
+be repeated an unlimited number of times, the use of a once-only subpattern is
+the only way to avoid some failing matches taking a very long time indeed.
+The pattern
+
+ (\\D+|<\\d+>)*[!?]
+
+matches an unlimited number of substrings that either consist of non-digits, or
+digits enclosed in <>, followed by either ! or ?. When it matches, it runs
+quickly. However, if it is applied to
+
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+
+it takes a long time before reporting failure. This is because the string can
+be divided between the two repeats in a large number of ways, and all have to
+be tried. (The example used [!?] rather than a single character at the end,
+because both PCRE and Perl have an optimization that allows for fast failure
+when a single character is used. They remember the last single character that
+is required for a match, and fail early if it is not present in the string.)
+If the pattern is changed to
+
+ ((?>\\D+)|<\\d+>)*[!?]
+
+sequences of non-digits cannot be broken, and failure happens quickly.
+
.SH CONDITIONAL SUBPATTERNS
It is possible to cause the matching process to obey a subpattern
@@ -1431,6 +1572,65 @@ character class introduces a comment that continues up to the next newline
character in the pattern.
+.SH RECURSIVE PATTERNS
+Consider the problem of matching a string in parentheses, allowing for
+unlimited nested parentheses. Without the use of recursion, the best that can
+be done is to use a pattern that matches up to some fixed depth of nesting. It
+is not possible to handle an arbitrary nesting depth. Perl 5.6 has provided an
+experimental facility that allows regular expressions to recurse (amongst other
+things). It does this by interpolating Perl code in the expression at run time,
+and the code can refer to the expression itself. A Perl pattern to solve the
+parentheses problem can be created like this:
+
+ $re = qr{\\( (?: (?>[^()]+) | (?p{$re}) )* \\)}x;
+
+The (?p{...}) item interpolates Perl code at run time, and in this case refers
+recursively to the pattern in which it appears. Obviously, PCRE cannot support
+the interpolation of Perl code. Instead, the special item (?R) is provided for
+the specific case of recursion. This PCRE pattern solves the parentheses
+problem (assume the PCRE_EXTENDED option is set so that white space is
+ignored):
+
+ \\( ( (?>[^()]+) | (?R) )* \\)
+
+First it matches an opening parenthesis. Then it matches any number of
+substrings which can either be a sequence of non-parentheses, or a recursive
+match of the pattern itself (i.e. a correctly parenthesized substring). Finally
+there is a closing parenthesis.
+
+This particular example pattern contains nested unlimited repeats, and so the
+use of a once-only subpattern for matching strings of non-parentheses is
+important when applying the pattern to strings that do not match. For example,
+when it is applied to
+
+ (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
+
+it yields "no match" quickly. However, if a once-only subpattern is not used,
+the match runs for a very long time indeed because there are so many different
+ways the + and * repeats can carve up the subject, and all have to be tested
+before failure can be reported.
+
+The values set for any capturing subpatterns are those from the outermost level
+of the recursion at which the subpattern value is set. If the pattern above is
+matched against
+
+ (ab(cd)ef)
+
+the value for the capturing parentheses is "ef", which is the last value taken
+on at the top level. If additional parentheses are added, giving
+
+ \\( ( ( (?>[^()]+) | (?R) )* ) \\)
+ ^ ^
+ ^ ^
+then the string they capture is "ab(cd)ef", the contents of the top level
+parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE
+has to obtain extra memory to store data during a recursion, which it does by
+using \fBpcre_malloc\fR, freeing it via \fBpcre_free\fR afterwards. If no
+memory can be obtained, it saves data for the first 15 capturing parentheses
+only, as there is no way to give an out-of-memory error from within a
+recursion.
+
+
.SH PERFORMANCE
Certain items that may appear in patterns are more efficient than others. It is
more efficient to use a character class like [aeiou] than a set of alternatives
@@ -1497,6 +1697,6 @@ Cambridge CB2 3QG, England.
.br
Phone: +44 1223 334714
-Last updated: 29 July 1999
+Last updated: 27 January 2000
.br
-Copyright (c) 1997-1999 University of Cambridge.
+Copyright (c) 1997-2000 University of Cambridge.
diff --git a/doc/pcre.html b/doc/pcre.html
index 6d91a5c..2ce2890 100644
--- a/doc/pcre.html
+++ b/doc/pcre.html
@@ -25,17 +25,19 @@ conversion went wrong.
<LI><A NAME="TOC15" HREF="#SEC15">CIRCUMFLEX AND DOLLAR</A>
<LI><A NAME="TOC16" HREF="#SEC16">FULL STOP (PERIOD, DOT)</A>
<LI><A NAME="TOC17" HREF="#SEC17">SQUARE BRACKETS</A>
-<LI><A NAME="TOC18" HREF="#SEC18">VERTICAL BAR</A>
-<LI><A NAME="TOC19" HREF="#SEC19">INTERNAL OPTION SETTING</A>
-<LI><A NAME="TOC20" HREF="#SEC20">SUBPATTERNS</A>
-<LI><A NAME="TOC21" HREF="#SEC21">REPETITION</A>
-<LI><A NAME="TOC22" HREF="#SEC22">BACK REFERENCES</A>
-<LI><A NAME="TOC23" HREF="#SEC23">ASSERTIONS</A>
-<LI><A NAME="TOC24" HREF="#SEC24">ONCE-ONLY SUBPATTERNS</A>
-<LI><A NAME="TOC25" HREF="#SEC25">CONDITIONAL SUBPATTERNS</A>
-<LI><A NAME="TOC26" HREF="#SEC26">COMMENTS</A>
-<LI><A NAME="TOC27" HREF="#SEC27">PERFORMANCE</A>
-<LI><A NAME="TOC28" HREF="#SEC28">AUTHOR</A>
+<LI><A NAME="TOC18" HREF="#SEC18">POSIX CHARACTER CLASSES</A>
+<LI><A NAME="TOC19" HREF="#SEC19">VERTICAL BAR</A>
+<LI><A NAME="TOC20" HREF="#SEC20">INTERNAL OPTION SETTING</A>
+<LI><A NAME="TOC21" HREF="#SEC21">SUBPATTERNS</A>
+<LI><A NAME="TOC22" HREF="#SEC22">REPETITION</A>
+<LI><A NAME="TOC23" HREF="#SEC23">BACK REFERENCES</A>
+<LI><A NAME="TOC24" HREF="#SEC24">ASSERTIONS</A>
+<LI><A NAME="TOC25" HREF="#SEC25">ONCE-ONLY SUBPATTERNS</A>
+<LI><A NAME="TOC26" HREF="#SEC26">CONDITIONAL SUBPATTERNS</A>
+<LI><A NAME="TOC27" HREF="#SEC27">COMMENTS</A>
+<LI><A NAME="TOC28" HREF="#SEC28">RECURSIVE PATTERNS</A>
+<LI><A NAME="TOC29" HREF="#SEC29">PERFORMANCE</A>
+<LI><A NAME="TOC30" HREF="#SEC30">AUTHOR</A>
</UL>
<LI><A NAME="SEC1" HREF="#TOC1">NAME</A>
<P>
@@ -77,6 +79,10 @@ pcre - Perl-compatible regular expressions.
<B>const unsigned char *pcre_maketables(void);</B>
</P>
<P>
+<B>int pcre_fullinfo(const pcre *<I>code</I>, const pcre_extra *<I>extra</I>,</B>
+<B>int <I>what</I>, void *<I>where</I>);</B>
+</P>
+<P>
<B>int pcre_info(const pcre *<I>code</I>, int *<I>optptr</I>, int</B>
<B>*<I>firstcharptr</I>);</B>
</P>
@@ -93,18 +99,21 @@ pcre - Perl-compatible regular expressions.
<P>
The PCRE library is a set of functions that implement regular expression
pattern matching using the same syntax and semantics as Perl 5, with just a few
-differences (see below). The current implementation corresponds to Perl 5.005.
+differences (see below). The current implementation corresponds to Perl 5.005,
+with some additional features from the Perl development release.
</P>
<P>
PCRE has its own native API, which is described in this document. There is also
-a set of wrapper functions that correspond to the POSIX API. These are
-described in the <B>pcreposix</B> documentation.
+a set of wrapper functions that correspond to the POSIX regular expression API.
+These are described in the <B>pcreposix</B> documentation.
</P>
<P>
The native API function prototypes are defined in the header file <B>pcre.h</B>,
and on Unix systems the library itself is called <B>libpcre.a</B>, so can be
accessed by adding <B>-lpcre</B> to the command for linking an application which
-calls it.
+calls it. The header file defines the macros PCRE_MAJOR and PCRE_MINOR to
+contain the major and minor release numbers for the library. Applications can
+use these to include support for different releases.
</P>
<P>
The functions <B>pcre_compile()</B>, <B>pcre_study()</B>, and <B>pcre_exec()</B>
@@ -116,9 +125,11 @@ captured substrings from a matched subject string. The function
in the current locale for passing to <B>pcre_compile()</B>.
</P>
<P>
-The function <B>pcre_info()</B> is used to find out information about a compiled
-pattern, while the function <B>pcre_version()</B> returns a pointer to a string
-containing the version of PCRE and its date of release.
+The function <B>pcre_fullinfo()</B> is used to find out information about a
+compiled pattern; <B>pcre_info()</B> is an obsolete version which returns only
+some of the available information, but is retained for backwards compatibility.
+The function <B>pcre_version()</B> returns a pointer to a string containing the
+version of PCRE and its date of release.
</P>
<P>
The global variables <B>pcre_malloc</B> and <B>pcre_free</B> initially contain
@@ -246,12 +257,14 @@ sequence (?( which introduces a conditional subpattern.
</PRE>
</P>
<P>
-This option turns on additional functionality of PCRE that is incompatible with
-Perl. Any backslash in a pattern that is followed by a letter that has no
+This option was invented in order to turn on additional functionality of PCRE
+that is incompatible with Perl, but it is currently of very little use. When
+set, any backslash in a pattern that is followed by a letter that has no
special meaning causes an error, thus reserving these combinations for future
expansion. By default, as in Perl, a backslash followed by a letter with no
special meaning is treated as a literal. There are at present no other features
-controlled by this option.
+controlled by this option. It can also be set by a (?X) option setting within a
+pattern.
</P>
<P>
<PRE>
@@ -342,30 +355,83 @@ memory containing the tables remains available for as long as it is needed.
</P>
<LI><A NAME="SEC8" HREF="#TOC1">INFORMATION ABOUT A PATTERN</A>
<P>
-The <B>pcre_info()</B> function returns information about a compiled pattern.
-Its yield is the number of capturing subpatterns, or one of the following
-negative numbers:
+The <B>pcre_fullinfo()</B> function returns information about a compiled
+pattern. It replaces the obsolete <B>pcre_info()</B> function, which is
+nevertheless retained for backwards compatibility (and is documented below).
+</P>
+<P>
+The first argument for <B>pcre_fullinfo()</B> is a pointer to the compiled
+pattern. The second argument is the result of <B>pcre_study()</B>, or NULL if
+the pattern was not studied. The third argument specifies which piece of
+information is required, while the fourth argument is a pointer to a variable
+to receive the data. The yield of the function is zero for success, or one of
+the following negative numbers:
</P>
<P>
<PRE>
PCRE_ERROR_NULL the argument <I>code</I> was NULL
+ the argument <I>where</I> was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
+ PCRE_ERROR_BADOPTION the value of <I>what</I> was invalid
</PRE>
</P>
<P>
-If the <I>optptr</I> argument is not NULL, a copy of the options with which the
-pattern was compiled is placed in the integer it points to. These option bits
+The possible values for the third argument are defined in <B>pcre.h</B>, and are
+as follows:
+</P>
+<P>
+<PRE>
+ PCRE_INFO_OPTIONS
+</PRE>
+</P>
+<P>
+Return a copy of the options with which the pattern was compiled. The fourth
+argument should point to an <B>unsigned long int</B> variable. These option bits
are those specified in the call to <B>pcre_compile()</B>, modified by any
top-level option settings within the pattern itself, and with the PCRE_ANCHORED
-bit set if the form of the pattern implies that it can match only at the start
-of a subject string.
+bit forcibly set if the form of the pattern implies that it can match only at
+the start of a subject string.
</P>
<P>
-If the pattern is not anchored and the <I>firstcharptr</I> argument is not NULL,
-it is used to pass back information about the first character of any matched
-string. If there is a fixed first character, e.g. from a pattern such as
-(cat|cow|coyote), then it is returned in the integer pointed to by
-<I>firstcharptr</I>. Otherwise, if either
+<PRE>
+ PCRE_INFO_SIZE
+</PRE>
+</P>
+<P>
+Return the size of the compiled pattern, that is, the value that was passed as
+the argument to <B>pcre_malloc()</B> when PCRE was getting memory in which to
+place the compiled data. The fourth argument should point to a <B>size_t</B>
+variable.
+</P>
+<P>
+<PRE>
+ PCRE_INFO_CAPTURECOUNT
+</PRE>
+</P>
+<P>
+Return the number of capturing subpatterns in the pattern. The fourth argument
+should point to an <B>int</B> variable.
+</P>
+<P>
+<PRE>
+ PCRE_INFO_BACKREFMAX
+</PRE>
+</P>
+<P>
+Return the number of the highest back reference in the pattern. The fourth
+argument should point to an <B>int</B> variable. Zero is returned if there are
+no back references.
+</P>
+<P>
+<PRE>
+ PCRE_INFO_FIRSTCHAR
+</PRE>
+</P>
+<P>
+Return information about the first character of any matched string, for a
+non-anchored pattern. If there is a fixed first character, e.g. from a pattern
+such as (cat|cow|coyote), then it is returned in the integer pointed to by
+<I>where</I>. Otherwise, if either
</P>
<P>
(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
@@ -378,7 +444,54 @@ starts with "^", or
<P>
then -1 is returned, indicating that the pattern matches only at the
start of a subject string or after any "\n" within the string. Otherwise -2 is
-returned.
+returned. For anchored patterns, -2 is returned.
+</P>
+<P>
+<PRE>
+ PCRE_INFO_FIRSTTABLE
+</PRE>
+</P>
+<P>
+If the pattern was studied, and this resulted in the construction of a 256-bit
+table indicating a fixed set of characters for the first character in any
+matching string, a pointer to the table is returned. Otherwise NULL is
+returned. The fourth argument should point to an <B>unsigned char *</B>
+variable.
+</P>
+<P>
+<PRE>
+ PCRE_INFO_LASTLITERAL
+</PRE>
+</P>
+<P>
+For a non-anchored pattern, return the value of the rightmost literal character
+which must exist in any matched string, other than at its start. The fourth
+argument should point to an <B>int</B> variable. If there is no such character,
+or if the pattern is anchored, -1 is returned. For example, for the pattern
+/a\d+z\d+/ the returned value is 'z'.
+</P>
+<P>
+The <B>pcre_info()</B> function is now obsolete because its interface is too
+restrictive to return all the available data about a compiled pattern. New
+programs should use <B>pcre_fullinfo()</B> instead. The yield of
+<B>pcre_info()</B> is the number of capturing subpatterns, or one of the
+following negative numbers:
+</P>
+<P>
+<PRE>
+ PCRE_ERROR_NULL the argument <I>code</I> was NULL
+ PCRE_ERROR_BADMAGIC the "magic number" was not found
+</PRE>
+</P>
+<P>
+If the <I>optptr</I> argument is not NULL, a copy of the options with which the
+pattern was compiled is placed in the integer it points to (see
+PCRE_INFO_OPTIONS above).
+</P>
+<P>
+If the pattern is not anchored and the <I>firstcharptr</I> argument is not NULL,
+it is used to pass back information about the first character of any matched
+string (see PCRE_INFO_FIRSTCHAR above).
</P>
<LI><A NAME="SEC9" HREF="#TOC1">MATCHING A PATTERN</A>
<P>
@@ -735,7 +848,9 @@ are not part of its pattern matching engine.
pattern matches.
</P>
<P>
-7. Fairly obviously, PCRE does not support the (?{code}) construction.
+7. Fairly obviously, PCRE does not support the (?{code}) and (?p{code})
+constructions. However, there is some experimental support for recursive
+patterns using the non-Perl item (?R).
</P>
<P>
8. There are at the time of writing some oddities in Perl 5.005_02 concerned
@@ -783,13 +898,17 @@ of the subject.
(f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options for
<B>pcre_exec()</B> have no Perl equivalents.
</P>
+<P>
+(g) The (?R) construct allows for recursive pattern matching (Perl 5.6 can do
+this using the (?p{code}) construct, which PCRE cannot of course support.)
+</P>
<LI><A NAME="SEC13" HREF="#TOC1">REGULAR EXPRESSION DETAILS</A>
<P>
The syntax and semantics of the regular expressions supported by PCRE are
described below. Regular expressions are also described in the Perl
documentation and in a number of other books, some of which have copious
examples. Jeffrey Friedl's "Mastering Regular Expressions", published by
-O'Reilly (ISBN 1-56592-257-3), covers them in great detail. The description
+O'Reilly (ISBN 1-56592-257), covers them in great detail. The description
here is intended as reference documentation.
</P>
<P>
@@ -1144,7 +1263,53 @@ All non-alphameric characters other than \, -, ^ (at the start) and the
terminating ] are non-special in character classes, but it does no harm if they
are escaped.
</P>
-<LI><A NAME="SEC18" HREF="#TOC1">VERTICAL BAR</A>
+<LI><A NAME="SEC18" HREF="#TOC1">POSIX CHARACTER CLASSES</A>
+<P>
+Perl 5.6 (not yet released at the time of writing) is going to support the
+POSIX notation for character classes, which uses names enclosed by [: and :]
+within the enclosing square brackets. PCRE supports this notation. For example,
+</P>
+<P>
+<PRE>
+ [01[:alpha:]%]
+</PRE>
+</P>
+<P>
+matches "0", "1", any alphabetic character, or "%". The supported class names
+are
+</P>
+<P>
+<PRE>
+ alnum letters and digits
+ alpha letters
+ ascii character codes 0 - 127
+ cntrl control characters
+ digit decimal digits (same as \d)
+ graph printing characters, excluding space
+ lower lower case letters
+ print printing characters, including space
+ punct printing characters, excluding letters and digits
+ space white space (same as \s)
+ upper upper case letters
+ word "word" characters (same as \w)
+ xdigit hexadecimal digits
+</PRE>
+</P>
+<P>
+The names "ascii" and "word" are Perl extensions. Another Perl extension is
+negation, which is indicated by a ^ character after the colon. For example,
+</P>
+<P>
+<PRE>
+ [12[:^digit:]]
+</PRE>
+</P>
+<P>
+matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX
+syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not
+supported, and an error is given if they are encountered.
+</P>
+<LI><A NAME="SEC19" HREF="#TOC1">VERTICAL BAR</A>
<P>
Vertical bar characters are used to separate alternative patterns. For example,
the pattern
@@ -1162,7 +1327,7 @@ and the first one that succeeds is used. If the alternatives are within a
subpattern (defined below), "succeeds" means matching the rest of the main
pattern as well as the alternative in the subpattern.
</P>
-<LI><A NAME="SEC19" HREF="#TOC1">INTERNAL OPTION SETTING</A>
+<LI><A NAME="SEC20" HREF="#TOC1">INTERNAL OPTION SETTING</A>
<P>
The settings of PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and PCRE_EXTENDED
can be changed from within the pattern by a sequence of Perl option letters
@@ -1238,7 +1403,7 @@ respectively. The (?X) flag setting is special in that it must always occur
earlier in the pattern than any of the additional features it turns on, even
when it is at top level. It is best put at the start.
</P>
-<LI><A NAME="SEC20" HREF="#TOC1">SUBPATTERNS</A>
+<LI><A NAME="SEC21" HREF="#TOC1">SUBPATTERNS</A>
<P>
Subpatterns are delimited by parentheses (round brackets), which can be nested.
Marking part of a pattern as a subpattern does two things:
@@ -1309,7 +1474,7 @@ from left to right, and options are not reset until the end of the subpattern
is reached, an option setting in one branch does affect subsequent branches, so
the above patterns match "SUNDAY" as well as "Saturday".
</P>
-<LI><A NAME="SEC21" HREF="#TOC1">REPETITION</A>
+<LI><A NAME="SEC22" HREF="#TOC1">REPETITION</A>
<P>
Repetition is specified by quantifiers, which can follow any of the following
items:
@@ -1484,7 +1649,7 @@ example, after
<P>
matches "aba" the value of the second captured substring is "b".
</P>
-<LI><A NAME="SEC22" HREF="#TOC1">BACK REFERENCES</A>
+<LI><A NAME="SEC23" HREF="#TOC1">BACK REFERENCES</A>
<P>
Outside a character class, a backslash followed by a digit greater than 0 (and
possibly further digits) is a back reference to a capturing subpattern earlier
@@ -1560,7 +1725,7 @@ that the first iteration does not need to match the back reference. This can be
done using alternation, as in the example above, or by a quantifier with a
minimum of zero.
</P>
-<LI><A NAME="SEC23" HREF="#TOC1">ASSERTIONS</A>
+<LI><A NAME="SEC24" HREF="#TOC1">ASSERTIONS</A>
<P>
An assertion is a test on the characters following or preceding the current
matching point that does not actually consume any characters. The simple
@@ -1718,7 +1883,7 @@ because it does not make sense for negative assertions.
<P>
Assertions count towards the maximum of 200 parenthesized subpatterns.
</P>
-<LI><A NAME="SEC24" HREF="#TOC1">ONCE-ONLY SUBPATTERNS</A>
+<LI><A NAME="SEC25" HREF="#TOC1">ONCE-ONLY SUBPATTERNS</A>
<P>
With both maximizing and minimizing repetition, failure of what follows
normally causes the repeated item to be re-evaluated to see if a different
@@ -1782,10 +1947,9 @@ pattern such as
</PRE>
</P>
<P>
-when applied to a long string which does not match it. Because matching
-proceeds from left to right, PCRE will look for each "a" in the subject and
-then see if what follows matches the rest of the pattern. If the pattern is
-specified as
+when applied to a long string which does not match. Because matching proceeds
+from left to right, PCRE will look for each "a" in the subject and then see if
+what follows matches the rest of the pattern. If the pattern is specified as
</P>
<P>
<PRE>
@@ -1793,11 +1957,11 @@ specified as
</PRE>
</P>
<P>
-then the initial .* matches the entire string at first, but when this fails, it
-backtracks to match all but the last character, then all but the last two
-characters, and so on. Once again the search for "a" covers the entire string,
-from right to left, so we are no better off. However, if the pattern is written
-as
+then the initial .* matches the entire string at first, but when this fails
+(because there is no following "a"), it backtracks to match all but the last
+character, then all but the last two characters, and so on. Once again the
+search for "a" covers the entire string, from right to left, so we are no
+better off. However, if the pattern is written as
</P>
<P>
<PRE>
@@ -1810,7 +1974,45 @@ string. The subsequent lookbehind assertion does a single test on the last four
characters. If it fails, the match fails immediately. For long strings, this
approach makes a significant difference to the processing time.
</P>
-<LI><A NAME="SEC25" HREF="#TOC1">CONDITIONAL SUBPATTERNS</A>
+<P>
+When a pattern contains an unlimited repeat inside a subpattern that can itself
+be repeated an unlimited number of times, the use of a once-only subpattern is
+the only way to avoid some failing matches taking a very long time indeed.
+The pattern
+</P>
+<P>
+<PRE>
+ (\D+|&#60;\d+&#62;)*[!?]
+</PRE>
+</P>
+<P>
+matches an unlimited number of substrings that either consist of non-digits, or
+digits enclosed in &#60;&#62;, followed by either ! or ?. When it matches, it runs
+quickly. However, if it is applied to
+</P>
+<P>
+<PRE>
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+</PRE>
+</P>
+<P>
+it takes a long time before reporting failure. This is because the string can
+be divided between the two repeats in a large number of ways, and all have to
+be tried. (The example used [!?] rather than a single character at the end,
+because both PCRE and Perl have an optimization that allows for fast failure
+when a single character is used. They remember the last single character that
+is required for a match, and fail early if it is not present in the string.)
+If the pattern is changed to
+</P>
+<P>
+<PRE>
+ ((?&#62;\D+)|&#60;\d+&#62;)*[!?]
+</PRE>
+</P>
+<P>
+sequences of non-digits cannot be broken, and failure happens quickly.
+</P>
+<LI><A NAME="SEC26" HREF="#TOC1">CONDITIONAL SUBPATTERNS</A>
<P>
It is possible to cause the matching process to obey a subpattern
conditionally or to choose between two alternative subpatterns, depending on
@@ -1872,7 +2074,7 @@ subject is matched against the first alternative; otherwise it is matched
against the second. This pattern matches strings in one of the two forms
dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.
</P>
-<LI><A NAME="SEC26" HREF="#TOC1">COMMENTS</A>
+<LI><A NAME="SEC27" HREF="#TOC1">COMMENTS</A>
<P>
The sequence (?# marks the start of a comment which continues up to the next
closing parenthesis. Nested parentheses are not permitted. The characters
@@ -1883,7 +2085,87 @@ If the PCRE_EXTENDED option is set, an unescaped # character outside a
character class introduces a comment that continues up to the next newline
character in the pattern.
</P>
-<LI><A NAME="SEC27" HREF="#TOC1">PERFORMANCE</A>
+<LI><A NAME="SEC28" HREF="#TOC1">RECURSIVE PATTERNS</A>
+<P>
+Consider the problem of matching a string in parentheses, allowing for
+unlimited nested parentheses. Without the use of recursion, the best that can
+be done is to use a pattern that matches up to some fixed depth of nesting. It
+is not possible to handle an arbitrary nesting depth. Perl 5.6 has provided an
+experimental facility that allows regular expressions to recurse (amongst other
+things). It does this by interpolating Perl code in the expression at run time,
+and the code can refer to the expression itself. A Perl pattern to solve the
+parentheses problem can be created like this:
+</P>
+<P>
+<PRE>
+ $re = qr{\( (?: (?&#62;[^()]+) | (?p{$re}) )* \)}x;
+</PRE>
+</P>
+<P>
+The (?p{...}) item interpolates Perl code at run time, and in this case refers
+recursively to the pattern in which it appears. Obviously, PCRE cannot support
+the interpolation of Perl code. Instead, the special item (?R) is provided for
+the specific case of recursion. This PCRE pattern solves the parentheses
+problem (assume the PCRE_EXTENDED option is set so that white space is
+ignored):
+</P>
+<P>
+<PRE>
+ \( ( (?&#62;[^()]+) | (?R) )* \)
+</PRE>
+</P>
+<P>
+First it matches an opening parenthesis. Then it matches any number of
+substrings which can either be a sequence of non-parentheses, or a recursive
+match of the pattern itself (i.e. a correctly parenthesized substring). Finally
+there is a closing parenthesis.
+</P>
+<P>
+This particular example pattern contains nested unlimited repeats, and so the
+use of a once-only subpattern for matching strings of non-parentheses is
+important when applying the pattern to strings that do not match. For example,
+when it is applied to
+</P>
+<P>
+<PRE>
+ (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
+</PRE>
+</P>
+<P>
+it yields "no match" quickly. However, if a once-only subpattern is not used,
+the match runs for a very long time indeed because there are so many different
+ways the + and * repeats can carve up the subject, and all have to be tested
+before failure can be reported.
+</P>
+<P>
+The values set for any capturing subpatterns are those from the outermost level
+of the recursion at which the subpattern value is set. If the pattern above is
+matched against
+</P>
+<P>
+<PRE>
+ (ab(cd)ef)
+</PRE>
+</P>
+<P>
+the value for the capturing parentheses is "ef", which is the last value taken
+on at the top level. If additional parentheses are added, giving
+</P>
+<P>
+<PRE>
+ \( ( ( (?&#62;[^()]+) | (?R) )* ) \)
+ ^ ^
+ ^ ^
+</PRE>
+then the string they capture is "ab(cd)ef", the contents of the top level
+parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE
+has to obtain extra memory to store data during a recursion, which it does by
+using <B>pcre_malloc</B>, freeing it via <B>pcre_free</B> afterwards. If no
+memory can be obtained, it saves data for the first 15 capturing parentheses
+only, as there is no way to give an out-of-memory error from within a
+recursion.
+</P>
+<LI><A NAME="SEC29" HREF="#TOC1">PERFORMANCE</A>
<P>
Certain items that may appear in patterns are more efficient than others. It is
more efficient to use a character class like [aeiou] than a set of alternatives
@@ -1959,7 +2241,7 @@ with the pattern above. The former gives a failure almost instantly when
applied to a whole line of "a" characters, whereas the latter takes an
appreciable time with strings longer than about 20 characters.
</P>
-<LI><A NAME="SEC28" HREF="#TOC1">AUTHOR</A>
+<LI><A NAME="SEC30" HREF="#TOC1">AUTHOR</A>
<P>
Philip Hazel &#60;ph10@cam.ac.uk&#62;
<BR>
@@ -1972,6 +2254,6 @@ Cambridge CB2 3QG, England.
Phone: +44 1223 334714
</P>
<P>
-Last updated: 29 July 1999
+Last updated: 27 January 2000
<BR>
-Copyright (c) 1997-1999 University of Cambridge.
+Copyright (c) 1997-2000 University of Cambridge.
diff --git a/doc/pcre.txt b/doc/pcre.txt
index 2374f7c..f28ee99 100644
--- a/doc/pcre.txt
+++ b/doc/pcre.txt
@@ -30,6 +30,9 @@ SYNOPSIS
const unsigned char *pcre_maketables(void);
+ int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
+ int what, void *where);
+
int pcre_info(const pcre *code, int *optptr, int *firstcharptr);
char *pcre_version(void);
@@ -46,16 +49,22 @@ DESCRIPTION
lar expression pattern matching using the same syntax and
semantics as Perl 5, with just a few differences (see
below). The current implementation corresponds to Perl
- 5.005.
+ 5.005, with some additional features from the Perl develop-
+ ment release.
PCRE has its own native API, which is described in this
document. There is also a set of wrapper functions that
- correspond to the POSIX API. These are described in the
- pcreposix documentation.
+ correspond to the POSIX regular expression API. These are
+ described in the pcreposix documentation.
+
The native API function prototypes are defined in the header
file pcre.h, and on Unix systems the library itself is
called libpcre.a, so can be accessed by adding -lpcre to the
- command for linking an application which calls it.
+ command for linking an application which calls it. The
+ header file defines the macros PCRE_MAJOR and PCRE_MINOR to
+ contain the major and minor release numbers for the library.
+ Applications can use these to include support for different
+ releases.
The functions pcre_compile(), pcre_study(), and pcre_exec()
are used for compiling and matching regular expressions,
@@ -66,10 +75,12 @@ DESCRIPTION
to build a set of character tables in the current locale for
passing to pcre_compile().
- The function pcre_info() is used to find out information
- about a compiled pattern, while the function pcre_version()
- returns a pointer to a string containing the version of PCRE
- and its date of release.
+ The function pcre_fullinfo() is used to find out information
+ about a compiled pattern; pcre_info() is an obsolete version
+ which returns only some of the available information, but is
+ retained for backwards compatibility. The function
+ pcre_version() returns a pointer to a string containing the
+ version of PCRE and its date of release.
The global variables pcre_malloc and pcre_free initially
contain the entry points of the standard malloc() and free()
@@ -92,6 +103,7 @@ MULTI-THREADING
+
COMPILING A PATTERN
The function pcre_compile() is called to compile a pattern
into an internal form. The pattern is a C string terminated
@@ -187,14 +199,16 @@ COMPILING A PATTERN
PCRE_EXTRA
- This option turns on additional functionality of PCRE that
- is incompatible with Perl. Any backslash in a pattern that
- is followed by a letter that has no special meaning causes
- an error, thus reserving these combinations for future
- expansion. By default, as in Perl, a backslash followed by a
- letter with no special meaning is treated as a literal.
- There are at present no other features controlled by this
- option.
+ This option was invented in order to turn on additional
+ functionality of PCRE that is incompatible with Perl, but it
+ is currently of very little use. When set, any backslash in
+ a pattern that is followed by a letter that has no special
+ meaning causes an error, thus reserving these combinations
+ for future expansion. By default, as in Perl, a backslash
+ followed by a letter with no special meaning is treated as a
+ literal. There are at present no other features controlled
+ by this option. It can also be set by a (?X) option setting
+ within a pattern.
PCRE_MULTILINE
@@ -207,9 +221,9 @@ COMPILING A PATTERN
PCRE_DOLLAR_ENDONLY is set). This is the same as Perl.
When PCRE_MULTILINE is set, the "start of line" and "end
- of line" constructs match immediately following or
- immediately before any newline in the subject string,
- respectively, as well as at the very start and end. This is
+ of line" constructs match immediately following or immedi-
+ ately before any newline in the subject string, respec-
+ tively, as well as at the very start and end. This is
equivalent to Perl's /m option. If there are no "\n" charac-
ters in a subject string, or no occurrences of ^ or $ in a
pattern, setting PCRE_MULTILINE has no effect.
@@ -284,27 +298,63 @@ LOCALE SUPPORT
INFORMATION ABOUT A PATTERN
- The pcre_info() function returns information about a com-
- piled pattern. Its yield is the number of capturing subpat-
- terns, or one of the following negative numbers:
+ The pcre_fullinfo() function returns information about a
+ compiled pattern. It replaces the obsolete pcre_info() func-
+ tion, which is nevertheless retained for backwards compatibil-
+ ity (and is documented below).
+
+ The first argument for pcre_fullinfo() is a pointer to the
+ compiled pattern. The second argument is the result of
+ pcre_study(), or NULL if the pattern was not studied. The
+ third argument specifies which piece of information is
+ required, while the fourth argument is a pointer to a vari-
+ able to receive the data. The yield of the function is zero
+ for success, or one of the following negative numbers:
PCRE_ERROR_NULL the argument code was NULL
+ the argument where was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
+ PCRE_ERROR_BADOPTION the value of what was invalid
- If the optptr argument is not NULL, a copy of the options
- with which the pattern was compiled is placed in the integer
- it points to. These option bits are those specified in the
+ The possible values for the third argument are defined in
+ pcre.h, and are as follows:
+
+ PCRE_INFO_OPTIONS
+
+ Return a copy of the options with which the pattern was com-
+ piled. The fourth argument should point to an unsigned long
+ int variable. These option bits are those specified in the
call to pcre_compile(), modified by any top-level option
settings within the pattern itself, and with the
- PCRE_ANCHORED bit set if the form of the pattern implies
- that it can match only at the start of a subject string.
+ PCRE_ANCHORED bit forcibly set if the form of the pattern
+ implies that it can match only at the start of a subject
+ string.
- If the pattern is not anchored and the firstcharptr argument
- is not NULL, it is used to pass back information about the
- first character of any matched string. If there is a fixed
- first character, e.g. from a pattern such as
+ PCRE_INFO_SIZE
+
+ Return the size of the compiled pattern, that is, the value
+ that was passed as the argument to pcre_malloc() when PCRE
+ was getting memory in which to place the compiled data. The
+ fourth argument should point to a size_t variable.
+
+ PCRE_INFO_CAPTURECOUNT
+
+ Return the number of capturing subpatterns in the pattern.
+ The fourth argument should point to an int variable.
+
+ PCRE_INFO_BACKREFMAX
+
+ Return the number of the highest back reference in the pat-
+ tern. The fourth argument should point to an int variable.
+ Zero is returned if there are no back references.
+
+ PCRE_INFO_FIRSTCHAR
+
+ Return information about the first character of any matched
+ string, for a non-anchored pattern. If there is a fixed
+ first character, e.g. from a pattern such as
(cat|cow|coyote), then it is returned in the integer pointed
- to by firstcharptr. Otherwise, if either
+ to by where. Otherwise, if either
(a) the pattern was compiled with the PCRE_MULTILINE option,
and every branch starts with "^", or
@@ -312,9 +362,48 @@ INFORMATION ABOUT A PATTERN
(b) every branch of the pattern starts with ".*" and
PCRE_DOTALL is not set (if it were set, the pattern would be
anchored),
+
then -1 is returned, indicating that the pattern matches
only at the start of a subject string or after any "\n"
- within the string. Otherwise -2 is returned.
+ within the string. Otherwise -2 is returned. For anchored
+ patterns, -2 is returned.
+
+ PCRE_INFO_FIRSTTABLE
+
+ If the pattern was studied, and this resulted in the con-
+ struction of a 256-bit table indicating a fixed set of char-
+ acters for the first character in any matching string, a
+ pointer to the table is returned. Otherwise NULL is
+ returned. The fourth argument should point to an unsigned
+ char * variable.
+
+ PCRE_INFO_LASTLITERAL
+
+ For a non-anchored pattern, return the value of the right-
+ most literal character which must exist in any matched
+ string, other than at its start. The fourth argument should
+ point to an int variable. If there is no such character, or
+ if the pattern is anchored, -1 is returned. For example, for
+ the pattern /a\d+z\d+/ the returned value is 'z'.
+
+ The pcre_info() function is now obsolete because its inter-
+ face is too restrictive to return all the available data
+ about a compiled pattern. New programs should use
+ pcre_fullinfo() instead. The yield of pcre_info() is the
+ number of capturing subpatterns, or one of the following
+ negative numbers:
+
+ PCRE_ERROR_NULL the argument code was NULL
+ PCRE_ERROR_BADMAGIC the "magic number" was not found
+
+ If the optptr argument is not NULL, a copy of the options
+ with which the pattern was compiled is placed in the integer
+ it points to (see PCRE_INFO_OPTIONS above).
+
+ If the pattern is not anchored and the firstcharptr argument
+ is not NULL, it is used to pass back information about the
+ first character of any matched string (see
+ PCRE_INFO_FIRSTCHAR above).
@@ -640,9 +729,10 @@ DIFFERENCES FROM PERL
6. The Perl \G assertion is not supported as it is not
relevant to single pattern matches.
- 7. Fairly obviously, PCRE does not support the (?{code})
- construction.
-
+ 7. Fairly obviously, PCRE does not support the (?{code}) and
+ (?p{code}) constructions. However, there is some experimen-
+ tal support for recursive patterns using the non-Perl item
+ (?R).
8. There are at the time of writing some oddities in Perl
5.005_02 concerned with the settings of captured strings
when part of a pattern is repeated. For example, matching
@@ -675,9 +765,9 @@ DIFFERENCES FROM PERL
(c) If PCRE_EXTRA is set, a backslash followed by a letter
with no special meaning is faulted.
- (d) If PCRE_UNGREEDY is set, the greediness of the
- repetition quantifiers is inverted, that is, by default they
- are not greedy, but if followed by a question mark they are.
+ (d) If PCRE_UNGREEDY is set, the greediness of the repeti-
+ tion quantifiers is inverted, that is, by default they are
+ not greedy, but if followed by a question mark they are.
(e) PCRE_ANCHORED can be used to force a pattern to be tried
only at the start of the subject.
@@ -685,15 +775,20 @@ DIFFERENCES FROM PERL
(f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options
for pcre_exec() have no Perl equivalents.
+ (g) The (?R) construct allows for recursive pattern matching
+ (Perl 5.6 can do this using the (?p{code}) construct, which
+ PCRE cannot of course support.)
+
REGULAR EXPRESSION DETAILS
The syntax and semantics of the regular expressions sup-
ported by PCRE are described below. Regular expressions are
also described in the Perl documentation and in a number of
+
other books, some of which have copious examples. Jeffrey
Friedl's "Mastering Regular Expressions", published by
- O'Reilly (ISBN 1-56592-257-3), covers them in great detail.
+ O'Reilly (ISBN 1-56592-257), covers them in great detail.
The description here is intended as reference documentation.
A regular expression is a pattern that is matched against a
@@ -780,8 +875,7 @@ BACKSLASH
\f formfeed (hex 0C)
\n newline (hex 0A)
\r carriage return (hex 0D)
-
- tab (hex 09)
+ \t tab (hex 09)
\xhh character with hex code hh
\ddd character with octal code ddd, or backreference
@@ -833,6 +927,7 @@ BACKSLASH
Note that octal values of 100 or greater must not be intro-
duced by a leading zero, because no more than three octal
digits are ever read.
+
All the sequences that define a single byte value can be
used both inside and outside character classes. In addition,
inside a character class, the sequence "\b" is interpreted
@@ -885,6 +980,7 @@ BACKSLASH
These assertions may not appear in character classes (but
note that "\b" has a different meaning, namely the backspace
character, inside a character class).
+
A word boundary is a position in the subject string where
the current character and the previous character do not both
match \w or \W (i.e. one matches \w and the other matches
@@ -1046,6 +1142,44 @@ SQUARE BRACKETS
+POSIX CHARACTER CLASSES
+ Perl 5.6 (not yet released at the time of writing) is going
+ to support the POSIX notation for character classes, which
+ uses names enclosed by [: and :] within the enclosing
+ square brackets. PCRE supports this notation. For example,
+
+ [01[:alpha:]%]
+
+ matches "0", "1", any alphabetic character, or "%". The sup-
+ ported class names are
+
+ alnum letters and digits
+ alpha letters
+ ascii character codes 0 - 127
+ cntrl control characters
+ digit decimal digits (same as \d)
+ graph printing characters, excluding space
+ lower lower case letters
+ print printing characters, including space
+ punct printing characters, excluding letters and digits
+ space white space (same as \s)
+ upper upper case letters
+ word "word" characters (same as \w)
+ xdigit hexadecimal digits
+
+ The names "ascii" and "word" are Perl extensions. Another
+ Perl extension is negation, which is indicated by a ^ char-
+ acter after the colon. For example,
+
+ [12[:^digit:]]
+
+ matches "1", "2", or any non-digit. PCRE (and Perl) also
+ recognize the POSIX syntax [.ch.] and [=ch=] where "ch" is a
+ "collating element", but these are not supported, and an
+ error is given if they are encountered.
+
+
+
VERTICAL BAR
Vertical bar characters are used to separate alternative
patterns. For example, the pattern
@@ -1197,7 +1331,6 @@ REPETITION
Repetition is specified by quantifiers, which can follow any
of the following items:
-
a single character, possibly escaped
the . metacharacter
a character class
@@ -1384,8 +1517,8 @@ BACK REFERENCES
A back reference that occurs inside the parentheses to which
it refers fails when the subpattern is first used, so, for
example, (a\1) never matches. However, such references can
- be useful inside repeated subpatterns. For example, the pat-
- tern
+ be useful inside repeated subpatterns. For example, the
+ pattern
(a|b\1)+
@@ -1407,6 +1540,7 @@ ASSERTIONS
cated assertions are coded as subpatterns. There are two
kinds: those that look ahead of the current position in the
subject string, and those that look behind it.
+
An assertion subpattern is matched in the normal way, except
that it does not cause the current matching position to be
changed. Lookahead assertions start with (?= for positive
@@ -1572,20 +1706,19 @@ ONCE-ONLY SUBPATTERNS
abcd$
- when applied to a long string which does not match it.
- Because matching proceeds from left to right, PCRE will look
- for each "a" in the subject and then see if what follows
- matches the rest of the pattern. If the pattern is specified
- as
+ when applied to a long string which does not match. Because
+ matching proceeds from left to right, PCRE will look for
+ each "a" in the subject and then see if what follows matches
+ the rest of the pattern. If the pattern is specified as
^.*abcd$
then the initial .* matches the entire string at first, but
- when this fails, it backtracks to match all but the last
- character, then all but the last two characters, and so on.
- Once again the search for "a" covers the entire string, from
- right to left, so we are no better off. However, if the pat-
- tern is written as
+ when this fails (because there is no following "a"), it
+ backtracks to match all but the last character, then all but
+ the last two characters, and so on. Once again the search
+ for "a" covers the entire string, from right to left, so we
+ are no better off. However, if the pattern is written as
^(?>.*)(?<=abcd)
@@ -1596,6 +1729,36 @@ ONCE-ONLY SUBPATTERNS
this approach makes a significant difference to the process-
ing time.
+ When a pattern contains an unlimited repeat inside a subpat-
+ tern that can itself be repeated an unlimited number of
+ times, the use of a once-only subpattern is the only way to
+ avoid some failing matches taking a very long time indeed.
+ The pattern
+
+ (\D+|<\d+>)*[!?]
+
+ matches an unlimited number of substrings that either con-
+ sist of non-digits, or digits enclosed in <>, followed by
+ either ! or ?. When it matches, it runs quickly. However, if
+ it is applied to
+
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+
+ it takes a long time before reporting failure. This is
+ because the string can be divided between the two repeats in
+ a large number of ways, and all have to be tried. (The exam-
+ ple used [!?] rather than a single character at the end,
+ because both PCRE and Perl have an optimization that allows
+ for fast failure when a single character is used. They
+ remember the last single character that is required for a
+ match, and fail early if it is not present in the string.)
+ If the pattern is changed to
+
+ ((?>\D+)|<\d+>)*[!?]
+
+ sequences of non-digits cannot be broken, and failure hap-
+ pens quickly.
+
CONDITIONAL SUBPATTERNS
@@ -1668,6 +1831,75 @@ COMMENTS
+RECURSIVE PATTERNS
+ Consider the problem of matching a string in parentheses,
+ allowing for unlimited nested parentheses. Without the use
+ of recursion, the best that can be done is to use a pattern
+ that matches up to some fixed depth of nesting. It is not
+ possible to handle an arbitrary nesting depth. Perl 5.6 has
+ provided an experimental facility that allows regular
+ expressions to recurse (amongst other things). It does this
+ by interpolating Perl code in the expression at run time,
+ and the code can refer to the expression itself. A Perl pat-
+ tern to solve the parentheses problem can be created like
+ this:
+
+ $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
+
+ The (?p{...}) item interpolates Perl code at run time, and
+ in this case refers recursively to the pattern in which it
+ appears. Obviously, PCRE cannot support the interpolation of
+ Perl code. Instead, the special item (?R) is provided for
+ the specific case of recursion. This PCRE pattern solves the
+ parentheses problem (assume the PCRE_EXTENDED option is set
+ so that white space is ignored):
+
+ \( ( (?>[^()]+) | (?R) )* \)
+
+ First it matches an opening parenthesis. Then it matches any
+ number of substrings which can either be a sequence of non-
+ parentheses, or a recursive match of the pattern itself
+ (i.e. a correctly parenthesized substring). Finally there is
+ a closing parenthesis.
+
+ This particular example pattern contains nested unlimited
+ repeats, and so the use of a once-only subpattern for match-
+ ing strings of non-parentheses is important when applying
+ the pattern to strings that do not match. For example, when
+ it is applied to
+
+ (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
+
+ it yields "no match" quickly. However, if a once-only sub-
+ pattern is not used, the match runs for a very long time
+ indeed because there are so many different ways the + and *
+ repeats can carve up the subject, and all have to be tested
+ before failure can be reported.
+
+ The values set for any capturing subpatterns are those from
+ the outermost level of the recursion at which the subpattern
+ value is set. If the pattern above is matched against
+
+ (ab(cd)ef)
+
+ the value for the capturing parentheses is "ef", which is
+ the last value taken on at the top level. If additional
+ parentheses are added, giving
+
+ \( ( ( (?>[^()]+) | (?R) )* ) \)
+ ^ ^
+ ^ ^ then the string they capture
+ is "ab(cd)ef", the contents of the top level parentheses. If
+ there are more than 15 capturing parentheses in a pattern,
+ PCRE has to obtain extra memory to store data during a
+ recursion, which it does by using pcre_malloc, freeing it
+ via pcre_free afterwards. If no memory can be obtained, it
+ saves data for the first 15 capturing parentheses only, as
+ there is no way to give an out-of-memory error from within a
+ recursion.
+
+
+
PERFORMANCE
Certain items that may appear in patterns are more efficient
than others. It is more efficient to use a character class
@@ -1742,5 +1974,5 @@ AUTHOR
Cambridge CB2 3QG, England.
Phone: +44 1223 334714
- Last updated: 29 July 1999
- Copyright (c) 1997-1999 University of Cambridge.
+ Last updated: 27 January 2000
+ Copyright (c) 1997-2000 University of Cambridge.
diff --git a/doc/pcreposix.3 b/doc/pcreposix.3
index 0a40369..1be5d9a 100644
--- a/doc/pcreposix.3
+++ b/doc/pcreposix.3
@@ -36,11 +36,11 @@ can be accessed by adding \fB-lpcreposix\fR to the command for linking an
application which uses them. Because the POSIX functions call the native ones,
it is also necessary to add \fR-lpcre\fR.
-As I am pretty ignorant about POSIX, these functions must be considered as
-experimental. I have implemented only those option bits that can be reasonably
-mapped to PCRE native options. Other POSIX options are not even defined. It may
-be that it is useful to define, but ignore, other options. Feedback from more
-knowledgeable folk may cause this kind of detail to change.
+I have implemented only those option bits that can be reasonably mapped to PCRE
+native options. In addition, the options REG_EXTENDED and REG_NOSUB are defined
+with the value zero. They have no effect, but since programs that are written
+to the POSIX interface often use them, this makes it easier to slot in PCRE as
+a replacement library. Other POSIX options are not even defined.
When PCRE is called via these functions, it is only the API that is POSIX-like
in style. The syntax and semantics of the regular expressions themselves are
diff --git a/doc/pcreposix.html b/doc/pcreposix.html
index 2c764b6..121d90f 100644
--- a/doc/pcreposix.html
+++ b/doc/pcreposix.html
@@ -55,11 +55,11 @@ application which uses them. Because the POSIX functions call the native ones,
it is also necessary to add \fR-lpcre\fR.
</P>
<P>
-As I am pretty ignorant about POSIX, these functions must be considered as
-experimental. I have implemented only those option bits that can be reasonably
-mapped to PCRE native options. Other POSIX options are not even defined. It may
-be that it is useful to define, but ignore, other options. Feedback from more
-knowledgeable folk may cause this kind of detail to change.
+I have implemented only those option bits that can be reasonably mapped to PCRE
+native options. In addition, the options REG_EXTENDED and REG_NOSUB are defined
+with the value zero. They have no effect, but since programs that are written
+to the POSIX interface often use them, this makes it easier to slot in PCRE as
+a replacement library. Other POSIX options are not even defined.
</P>
<P>
When PCRE is called via these functions, it is only the API that is POSIX-like
diff --git a/doc/pcreposix.txt b/doc/pcreposix.txt
index c85fb84..4a7036f 100644
--- a/doc/pcreposix.txt
+++ b/doc/pcreposix.txt
@@ -34,13 +34,13 @@ DESCRIPTION
which uses them. Because the POSIX functions call the native
ones, it is also necessary to add -lpcre.
- As I am pretty ignorant about POSIX, these functions must be
- considered as experimental. I have implemented only those
- option bits that can be reasonably mapped to PCRE native
- options. Other POSIX options are not even defined. It may be
- that it is useful to define, but ignore, other options.
- Feedback from more knowledgeable folk may cause this kind of
- detail to change.
+ I have implemented only those option bits that can be rea-
+ sonably mapped to PCRE native options. In addition, the
+ options REG_EXTENDED and REG_NOSUB are defined with the
+ value zero. They have no effect, but since programs that are
+ written to the POSIX interface often use them, this makes it
+ easier to slot in PCRE as a replacement library. Other POSIX
+ options are not even defined.
When PCRE is called via these functions, it is only the API
that is POSIX-like in style. The syntax and semantics of the
diff --git a/doc/pcretest.txt b/doc/pcretest.txt
index 29e2f5c..831fdac 100644
--- a/doc/pcretest.txt
+++ b/doc/pcretest.txt
@@ -7,20 +7,23 @@ experimenting with regular expressions.
If it is given two filename arguments, it reads from the first and writes to
the second. If it is given only one filename argument, it reads from that file
and writes to stdout. Otherwise, it reads from stdin and writes to stdout, and
-prompts for each line of input.
+prompts for each line of input, using "re>" to prompt for regular expressions,
+and "data>" to prompt for data lines.
The program handles any number of sets of input on a single input file. Each
set starts with a regular expression, and continues with any number of data
lines to be matched against the pattern. An empty line signals the end of the
-set. The regular expressions are given enclosed in any non-alphameric
-delimiters other than backslash, for example
+data lines, at which point a new regular expression is read. The regular
+expressions are given enclosed in any non-alphameric delimiters other than
+backslash, for example
/(a|bc)x+yz/
White space before the initial delimiter is ignored. A regular expression may
be continued over several input lines, in which case the newline characters are
-included within it. See the testinput files for many examples. It is possible
-to include the delimiter within the pattern by escaping it, for example
+included within it. See the test input files in the testdata directory for many
+examples. It is possible to include the delimiter within the pattern by
+escaping it, for example
/abc\/def/
@@ -85,9 +88,9 @@ is, /L applies only to the expression on which it appears.
The /I modifier requests that pcretest output information about the compiled
expression (whether it is anchored, has a fixed first character, and so on). It
-does this by calling pcre_info() after compiling an expression, and outputting
-the information it gets back. If the pattern is studied, the results of that
-are also output.
+does this by calling pcre_fullinfo() after compiling an expression, and
+outputting the information it gets back. If the pattern is studied, the results
+of that are also output.
The /D modifier is a PCRE debugging feature, which also assumes /I. It causes
the internal form of compiled regular expressions to be output after
diff --git a/internal.h b/internal.h
index 5c782ac..91ff301 100644
--- a/internal.h
+++ b/internal.h
@@ -9,7 +9,7 @@ the file Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
- Copyright (c) 1997-1999 University of Cambridge
+ Copyright (c) 1997-2000 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
@@ -40,9 +40,9 @@ modules, but which are not relevant to the outside. */
#include "config.h"
/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
-define a macro for memmove() if HAVE_MEMMOVE is not defined. */
+define a macro for memmove() if HAVE_MEMMOVE is false. */
-#ifndef HAVE_MEMMOVE
+#if ! HAVE_MEMMOVE
#undef memmove /* some systems may have a macro */
#define memmove(a, b, c) bcopy(b, a, c)
#endif
@@ -188,6 +188,7 @@ enum {
OP_CLASS, /* Match a character class */
OP_REF, /* Match a back reference */
+ OP_RECURSE, /* Match this pattern recursively */
OP_ALT, /* Start of alternation */
OP_KET, /* End of group that doesn't have an unbounded repeat */
@@ -254,6 +255,9 @@ just to accommodate the POSIX wrapper. */
#define ERR26 "malformed number after (?("
#define ERR27 "conditional group contains more than two branches"
#define ERR28 "assertion expected after (?("
+#define ERR29 "(?p must be followed by )"
+#define ERR30 "unknown POSIX class name"
+#define ERR31 "POSIX collating elements are not supported"
/* All character handling must be done as unsigned characters. Otherwise there
are problems with top-bit-set characters and functions such as isspace().
@@ -269,6 +273,7 @@ runs on as long as necessary after the end. */
typedef struct real_pcre {
unsigned long int magic_number;
+ size_t size;
const unsigned char *tables;
unsigned long int options;
uschar top_bracket;
@@ -311,11 +316,12 @@ typedef struct match_data {
BOOL noteol; /* NOTEOL flag */
BOOL endonly; /* Dollar not before final \n */
BOOL notempty; /* Empty string match not wanted */
+ const uschar *start_pattern; /* For use when recursing */
const uschar *start_subject; /* Start of the subject string */
const uschar *end_subject; /* End of the subject string */
const uschar *start_match; /* Start of this match attempt */
const uschar *end_match_ptr; /* Subject position at end match */
- int end_offset_top; /* Highwater mark at end of match */
+ int end_offset_top; /* Highwater mark at end of match */
} match_data;
/* Bit definitions for entries in the pcre_ctypes table. */
@@ -328,12 +334,19 @@ typedef struct match_data {
#define ctype_meta 0x80 /* regexp meta char or zero (end pattern) */
/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
-of bits for a class map. */
-
-#define cbit_digit 0 /* for \d */
-#define cbit_word 32 /* for \w */
-#define cbit_space 64 /* for \s */
-#define cbit_length 96 /* Length of the cbits table */
+of bits for a class map. Some classes are built by combining these tables. */
+
+#define cbit_space 0 /* [:space:] or \s */
+#define cbit_xdigit 32 /* [:xdigit:] */
+#define cbit_digit 64 /* [:digit:] or \d */
+#define cbit_upper 96 /* [:upper:] */
+#define cbit_lower 128 /* [:lower:] */
+#define cbit_word 160 /* [:word:] or \w */
+#define cbit_graph 192 /* [:graph:] */
+#define cbit_print 224 /* [:print:] */
+#define cbit_punct 256 /* [:punct:] */
+#define cbit_cntrl 288 /* [:cntrl:] */
+#define cbit_length 320 /* Length of the cbits table */
/* Offsets of the various tables from the base tables pointer, and
total length. */
diff --git a/maketables.c b/maketables.c
index eb5fcd1..c0f06c0 100644
--- a/maketables.c
+++ b/maketables.c
@@ -8,7 +8,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by: Philip Hazel <ph10@cam.ac.uk>
- Copyright (c) 1997-1999 University of Cambridge
+ Copyright (c) 1997-2000 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
@@ -81,15 +81,34 @@ for (i = 0; i < 256; i++) *p++ = tolower(i);
for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i);
-/* Then the character class tables */
+/* Then the character class tables. Don't try to be clever and save effort
+on exclusive ones - in some locales things may be different. */
memset(p, 0, cbit_length);
for (i = 0; i < 256; i++)
{
- if (isdigit(i)) p[cbit_digit + i/8] |= 1 << (i&7);
- if (isalnum(i) || i == '_')
- p[cbit_word + i/8] |= 1 << (i&7);
+ if (isdigit(i))
+ {
+ p[cbit_digit + i/8] |= 1 << (i&7);
+ p[cbit_word + i/8] |= 1 << (i&7);
+ }
+ if (isupper(i))
+ {
+ p[cbit_upper + i/8] |= 1 << (i&7);
+ p[cbit_word + i/8] |= 1 << (i&7);
+ }
+ if (islower(i))
+ {
+ p[cbit_lower + i/8] |= 1 << (i&7);
+ p[cbit_word + i/8] |= 1 << (i&7);
+ }
+ if (i == '_') p[cbit_word + i/8] |= 1 << (i&7);
if (isspace(i)) p[cbit_space + i/8] |= 1 << (i&7);
+ if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7);
+ if (isgraph(i)) p[cbit_graph + i/8] |= 1 << (i&7);
+ if (isprint(i)) p[cbit_print + i/8] |= 1 << (i&7);
+ if (ispunct(i)) p[cbit_punct + i/8] |= 1 << (i&7);
+ if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1 << (i&7);
}
p += cbit_length;
diff --git a/pcre-config.in b/pcre-config.in
new file mode 100644
index 0000000..8daded9
--- /dev/null
+++ b/pcre-config.in
@@ -0,0 +1,59 @@
+#!/bin/sh
+# pcre-config: print the configured prefix, version, and compile/link flags.
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+exec_prefix_set=no # becomes yes once --exec-prefix=VALUE has been seen
+
+usage="\
+Usage: pcre-config [--prefix] [--exec-prefix] [--version] [--libs] [--libs-posix] [--cflags] [--cflags-posix]"
+
+if test $# -eq 0; then
+ echo "${usage}" 1>&2
+ exit 1
+fi
+# Process every argument in order; each query option echoes its own line.
+while test $# -gt 0; do
+ case "$1" in
+ -*=*) optarg=`echo "$1" | sed 's/[-_a-zA-Z0-9]*=//'` ;; # text after the "="
+ *) optarg= ;;
+ esac
+
+ case $1 in
+ --prefix=*)
+ prefix=$optarg
+ if test $exec_prefix_set = no ; then # exec_prefix follows prefix unless set explicitly
+ exec_prefix=$optarg
+ fi
+ ;;
+ --prefix)
+ echo $prefix
+ ;;
+ --exec-prefix=*)
+ exec_prefix=$optarg
+ exec_prefix_set=yes
+ ;;
+ --exec-prefix)
+ echo $exec_prefix
+ ;;
+ --version)
+ echo @PCRE_VERSION@
+ ;;
+ --cflags | --cflags-posix) # both options print the same include flag
+ if test @includedir@ != /usr/include ; then # no -I flag is printed for /usr/include
+ includes=-I@includedir@
+ fi
+ echo $includes
+ ;;
+ --libs-posix)
+ echo -L@libdir@ -lpcreposix -lpcre
+ ;;
+ --libs)
+ echo -L@libdir@ -lpcre
+ ;;
+ *) # anything unrecognized: print usage on stderr and fail
+ echo "${usage}" 1>&2
+ exit 1
+ ;;
+ esac
+ shift
+done
diff --git a/pcre.c b/pcre.c
index 6735b82..e45dee8 100644
--- a/pcre.c
+++ b/pcre.c
@@ -9,7 +9,7 @@ the file Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
- Copyright (c) 1997-1999 University of Cambridge
+ Copyright (c) 1997-2000 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
@@ -82,7 +82,7 @@ static const char *OP_names[] = {
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{",
- "class", "Ref",
+ "class", "Ref", "Recurse",
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
"AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
"Brazero", "Braminzero", "Bra"
@@ -107,6 +107,38 @@ static const short int escapes[] = {
0, 0, -ESC_z /* x - z */
};
+/* Tables of names of POSIX character classes and their lengths. The lengths
+list ends with a zero entry. The first three must be alpha, lower, upper, in
+that order, as this is assumed for handling case independence. */
+
+static const char *posix_names[] = {
+ "alpha", "lower", "upper",
+ "alnum", "ascii", "cntrl", "digit", "graph",
+ "print", "punct", "space", "word", "xdigit" };
+
+static const uschar posix_name_lengths[] = {
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
+
+/* Table of class bit maps for each POSIX class: three entries per class, in
+the same order as posix_names. A -1 entry marks an unused slot. */
+
+static const int posix_class_maps[] = {
+ cbit_lower, cbit_upper, -1, /* alpha */
+ cbit_lower, -1, -1, /* lower */
+ cbit_upper, -1, -1, /* upper */
+ cbit_digit, cbit_lower, cbit_upper, /* alnum */
+ cbit_print, cbit_cntrl, -1, /* ascii */
+ cbit_cntrl, -1, -1, /* cntrl */
+ cbit_digit, -1, -1, /* digit */
+ cbit_graph, -1, -1, /* graph */
+ cbit_print, -1, -1, /* print */
+ cbit_punct, -1, -1, /* punct */
+ cbit_space, -1, -1, /* space */
+ cbit_word, -1, -1, /* word */
+ cbit_xdigit,-1, -1 /* xdigit */
+};
+
+
/* Definition to allow mutual recursion */
static BOOL
@@ -161,12 +193,13 @@ return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
/*************************************************
-* Return info about a compiled pattern *
+* (Obsolete) Return info about compiled pattern *
*************************************************/
-/* This function picks potentially useful data out of the private
-structure. The public options are passed back in an int - though the
-re->options field has been expanded to a long int, all the public options
+/* This is the original "info" function. It picks potentially useful data out
+of the private structure, but its interface was too rigid. It remains for
+backwards compatibility. The public options are passed back in an int - though
+the re->options field has been expanded to a long int, all the public options
at the low end of it, and so even on 16-bit systems this will still be OK.
Therefore, I haven't changed the API for pcre_info().
@@ -177,7 +210,7 @@ Arguments:
or -1 if multiline and all branches start ^,
or -2 otherwise
-Returns: number of identifying extraction brackets
+Returns: number of capturing subpatterns
or negative values on error
*/
@@ -196,6 +229,74 @@ return re->top_bracket;
+/*************************************************
+* Return info about compiled pattern *
+*************************************************/
+
+/* This is a newer "info" function which has an extensible interface so
+that additional items can be added compatibly.
+
+Arguments:
+ external_re points to compiled code; must not be NULL
+ study_data points to study data, or NULL
+ what selects the item to return (a PCRE_INFO_xxx value)
+ where where to put the information; must not be NULL
+
+Returns: 0 if data returned, negative (a PCRE_ERROR_xxx value) on error
+*/
+
+int
+pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
+ void *where)
+{
+const real_pcre *re = (const real_pcre *)external_re;
+const real_pcre_extra *study = (const real_pcre_extra *)study_data;
+
+if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
+if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
+
+switch (what)
+ {
+ case PCRE_INFO_OPTIONS: /* where -> unsigned long int */
+ *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
+ break;
+
+ case PCRE_INFO_SIZE: /* where -> size_t */
+ *((size_t *)where) = re->size;
+ break;
+
+ case PCRE_INFO_CAPTURECOUNT: /* where -> int */
+ *((int *)where) = re->top_bracket;
+ break;
+
+ case PCRE_INFO_BACKREFMAX: /* where -> int */
+ *((int *)where) = re->top_backref;
+ break;
+
+ case PCRE_INFO_FIRSTCHAR: /* where -> int: first char, or -1 if PCRE_STARTLINE is set, else -2 */
+ *((int *)where) =
+ ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
+ ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
+ break;
+
+ case PCRE_INFO_FIRSTTABLE: /* where -> const uschar *: NULL unless study data has PCRE_STUDY_MAPPED */
+ *((const uschar **)where) =
+ (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
+ study->start_bits : NULL;
+ break;
+
+ case PCRE_INFO_LASTLITERAL: /* where -> int: req_char, or -1 if PCRE_REQCHSET is not set */
+ *((int *)where) =
+ ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
+ break;
+
+ default: return PCRE_ERROR_BADOPTION; /* unrecognized "what"; *where is left untouched */
+ }
+
+return 0;
+}
+
+
#ifdef DEBUG
/*************************************************
@@ -255,9 +356,9 @@ check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
int options, BOOL isclass, compile_data *cd)
{
const uschar *ptr = *ptrptr;
-int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
-int i;
+int c, i;
+c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
if (c == 0) *errorptr = ERR1;
/* Digits or letters may have special meaning; all others are literals. */
@@ -622,6 +723,71 @@ for (;;)
/*************************************************
+* Check for POSIX class syntax *
+*************************************************/
+
+/* This function is called when the sequence "[:" or "[." or "[=" is
+encountered in a character class. It checks whether this is followed by an
+optional ^ and then a sequence of letters, terminated by a matching ":]" or
+".]" or "=]". The name itself is not looked up here.
+
+Arguments:
+ ptr pointer to the initial [
+ endptr on success, set to point at the terminator (the char before ])
+ cd pointer to compile data (its ctypes table identifies letters)
+
+Returns: TRUE if the syntax matches (*endptr is set), FALSE otherwise
+*/
+
+static BOOL
+check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
+{
+int terminator; /* Don't combine these lines; the Solaris cc */
+terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
+if (*(++ptr) == '^') ptr++; /* skip the optional negation marker */
+while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++; /* skip the run of letters */
+if (*ptr == terminator && ptr[1] == ']')
+ {
+ *endptr = ptr;
+ return TRUE;
+ }
+return FALSE;
+}
+
+
+
+
+/*************************************************
+* Check POSIX class name *
+*************************************************/
+
+/* This function is called to check the name given in a POSIX-style class entry
+such as [:alnum:]. The comparison is case-sensitive (strncmp).
+
+Arguments:
+ ptr points to the first letter (only len bytes are compared)
+ len the length of the name
+
+Returns: the index of the name in posix_names, or -1 if unknown
+*/
+
+static int
+check_posix_name(const uschar *ptr, int len)
+{
+register int yield = 0;
+while (posix_name_lengths[yield] != 0) /* a zero length ends the table */
+ {
+ if (len == posix_name_lengths[yield] &&
+ strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
+ yield++;
+ }
+return -1;
+}
+
+
+
+
+/*************************************************
* Compile one branch *
*************************************************/
@@ -764,6 +930,66 @@ for (;; ptr++)
goto FAILED;
}
+ /* Handle POSIX class names. Perl allows a negation extension of the
+ form [:^name:]. A square bracket that doesn't match the syntax is
+ treated as a literal. We also recognize the POSIX constructions
+ [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
+ 5.6 does. */
+
+ if (c == '[' &&
+ (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
+ check_posix_syntax(ptr, &tempptr, cd))
+ {
+ BOOL local_negate = FALSE;
+ int posix_class, i;
+ register const uschar *cbits = cd->cbits;
+
+ if (ptr[1] != ':')
+ {
+ *errorptr = ERR31;
+ goto FAILED;
+ }
+
+ ptr += 2;
+ if (*ptr == '^')
+ {
+ local_negate = TRUE;
+ ptr++;
+ }
+
+ posix_class = check_posix_name(ptr, tempptr - ptr);
+ if (posix_class < 0)
+ {
+ *errorptr = ERR30;
+ goto FAILED;
+ }
+
+ /* If matching is caseless, upper and lower are converted to
+ alpha. This relies on the fact that the class table starts with
+ alpha, lower, upper as the first 3 entries. */
+
+ if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
+ posix_class = 0;
+
+ /* Or into the map we are building up to 3 of the static class
+ tables, or their negations. */
+
+ posix_class *= 3;
+ for (i = 0; i < 3; i++)
+ {
+ int taboffset = posix_class_maps[posix_class + i];
+ if (taboffset < 0) break;
+ if (local_negate)
+ for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
+ else
+ for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
+ }
+
+ ptr = tempptr + 1;
+ class_charcount = 10; /* Set > 1; assumes more than 1 per class */
+ continue;
+ }
+
/* Backslash may introduce a single character, or it may introduce one
of the specials, which just set a flag. Escaped items are checked for
validity in the pre-compiling pass. The sequence \b is a special case.
@@ -791,13 +1017,11 @@ for (;; ptr++)
continue;
case ESC_w:
- for (c = 0; c < 32; c++)
- class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
+ for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
continue;
case ESC_W:
- for (c = 0; c < 32; c++)
- class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
+ for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
continue;
case ESC_s:
@@ -1360,6 +1584,11 @@ for (;; ptr++)
ptr++;
break;
+ case 'R': /* Pattern recursion */
+ *code++ = OP_RECURSE;
+ ptr++;
+ continue;
+
default: /* Option setting */
set = unset = 0;
optset = &set;
@@ -2015,12 +2244,13 @@ pcre_compile(const char *pattern, int options, const char **errorptr,
real_pcre *re;
int length = 3; /* For initial BRA plus length */
int runlength;
-int c, size, reqchar, countlits;
+int c, reqchar, countlits;
int bracount = 0;
int top_backref = 0;
int branch_extra = 0;
int branch_newextra;
unsigned int brastackptr = 0;
+size_t size;
uschar *code;
const uschar *ptr;
compile_data compile_block;
@@ -2248,6 +2478,19 @@ while ((c = *(++ptr)) != 0)
ptr += 2;
break;
+ /* A recursive call to the regex is an extension, to provide the
+ facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
+
+ case 'R':
+ if (ptr[3] != ')')
+ {
+ *errorptr = ERR29;
+ goto PCRE_ERROR_RETURN;
+ }
+ ptr += 3;
+ length += 1;
+ break;
+
/* Lookbehinds are in Perl from version 5.005 */
case '<':
@@ -2550,9 +2793,10 @@ if (re == NULL)
return NULL;
}
-/* Put in the magic number and the options. */
+/* Put in the magic number, and save the size, options, and table pointer */
re->magic_number = MAGIC_NUMBER;
+re->size = size;
re->options = options;
re->tables = tables;
@@ -3147,6 +3391,53 @@ for (;;)
ecode += 3;
break;
+ /* Recursion matches the current regex, nested. If there are any capturing
+ brackets started but not finished, we have to save their starting points
+ and reinstate them after the recursion. However, we don't know how many
+ such there are (offset_top records the completed total) so we just have
+ to save all the potential data. There may be up to 99 such values, which
+ is a bit large to put on the stack, but using malloc for small numbers
+ seems expensive. As a compromise, the stack is used when there are fewer
+ than 16 values to store; otherwise malloc is used. A problem is what to do
+ if the malloc fails ... there is no way of returning to the top level with
+ an error. Save the top 15 values on the stack, and accept that the rest
+ may be wrong. */
+
+ case OP_RECURSE:
+ {
+ BOOL rc;
+ int *save;
+ int stacksave[16]; /* entries 1..15 are used below; entry 0 is unused */
+
+ c = md->offset_max;
+
+ if (c < 16) save = stacksave; else
+ {
+ save = (int *)(pcre_malloc)((c+1) * sizeof(int));
+ if (save == NULL)
+ {
+ save = stacksave;
+ c = 15; /* no memory: save only the top 15 values (save[1..15]) */
+ }
+ }
+
+ for (i = 1; i <= c; i++)
+ save[i] = md->offset_vector[md->offset_end - i];
+ rc = match(eptr, md->start_pattern, offset_top, md, ims, FALSE, eptrb);
+ for (i = 1; i <= c; i++)
+ md->offset_vector[md->offset_end - i] = save[i];
+ if (save != stacksave) (pcre_free)(save);
+ if (!rc) return FALSE;
+
+ /* In case the recursion has set more capturing values, save the final
+ number, then move along the subject till after the recursive match,
+ and advance one byte in the pattern code. */
+
+ offset_top = md->end_offset_top;
+ eptr = md->end_match_ptr;
+ ecode++;
+ }
+ break;
/* "Once" brackets are like assertion brackets except that after a match,
the point in the subject string is not moved back. Thus there can never be
@@ -4216,6 +4507,7 @@ if (re == NULL || subject == NULL ||
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
+match_block.start_pattern = re->code;
match_block.start_subject = (const uschar *)subject;
match_block.end_subject = match_block.start_subject + length;
end_subject = match_block.end_subject;
diff --git a/pcre.h b/pcre.in
index 4888b45..74b0cfc 100644
--- a/pcre.h
+++ b/pcre.in
@@ -2,14 +2,14 @@
* Perl-Compatible Regular Expressions *
*************************************************/
-/* Copyright (c) 1997-1999 University of Cambridge */
+/* Copyright (c) 1997-2000 University of Cambridge */
#ifndef _PCRE_H
#define _PCRE_H
-#define PCRE_MAJOR 2
-#define PCRE_MINOR 08
-#define PCRE_DATE 31-Aug-1999
+#define PCRE_MAJOR @PCRE_MAJOR@
+#define PCRE_MINOR @PCRE_MINOR@
+#define PCRE_DATE @PCRE_DATE@
/* Win32 uses DLL by default */
@@ -59,6 +59,16 @@ extern "C" {
#define PCRE_ERROR_NOMEMORY (-6)
#define PCRE_ERROR_NOSUBSTRING (-7)
+/* Request types for pcre_fullinfo() */
+
+#define PCRE_INFO_OPTIONS 0
+#define PCRE_INFO_SIZE 1
+#define PCRE_INFO_CAPTURECOUNT 2
+#define PCRE_INFO_BACKREFMAX 3
+#define PCRE_INFO_FIRSTCHAR 4
+#define PCRE_INFO_FIRSTTABLE 5
+#define PCRE_INFO_LASTLITERAL 6
+
/* Types */
typedef void pcre;
@@ -83,6 +93,7 @@ extern int pcre_exec(const pcre *, const pcre_extra *, const char *,
extern int pcre_get_substring(const char *, int *, int, int, const char **);
extern int pcre_get_substring_list(const char *, int *, int, const char ***);
extern int pcre_info(const pcre *, int *, int *);
+extern int pcre_fullinfo(const pcre *, const pcre_extra *, int, void *);
extern unsigned const char *pcre_maketables(void);
extern pcre_extra *pcre_study(const pcre *, int, const char **);
extern const char *pcre_version(void);
diff --git a/pcreposix.c b/pcreposix.c
index 12606af..7c66cce 100644
--- a/pcreposix.c
+++ b/pcreposix.c
@@ -12,7 +12,7 @@ functions.
Written by: Philip Hazel <ph10@cam.ac.uk>
- Copyright (c) 1997-1999 University of Cambridge
+ Copyright (c) 1997-2000 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
@@ -46,7 +46,8 @@ restrictions:
static const char *estring[] = {
ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, ERR10,
ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
- ERR21, ERR22, ERR23, ERR24, ERR25 };
+ ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
+ ERR31 }; /* one entry per ERRn, in ascending order, no gaps or repeats */
static int eint[] = {
REG_EESCAPE, /* "\\ at end of pattern" */
@@ -76,7 +77,10 @@ static int eint[] = {
REG_BADPAT, /* "lookbehind assertion is not fixed length" */
REG_BADPAT, /* "malformed number after (?(" */
- REG_BADPAT, /* "conditional group containe more than two branches" */
+ REG_BADPAT, /* "conditional group contains more than two branches" */
- REG_BADPAT /* "assertion expected after (?(" */
+ REG_BADPAT, /* "assertion expected after (?(" */
+ REG_BADPAT, /* "(?p must be followed by )" */
+ REG_ECTYPE, /* "unknown POSIX class name" */
+ REG_BADPAT /* "POSIX collating elements are not supported" */
};
/* Table of texts corresponding to POSIX error codes */
@@ -231,7 +235,7 @@ preg->re_erroffset = (size_t)(-1); /* Only has meaning after compile */
if (nmatch > 0)
{
- ovector = malloc(sizeof(int) * nmatch * 3);
+ ovector = (int *)malloc(sizeof(int) * nmatch * 3);
if (ovector == NULL) return REG_ESPACE;
}
diff --git a/pcreposix.h b/pcreposix.h
index 208db35..7660acb 100644
--- a/pcreposix.h
+++ b/pcreposix.h
@@ -2,7 +2,7 @@
* Perl-Compatible Regular Expressions *
*************************************************/
-/* Copyright (c) 1997-1999 University of Cambridge */
+/* Copyright (c) 1997-2000 University of Cambridge */
#ifndef _PCREPOSIX_H
#define _PCREPOSIX_H
@@ -28,6 +28,12 @@ extern "C" {
#define REG_NOTBOL 0x04
#define REG_NOTEOL 0x08
+/* These are not used by PCRE, but by defining them we make it easier
+to slot PCRE into existing programs that make POSIX calls. */
+
+#define REG_EXTENDED 0
+#define REG_NOSUB 0
+
/* Error values. Not all these are relevant or used by the wrapper. */
enum {
diff --git a/pcretest.c b/pcretest.c
index 8e2fe4f..b9e36e2 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -34,6 +34,7 @@ Makefile. */
static FILE *outfile;
static int log_store = 0;
+static size_t gotten_store;
@@ -48,7 +49,7 @@ static const char *OP_names[] = {
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{",
- "class", "Ref",
+ "class", "Ref", "Recurse",
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
"AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
"Brazero", "Braminzero", "Bra"
@@ -281,6 +282,7 @@ compiled re. */
static void *new_malloc(size_t size)
{
+gotten_store = size;
if (log_store)
fprintf(outfile, "Memory allocation (code space): %d\n",
(int)((int)size - offsetof(real_pcre, code[0])));
@@ -289,6 +291,19 @@ return malloc(size);
+
+/* Get one piece of information from the pcre_fullinfo() function: pass the request code in option and the destination in ptr. Failures are only reported on outfile; nothing is returned. */
+
+static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
+{
+int rc;
+if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0) /* a negative return is treated as an error */
+ fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
+}
+
+
+
+
/* Read lines from named file or stdin and write to named file or stdout; lines
consist of a regular expression, in delimiters and optionally followed by
options, followed by a set of test data, terminated by an empty line. */
@@ -573,59 +588,90 @@ while (!done)
goto CONTINUE;
}
- /* Compilation succeeded; print data if required */
+ /* Compilation succeeded; print data if required. There are now two
+ info-returning functions. The old one has a limited interface and
+ returns only limited data. Check that it agrees with the newer one. */
if (do_showinfo)
{
- int first_char, count;
+ int old_first_char, old_options, old_count;
+ int count, backrefmax, first_char, need_char;
+ size_t size;
if (do_debug) print_internals(re);
- count = pcre_info(re, &options, &first_char);
+ new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
+ new_info(re, NULL, PCRE_INFO_SIZE, &size);
+ new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
+ new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
+ new_info(re, NULL, PCRE_INFO_FIRSTCHAR, &first_char);
+ new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
+
+ old_count = pcre_info(re, &old_options, &old_first_char);
- if (count < 0) fprintf(outfile,
- "Error %d while reading info\n", count);
+ if (old_count < 0) fprintf(outfile, /* old_count is pcre_info()'s return; count now comes from pcre_fullinfo() */
+ "Error %d from pcre_info()\n", old_count);
else
{
- fprintf(outfile, "Identifying subpattern count = %d\n", count);
- if (options == 0) fprintf(outfile, "No options\n");
- else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s\n",
- ((options & PCRE_ANCHORED) != 0)? " anchored" : "",
- ((options & PCRE_CASELESS) != 0)? " caseless" : "",
- ((options & PCRE_EXTENDED) != 0)? " extended" : "",
- ((options & PCRE_MULTILINE) != 0)? " multiline" : "",
- ((options & PCRE_DOTALL) != 0)? " dotall" : "",
- ((options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
- ((options & PCRE_EXTRA) != 0)? " extra" : "",
- ((options & PCRE_UNGREEDY) != 0)? " ungreedy" : "");
-
- if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
- fprintf(outfile, "Case state changes\n");
-
- if (first_char == -1)
- {
- fprintf(outfile, "First char at start or follows \\n\n");
- }
- else if (first_char < 0)
- {
- fprintf(outfile, "No first char\n");
- }
+ if (old_count != count) fprintf(outfile,
+ "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
+ old_count);
+
+ if (old_first_char != first_char) fprintf(outfile,
+ "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
+ first_char, old_first_char);
+
+ if (old_options != options) fprintf(outfile,
+ "Options disagreement: pcre_fullinfo=%d pcre_info=%d\n", options,
+ old_options);
+ }
+
+ if (size != gotten_store) fprintf(outfile,
+ "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
+ (int)size, (int)gotten_store); /* both are size_t; cast to match %d */
+
+ fprintf(outfile, "Capturing subpattern count = %d\n", count);
+ if (backrefmax > 0)
+ fprintf(outfile, "Max back reference = %d\n", backrefmax);
+ if (options == 0) fprintf(outfile, "No options\n");
+ else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s\n",
+ ((options & PCRE_ANCHORED) != 0)? " anchored" : "",
+ ((options & PCRE_CASELESS) != 0)? " caseless" : "",
+ ((options & PCRE_EXTENDED) != 0)? " extended" : "",
+ ((options & PCRE_MULTILINE) != 0)? " multiline" : "",
+ ((options & PCRE_DOTALL) != 0)? " dotall" : "",
+ ((options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
+ ((options & PCRE_EXTRA) != 0)? " extra" : "",
+ ((options & PCRE_UNGREEDY) != 0)? " ungreedy" : "");
+
+ if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
+ fprintf(outfile, "Case state changes\n");
+
+ if (first_char == -1)
+ {
+ fprintf(outfile, "First char at start or follows \\n\n");
+ }
+ else if (first_char < 0)
+ {
+ fprintf(outfile, "No first char\n");
+ }
+ else
+ {
+ if (isprint(first_char))
+ fprintf(outfile, "First char = \'%c\'\n", first_char);
else
- {
- if (isprint(first_char))
- fprintf(outfile, "First char = \'%c\'\n", first_char);
- else
- fprintf(outfile, "First char = %d\n", first_char);
- }
+ fprintf(outfile, "First char = %d\n", first_char);
+ }
- if (((((real_pcre *)re)->options) & PCRE_REQCHSET) != 0)
- {
- int req_char = ((real_pcre *)re)->req_char;
- if (isprint(req_char))
- fprintf(outfile, "Req char = \'%c\'\n", req_char);
- else
- fprintf(outfile, "Req char = %d\n", req_char);
- }
- else fprintf(outfile, "No req char\n");
+ if (need_char < 0)
+ {
+ fprintf(outfile, "No need char\n");
+ }
+ else
+ {
+ if (isprint(need_char))
+ fprintf(outfile, "Need char = \'%c\'\n", need_char);
+ else
+ fprintf(outfile, "Need char = %d\n", need_char);
}
}
@@ -654,13 +700,11 @@ while (!done)
else if (extra == NULL)
fprintf(outfile, "Study returned NULL\n");
- /* This looks at internal information. A bit kludgy to do it this
- way, but it is useful for testing. */
-
else if (do_showinfo)
{
- real_pcre_extra *xx = (real_pcre_extra *)extra;
- if ((xx->options & PCRE_STUDY_MAPPED) == 0)
+ uschar *start_bits = NULL;
+ new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
+ if (start_bits == NULL)
fprintf(outfile, "No starting character set\n");
else
{
@@ -669,7 +713,7 @@ while (!done)
fprintf(outfile, "Starting character set: ");
for (i = 0; i < 256; i++)
{
- if ((xx->start_bits[i/8] & (1<<(i%8))) != 0)
+ if ((start_bits[i/8] & (1<<(i%8))) != 0)
{
if (c > 75)
{
diff --git a/pgrep.c b/pgrep.c
index 3e63058..ad1b87e 100644
--- a/pgrep.c
+++ b/pgrep.c
@@ -32,7 +32,7 @@ static BOOL whole_lines = FALSE;
-#ifndef HAVE_STRERROR
+#if ! HAVE_STRERROR
/*************************************************
* Provide strerror() for non-ANSI libraries *
*************************************************/
diff --git a/study.c b/study.c
index 284833b..676db94 100644
--- a/study.c
+++ b/study.c
@@ -9,7 +9,7 @@ the file Tech.Notes for some information on the internals.
Written by: Philip Hazel <ph10@cam.ac.uk>
- Copyright (c) 1997-1999 University of Cambridge
+ Copyright (c) 1997-2000 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
@@ -207,12 +207,12 @@ do
case OP_NOT_WORDCHAR:
for (c = 0; c < 32; c++)
- start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
+ start_bits[c] |= ~cd->cbits[c+cbit_word];
break;
case OP_WORDCHAR:
for (c = 0; c < 32; c++)
- start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
+ start_bits[c] |= cd->cbits[c+cbit_word];
break;
/* One or more character type fudges the pointer and restarts, knowing
@@ -264,12 +264,12 @@ do
case OP_NOT_WORDCHAR:
for (c = 0; c < 32; c++)
- start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
+ start_bits[c] |= ~cd->cbits[c+cbit_word];
break;
case OP_WORDCHAR:
for (c = 0; c < 32; c++)
- start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
+ start_bits[c] |= cd->cbits[c+cbit_word];
break;
}
diff --git a/testdata/testinput2 b/testdata/testinput2
index 5e641b2..1d9504c 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -591,5 +591,120 @@
aaaabbbbzzzz\O3
aaaabbbbzzzz\O4
aaaabbbbzzzz\O5
+
+/^.?abcd/S
+
+/\( # ( at start
+ (?: # Non-capturing bracket
+ (?>[^()]+) # Either a sequence of non-brackets (no backtracking)
+ | # Or
+ (?R) # Recurse - i.e. nested bracketed string
+ )* # Zero or more contents
+ \) # Closing )
+ /x
+ (abcd)
+ (abcd)xyz
+ xyz(abcd)
+ (ab(xy)cd)pqr
+ (ab(xycd)pqr
+ () abc ()
+ 12(abcde(fsh)xyz(foo(bar))lmno)89
+ *** Failers
+ abcd
+ abcd)
+ (abcd
+
+/\( ( (?>[^()]+) | (?R) )* \) /xg
+ (ab(xy)cd)pqr
+ 1(abcd)(x(y)z)pqr
+
+/\( (?: (?>[^()]+) | (?R) ) \) /x
+ (abcd)
+ (ab(xy)cd)
+ (a(b(c)d)e)
+ ((ab))
+ *** Failers
+ ()
+
+/\( (?: (?>[^()]+) | (?R) )? \) /x
+ ()
+ 12(abcde(fsh)xyz(foo(bar))lmno)89
+
+/\( ( (?>[^()]+) | (?R) )* \) /x
+ (ab(xy)cd)
+
+/\( ( ( (?>[^()]+) | (?R) )* ) \) /x
+ (ab(xy)cd)
+
+/\( (123)? ( ( (?>[^()]+) | (?R) )* ) \) /x
+ (ab(xy)cd)
+ (123ab(xy)cd)
+
+/\( ( (123)? ( (?>[^()]+) | (?R) )* ) \) /x
+ (ab(xy)cd)
+ (123ab(xy)cd)
+
+/\( (((((((((( ( (?>[^()]+) | (?R) )* )))))))))) \) /x
+ (ab(xy)cd)
+
+/\( ( ( (?>[^()<>]+) | ((?>[^()]+)) | (?R) )* ) \) /x
+ (abcd(xyz<p>qrs)123)
+
+/\( ( ( (?>[^()]+) | ((?R)) )* ) \) /x
+ (ab(cd)ef)
+ (ab(cd(ef)gh)ij)
+
+/^[[:alnum:]]/D
+
+/^[[:alpha:]]/D
+
+/^[[:ascii:]]/D
+
+/^[[:cntrl:]]/D
+
+/^[[:digit:]]/D
+
+/^[[:graph:]]/D
+
+/^[[:lower:]]/D
+
+/^[[:print:]]/D
+
+/^[[:punct:]]/D
+
+/^[[:space:]]/D
+
+/^[[:upper:]]/D
+
+/^[[:xdigit:]]/D
+
+/^[[:word:]]/D
+
+/^[[:^cntrl:]]/D
+
+/^[12[:^digit:]]/D
+
+/[01[:alpha:]%]/D
+
+/[[.ch.]]/
+
+/[[=ch=]]/
+
+/[[:rhubarb:]]/
+
+/[[:upper:]]/i
+ A
+ a
+
+/[[:lower:]]/i
+ A
+ a
+
+/((?-i)[[:lower:]])[[:lower:]]/i
+ ab
+ aB
+ *** Failers
+ Ab
+ AB
/ End of test input /
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index b53e183..1a10a74 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -1,4 +1,4 @@
-PCRE version 2.08 31-Aug-1999
+PCRE version 3.0 01-Feb-2000
/the quick brown fox/
the quick brown fox
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index aaea4b7..493f460 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -1,16 +1,16 @@
-PCRE version 2.08 31-Aug-1999
+PCRE version 3.0 01-Feb-2000
/(a)b|/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
No first char
-No req char
+No need char
/abc/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'c'
+Need char = 'c'
abc
0: abc
defabc
@@ -25,10 +25,10 @@ No match
No match
/^abc/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: anchored
No first char
-Req char = 'c'
+Need char = 'c'
abc
0: abc
\Aabc
@@ -41,34 +41,34 @@ No match
No match
/a+bc/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'c'
+Need char = 'c'
/a*bc/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-Req char = 'c'
+Need char = 'c'
/a{3}bc/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'c'
+Need char = 'c'
/(abc|a+z)/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
First char = 'a'
-No req char
+No need char
/^abc$/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: anchored
No first char
-Req char = 'c'
+Need char = 'c'
abc
0: abc
*** Failers
@@ -113,32 +113,32 @@ Failed: missing ) after comment at offset 7
Failed: unrecognized character after (? at offset 2
/.*b/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char at start or follows \n
-Req char = 'b'
+Need char = 'b'
/.*?b/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char at start or follows \n
-Req char = 'b'
+Need char = 'b'
/cat|dog|elephant/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
this sentence eventually mentions a cat
0: cat
this sentences rambles on and on for a while and then reaches elephant
0: elephant
/cat|dog|elephant/S
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
Starting character set: c d e
this sentence eventually mentions a cat
0: cat
@@ -146,10 +146,10 @@ Starting character set: c d e
0: elephant
/cat|dog|elephant/iS
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: caseless
No first char
-No req char
+No need char
Starting character set: C D E c d e
this sentence eventually mentions a CAT cat
0: CAT
@@ -157,17 +157,17 @@ Starting character set: C D E c d e
0: elephant
/a|[bcd]/S
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
Starting character set: a b c d
/(a|[^\dZ])/S
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
No first char
-No req char
+No need char
Starting character set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
\x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19
\x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / : ; < = >
@@ -184,10 +184,10 @@ Starting character set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a
\xfc \xfd \xfe \xff
/(a|b)*[\s]/S
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
No first char
-No req char
+No need char
Starting character set: \x09 \x0a \x0b \x0c \x0d \x20 a b
/(ab\2)/
@@ -197,10 +197,11 @@ Failed: back reference to non-existent subpattern at offset 6
Failed: nothing to repeat at offset 4
/(a)(b)(c)\2/
-Identifying subpattern count = 3
+Capturing subpattern count = 3
+Max back reference = 2
No options
First char = 'a'
-Req char = 'c'
+Need char = 'c'
abcb
0: abcb
1: a
@@ -227,10 +228,11 @@ Matched, but too many substrings
3: c
/(a)bc|(a)(b)\2/
-Identifying subpattern count = 3
+Capturing subpattern count = 3
+Max back reference = 2
No options
First char = 'a'
-No req char
+No need char
abc
0: abc
1: a
@@ -268,10 +270,10 @@ Matched, but too many substrings
3: b
/abc$/E
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: dollar_endonly
First char = 'a'
-Req char = 'c'
+Need char = 'c'
abc
0: abc
*** Failers
@@ -285,20 +287,20 @@ No match
Failed: back reference to non-existent subpattern at offset 17
/the quick brown fox/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 't'
-Req char = 'x'
+Need char = 'x'
the quick brown fox
0: the quick brown fox
this is a line with the quick brown fox
0: the quick brown fox
/the quick brown fox/A
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: anchored
No first char
-Req char = 'x'
+Need char = 'x'
the quick brown fox
0: the quick brown fox
*** Failers
@@ -310,20 +312,20 @@ No match
Failed: unrecognized character after (? at offset 4
/^abc|def/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
abcdef
0: abc
abcdef\B
0: def
/.*((abc)$|(def))/
-Identifying subpattern count = 3
+Capturing subpattern count = 3
No options
First char at start or follows \n
-No req char
+No need char
defabc
0: defabc
1: abc
@@ -396,74 +398,74 @@ Failed: unmatched parentheses at offset 0
Failed: missing terminating ] for character class at offset 4
/[^aeiou ]{3,}/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
co-processors, and for
0: -pr
/<.*>/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = '<'
-Req char = '>'
+Need char = '>'
abc<def>ghi<klm>nop
0: <def>ghi<klm>
/<.*?>/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = '<'
-Req char = '>'
+Need char = '>'
abc<def>ghi<klm>nop
0: <def>
/<.*>/U
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: ungreedy
First char = '<'
-Req char = '>'
+Need char = '>'
abc<def>ghi<klm>nop
0: <def>
/<.*>(?U)/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: ungreedy
First char = '<'
-Req char = '>'
+Need char = '>'
abc<def>ghi<klm>nop
0: <def>
/<.*?>/U
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: ungreedy
First char = '<'
-Req char = '>'
+Need char = '>'
abc<def>ghi<klm>nop
0: <def>ghi<klm>
/={3,}/U
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: ungreedy
First char = '='
-Req char = '='
+Need char = '='
abc========def
0: ===
/(?U)={3,}?/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: ungreedy
First char = '='
-Req char = '='
+Need char = '='
abc========def
0: ========
/(?<!bar|cattle)foo/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'f'
-Req char = 'o'
+Need char = 'o'
foo
0: foo
catfoo
@@ -485,68 +487,68 @@ Failed: lookbehind assertion is not fixed length at offset 14
Failed: lookbehind assertion is not fixed length at offset 12
/(?i)abc/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: caseless
First char = 'a'
-Req char = 'c'
+Need char = 'c'
/(a|(?m)a)/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
First char = 'a'
-No req char
+No need char
/(?i)^1234/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: anchored caseless
No first char
-Req char = '4'
+Need char = '4'
/(^b|(?i)^d)/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
Options: anchored
Case state changes
No first char
-No req char
+No need char
/(?s).*/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: anchored dotall
No first char
-No req char
+No need char
/[abcd]/S
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
Starting character set: a b c d
/(?i)[abcd]/S
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: caseless
No first char
-No req char
+No need char
Starting character set: A B C D a b c d
/(?m)[xy]|(b|c)/S
-Identifying subpattern count = 1
+Capturing subpattern count = 1
Options: multiline
No first char
-No req char
+No need char
Starting character set: b c x y
/(^a|^b)/m
-Identifying subpattern count = 1
+Capturing subpattern count = 1
Options: multiline
First char at start or follows \n
-No req char
+No need char
/(?i)(^a|^b)/m
-Identifying subpattern count = 1
+Capturing subpattern count = 1
Options: caseless multiline
First char at start or follows \n
-No req char
+No need char
/(a)(?(1)a|b|c)/
Failed: conditional group contains more than two branches at offset 13
@@ -567,17 +569,19 @@ Failed: assertion expected after (?( at offset 3
Failed: unrecognized character after (?< at offset 2
/((?s)blah)\s+\1/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
+Max back reference = 1
No options
First char = 'b'
-Req char = 'h'
+Need char = 'h'
/((?i)blah)\s+\1/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
+Max back reference = 1
No options
Case state changes
No first char
-Req char = 'h'
+Need char = 'h'
/((?i)b)/DS
------------------------------------------------------------------
@@ -590,26 +594,26 @@ Req char = 'h'
16 16 Ket
19 End
------------------------------------------------------------------
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
Case state changes
No first char
-Req char = 'b'
+Need char = 'b'
Starting character set: B b
/(a*b|(?i:c*(?-i)d))/S
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
Case state changes
No first char
-No req char
+No need char
Starting character set: C a b c d
/a$/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-No req char
+No need char
a
0: a
a\n
@@ -622,10 +626,10 @@ No match
No match
/a$/m
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: multiline
First char = 'a'
-No req char
+No need char
a
0: a
a\n
@@ -638,22 +642,22 @@ No match
No match
/\Aabc/m
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: anchored multiline
No first char
-Req char = 'c'
+Need char = 'c'
/^abc/m
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: multiline
First char at start or follows \n
-Req char = 'c'
+Need char = 'c'
/^((a+)(?U)([ab]+)(?-U)([bc]+)(\w*))/
-Identifying subpattern count = 5
+Capturing subpattern count = 5
Options: anchored
No first char
-Req char = 'a'
+Need char = 'a'
aaaaabbbbbcccccdef
0: aaaaabbbbbcccccdef
1: aaaaabbbbbcccccdef
@@ -663,37 +667,37 @@ Req char = 'a'
5: def
/(?<=foo)[ab]/S
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
Starting character set: a b
/(?<!foo)(alpha|omega)/S
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
No first char
-Req char = 'a'
+Need char = 'a'
Starting character set: a o
/(?!alphabet)[ab]/S
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
Starting character set: a b
/(?<=foo\n)^bar/m
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: multiline
First char at start or follows \n
-Req char = 'r'
+Need char = 'r'
/(?>^abc)/m
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: multiline
First char at start or follows \n
-Req char = 'c'
+Need char = 'c'
abc
0: abc
def\nabc
@@ -713,16 +717,16 @@ Failed: lookbehind assertion is not fixed length at offset 12
Failed: lookbehind assertion is not fixed length at offset 13
/The next three are in testinput2 because they have variable length branches/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'T'
-Req char = 's'
+Need char = 's'
/(?<=bullock|donkey)-cart/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = '-'
-Req char = 't'
+Need char = 't'
the bullock-cart
0: -cart
a donkey-cart race
@@ -735,17 +739,17 @@ No match
No match
/(?<=ab(?i)x|y|z)/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
Case state changes
No first char
-No req char
+No need char
/(?>.*)(?<=(abcd)|(xyz))/
-Identifying subpattern count = 2
+Capturing subpattern count = 2
No options
First char at start or follows \n
-No req char
+No need char
alphabetabcd
0: alphabetabcd
1: abcd
@@ -755,11 +759,11 @@ No req char
2: xyz
/(?<=ab(?i)x(?-i)y|(?i)z|b)ZZ/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
Case state changes
First char = 'Z'
-Req char = 'Z'
+Need char = 'Z'
abxyZZ
0: ZZ
abXyZZ
@@ -784,10 +788,10 @@ No match
No match
/(?<!(foo)a)bar/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
First char = 'b'
-Req char = 'r'
+Need char = 'r'
bar
0: bar
foobbar
@@ -798,41 +802,42 @@ No match
No match
/This one is here because Perl 5.005_02 doesn't fail it/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'T'
-Req char = 't'
+Need char = 't'
/^(a)?(?(1)a|b)+$/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
Options: anchored
No first char
-No req char
+No need char
*** Failers
No match
a
No match
/This one is here because I think Perl 5.005_02 gets the setting of $1 wrong/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'T'
-Req char = 'g'
+Need char = 'g'
/^(a\1?){4}$/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
+Max back reference = 1
Options: anchored
No first char
-Req char = 'a'
+Need char = 'a'
aaaaaa
0: aaaaaa
1: aa
/These are syntax tests from Perl 5.005/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'T'
-Req char = '5'
+Need char = '5'
/a[b-a]/
Failed: range out of order in character class at offset 4
@@ -943,10 +948,10 @@ Failed: POSIX code 9: bad escape sequence at offset 4
Failed: \ at end of pattern at offset 4
/(a)bc(d)/
-Identifying subpattern count = 2
+Capturing subpattern count = 2
No options
First char = 'a'
-Req char = 'd'
+Need char = 'd'
abcd
0: abcd
1: a
@@ -963,10 +968,10 @@ Req char = 'd'
copy substring 5 failed -7
/(.{20})/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
No first char
-No req char
+No need char
abcdefghijklmnopqrstuvwxyz
0: abcdefghijklmnopqrst
1: abcdefghijklmnopqrst
@@ -980,10 +985,10 @@ copy substring 1 failed -6
1G abcdefghijklmnopqrst (20)
/(.{15})/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
No first char
-No req char
+No need char
abcdefghijklmnopqrstuvwxyz
0: abcdefghijklmno
1: abcdefghijklmno
@@ -994,10 +999,10 @@ No req char
1G abcdefghijklmno (15)
/(.{16})/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
No first char
-No req char
+No need char
abcdefghijklmnopqrstuvwxyz
0: abcdefghijklmnop
1: abcdefghijklmnop
@@ -1010,10 +1015,10 @@ copy substring 1 failed -6
1L abcdefghijklmnop
/^(a|(bc))de(f)/
-Identifying subpattern count = 3
+Capturing subpattern count = 3
Options: anchored
No first char
-Req char = 'f'
+Need char = 'f'
adef\G1\G2\G3\G4\L
0: adef
1: a
@@ -1048,10 +1053,10 @@ get substring 4 failed -7
0C adef (4)
/^abc\00def/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: anchored
No first char
-Req char = 'f'
+Need char = 'f'
abc\00def\L\C0
0: abc\x00def
0C abc (7)
@@ -1061,10 +1066,10 @@ Req char = 'f'
)((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+ )((?:[a-zA-Z0-9]+
)?)?)?)?)?)?)?)?)?otherword/M
Memory allocation (code space): 428
-Identifying subpattern count = 8
+Capturing subpattern count = 8
No options
First char = 'w'
-Req char = 'd'
+Need char = 'd'
/.*X/D
------------------------------------------------------------------
@@ -1074,10 +1079,10 @@ Req char = 'd'
8 8 Ket
11 End
------------------------------------------------------------------
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char at start or follows \n
-Req char = 'X'
+Need char = 'X'
/.*X/Ds
------------------------------------------------------------------
@@ -1087,10 +1092,10 @@ Req char = 'X'
8 8 Ket
11 End
------------------------------------------------------------------
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: anchored dotall
No first char
-Req char = 'X'
+Need char = 'X'
/(.*X|^B)/D
------------------------------------------------------------------
@@ -1105,10 +1110,10 @@ Req char = 'X'
21 21 Ket
24 End
------------------------------------------------------------------
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
First char at start or follows \n
-No req char
+No need char
/(.*X|^B)/Ds
------------------------------------------------------------------
@@ -1123,10 +1128,10 @@ No req char
21 21 Ket
24 End
------------------------------------------------------------------
-Identifying subpattern count = 1
+Capturing subpattern count = 1
Options: anchored dotall
No first char
-No req char
+No need char
/(?s)(.*X|^B)/D
------------------------------------------------------------------
@@ -1141,10 +1146,10 @@ No req char
21 21 Ket
24 End
------------------------------------------------------------------
-Identifying subpattern count = 1
+Capturing subpattern count = 1
Options: anchored dotall
No first char
-No req char
+No need char
/(?s:.*X|^B)/D
------------------------------------------------------------------
@@ -1162,16 +1167,16 @@ No req char
27 27 Ket
30 End
------------------------------------------------------------------
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char at start or follows \n
-No req char
+No need char
/\Biss\B/+
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'i'
-Req char = 's'
+Need char = 's'
Mississippi
0: iss
0+ issippi
@@ -1182,10 +1187,10 @@ Req char = 's'
0+ issippi
/iss/G+
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'i'
-Req char = 's'
+Need char = 's'
Mississippi
0: iss
0+ issippi
@@ -1193,19 +1198,19 @@ Req char = 's'
0+ ippi
/\Biss\B/G+
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'i'
-Req char = 's'
+Need char = 's'
Mississippi
0: iss
0+ issippi
/\Biss\B/g+
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'i'
-Req char = 's'
+Need char = 's'
Mississippi
0: iss
0+ issippi
@@ -1217,10 +1222,10 @@ No match
No match
/(?<=[Ms])iss/g+
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'i'
-Req char = 's'
+Need char = 's'
Mississippi
0: iss
0+ issippi
@@ -1228,28 +1233,28 @@ Req char = 's'
0+ ippi
/(?<=[Ms])iss/G+
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'i'
-Req char = 's'
+Need char = 's'
Mississippi
0: iss
0+ issippi
/^iss/g+
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: anchored
No first char
-Req char = 's'
+Need char = 's'
ississippi
0: iss
0+ issippi
/.*iss/g+
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char at start or follows \n
-Req char = 's'
+Need char = 's'
abciss\nxyzisspqr
0: abciss
0+ \x0axyzisspqr
@@ -1257,10 +1262,10 @@ Req char = 's'
0+ pqr
/.i./+g
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-Req char = 'i'
+Need char = 'i'
Mississippi
0: Mis
0+ sissippi
@@ -1287,28 +1292,28 @@ Req char = 'i'
0+ souri river
/^.is/+g
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: anchored
No first char
-Req char = 's'
+Need char = 's'
Mississippi
0: Mis
0+ sissippi
/^ab\n/g+
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: anchored
No first char
-Req char = 10
+Need char = 10
ab\nab\ncd
0: ab\x0a
0+ ab\x0acd
/^ab\n/mg+
-Identifying subpattern count = 0
+Capturing subpattern count = 0
Options: multiline
First char at start or follows \n
-Req char = 10
+Need char = 10
ab\nab\ncd
0: ab\x0a
0+ ab\x0acd
@@ -1316,256 +1321,256 @@ Req char = 10
0+ cd
/abc/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'c'
+Need char = 'c'
/abc|bac/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-Req char = 'c'
+Need char = 'c'
/(abc|bac)/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
No first char
-Req char = 'c'
+Need char = 'c'
/(abc|(c|dc))/
-Identifying subpattern count = 2
+Capturing subpattern count = 2
No options
No first char
-Req char = 'c'
+Need char = 'c'
/(abc|(d|de)c)/
-Identifying subpattern count = 2
+Capturing subpattern count = 2
No options
No first char
-Req char = 'c'
+Need char = 'c'
/a*/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
/a+/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-No req char
+No need char
/(baa|a+)/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
No first char
-Req char = 'a'
+Need char = 'a'
/a{0,3}/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
/baa{3,}/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'b'
-Req char = 'a'
+Need char = 'a'
/"([^\\"]+|\\.)*"/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
First char = '"'
-Req char = '"'
+Need char = '"'
/(abc|ab[cd])/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
First char = 'a'
-No req char
+No need char
/(a|.)/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
No first char
-No req char
+No need char
/a|ba|\w/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
/abc(?=pqr)/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'r'
+Need char = 'r'
/...(?<=abc)/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
/abc(?!pqr)/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'c'
+Need char = 'c'
/ab./
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'b'
+Need char = 'b'
/ab[xyz]/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'b'
+Need char = 'b'
/abc*/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'b'
+Need char = 'b'
/ab.c*/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'b'
+Need char = 'b'
/a.c*/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-No req char
+No need char
/.c*/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
/ac*/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-No req char
+No need char
/(a.c*|b.c*)/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
No first char
-No req char
+No need char
/a.c*|aba/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-No req char
+No need char
/.+a/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-Req char = 'a'
+Need char = 'a'
/(?=abcda)a.*/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-No req char
+No need char
/(?=a)a.*/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-No req char
+No need char
/a(b)*/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
First char = 'a'
-No req char
+No need char
/a\d*/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-No req char
+No need char
/ab\d*/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'b'
+Need char = 'b'
/a(\d)*/
-Identifying subpattern count = 1
+Capturing subpattern count = 1
No options
First char = 'a'
-No req char
+No need char
/abcde{0,0}/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'd'
+Need char = 'd'
/ab\d+/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'b'
+Need char = 'b'
/a(?(1)b)/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-No req char
+No need char
/a(?(1)bag|big)/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'g'
+Need char = 'g'
/a(?(1)bag|big)*/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-No req char
+No need char
/a(?(1)bag|big)+/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'g'
+Need char = 'g'
/a(?(1)b..|b..)/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'b'
+Need char = 'b'
/ab\d{0}e/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = 'a'
-Req char = 'e'
+Need char = 'e'
/a?b?/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
a
0: a
b
@@ -1580,10 +1585,10 @@ No req char
No match
/|-/
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
abcd
0:
-abc
@@ -1621,10 +1626,447 @@ No match
1: bbbb
2: z
3: z
+
+/^.?abcd/S
+Capturing subpattern count = 0
+Options: anchored
+No first char
+Need char = 'd'
+Study returned NULL
+
+/\( # ( at start
+ (?: # Non-capturing bracket
+ (?>[^()]+) # Either a sequence of non-brackets (no backtracking)
+ | # Or
+ (?R) # Recurse - i.e. nested bracketed string
+ )* # Zero or more contents
+ \) # Closing )
+ /x
+Capturing subpattern count = 0
+Options: extended
+First char = '('
+Need char = ')'
+ (abcd)
+ 0: (abcd)
+ (abcd)xyz
+ 0: (abcd)
+ xyz(abcd)
+ 0: (abcd)
+ (ab(xy)cd)pqr
+ 0: (ab(xy)cd)
+ (ab(xycd)pqr
+ 0: (xycd)
+ () abc ()
+ 0: ()
+ 12(abcde(fsh)xyz(foo(bar))lmno)89
+ 0: (abcde(fsh)xyz(foo(bar))lmno)
+ *** Failers
+No match
+ abcd
+No match
+ abcd)
+No match
+ (abcd
+No match
+
+/\( ( (?>[^()]+) | (?R) )* \) /xg
+Capturing subpattern count = 1
+Options: extended
+First char = '('
+Need char = ')'
+ (ab(xy)cd)pqr
+ 0: (ab(xy)cd)
+ 1: cd
+ 1(abcd)(x(y)z)pqr
+ 0: (abcd)
+ 1: abcd
+ 0: (x(y)z)
+ 1: z
+
+/\( (?: (?>[^()]+) | (?R) ) \) /x
+Capturing subpattern count = 0
+Options: extended
+First char = '('
+Need char = ')'
+ (abcd)
+ 0: (abcd)
+ (ab(xy)cd)
+ 0: (xy)
+ (a(b(c)d)e)
+ 0: (c)
+ ((ab))
+ 0: ((ab))
+ *** Failers
+No match
+ ()
+No match
+
+/\( (?: (?>[^()]+) | (?R) )? \) /x
+Capturing subpattern count = 0
+Options: extended
+First char = '('
+Need char = ')'
+ ()
+ 0: ()
+ 12(abcde(fsh)xyz(foo(bar))lmno)89
+ 0: (fsh)
+
+/\( ( (?>[^()]+) | (?R) )* \) /x
+Capturing subpattern count = 1
+Options: extended
+First char = '('
+Need char = ')'
+ (ab(xy)cd)
+ 0: (ab(xy)cd)
+ 1: cd
+
+/\( ( ( (?>[^()]+) | (?R) )* ) \) /x
+Capturing subpattern count = 2
+Options: extended
+First char = '('
+Need char = ')'
+ (ab(xy)cd)
+ 0: (ab(xy)cd)
+ 1: ab(xy)cd
+ 2: cd
+
+/\( (123)? ( ( (?>[^()]+) | (?R) )* ) \) /x
+Capturing subpattern count = 3
+Options: extended
+First char = '('
+Need char = ')'
+ (ab(xy)cd)
+ 0: (ab(xy)cd)
+ 1: <unset>
+ 2: ab(xy)cd
+ 3: cd
+ (123ab(xy)cd)
+ 0: (123ab(xy)cd)
+ 1: 123
+ 2: ab(xy)cd
+ 3: cd
+
+/\( ( (123)? ( (?>[^()]+) | (?R) )* ) \) /x
+Capturing subpattern count = 3
+Options: extended
+First char = '('
+Need char = ')'
+ (ab(xy)cd)
+ 0: (ab(xy)cd)
+ 1: ab(xy)cd
+ 2: <unset>
+ 3: cd
+ (123ab(xy)cd)
+ 0: (123ab(xy)cd)
+ 1: 123ab(xy)cd
+ 2: 123
+ 3: cd
+
+/\( (((((((((( ( (?>[^()]+) | (?R) )* )))))))))) \) /x
+Capturing subpattern count = 11
+Options: extended
+First char = '('
+Need char = ')'
+ (ab(xy)cd)
+ 0: (ab(xy)cd)
+ 1: ab(xy)cd
+ 2: ab(xy)cd
+ 3: ab(xy)cd
+ 4: ab(xy)cd
+ 5: ab(xy)cd
+ 6: ab(xy)cd
+ 7: ab(xy)cd
+ 8: ab(xy)cd
+ 9: ab(xy)cd
+10: ab(xy)cd
+11: cd
+
+/\( ( ( (?>[^()<>]+) | ((?>[^()]+)) | (?R) )* ) \) /x
+Capturing subpattern count = 3
+Options: extended
+First char = '('
+Need char = ')'
+ (abcd(xyz<p>qrs)123)
+ 0: (abcd(xyz<p>qrs)123)
+ 1: abcd(xyz<p>qrs)123
+ 2: 123
+ 3: <p>qrs
+
+/\( ( ( (?>[^()]+) | ((?R)) )* ) \) /x
+Capturing subpattern count = 3
+Options: extended
+First char = '('
+Need char = ')'
+ (ab(cd)ef)
+ 0: (ab(cd)ef)
+ 1: ab(cd)ef
+ 2: ef
+ 3: (cd)
+ (ab(cd(ef)gh)ij)
+ 0: (ab(cd(ef)gh)ij)
+ 1: ab(cd(ef)gh)ij
+ 2: ij
+ 3: (cd(ef)gh)
+
+/^[[:alnum:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [0-9A-Za-z]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:alpha:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [A-Za-z]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:ascii:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [\x00-\x7f]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:cntrl:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [\x00-\x1f\x7f]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:digit:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [0-9]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:graph:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [!-~]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:lower:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [a-z]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:print:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [ -~]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:punct:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [!-/:-@[-`{-~]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:space:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [\x09-\x0d ]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:upper:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [A-Z]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:xdigit:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [0-9A-Fa-f]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:word:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [0-9A-Z_a-z]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[[:^cntrl:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [ -~\x80-\xff]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/^[12[:^digit:]]/D
+------------------------------------------------------------------
+ 0 37 Bra 0
+ 3 ^
+ 4 [\x00-/1-2:-\xff]
+ 37 37 Ket
+ 40 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+Options: anchored
+No first char
+No need char
+
+/[01[:alpha:]%]/D
+------------------------------------------------------------------
+ 0 36 Bra 0
+ 3 [%0-1A-Za-z]
+ 36 36 Ket
+ 39 End
+------------------------------------------------------------------
+Capturing subpattern count = 0
+No options
+No first char
+No need char
+
+/[[.ch.]]/
+Failed: POSIX collating elements are not supported at offset 1
+
+/[[=ch=]]/
+Failed: POSIX collating elements are not supported at offset 1
+
+/[[:rhubarb:]]/
+Failed: unknown POSIX class name at offset 3
+
+/[[:upper:]]/i
+Capturing subpattern count = 0
+Options: caseless
+No first char
+No need char
+ A
+ 0: A
+ a
+ 0: a
+
+/[[:lower:]]/i
+Capturing subpattern count = 0
+Options: caseless
+No first char
+No need char
+ A
+ 0: A
+ a
+ 0: a
+
+/((?-i)[[:lower:]])[[:lower:]]/i
+Capturing subpattern count = 1
+Options: caseless
+Case state changes
+No first char
+No need char
+ ab
+ 0: ab
+ 1: a
+ aB
+ 0: aB
+ 1: a
+ *** Failers
+ 0: ai
+ 1: a
+ Ab
+No match
+ AB
+No match
/ End of test input /
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
First char = ' '
-Req char = ' '
+Need char = ' '
diff --git a/testdata/testoutput3 b/testdata/testoutput3
index d997659..a4a28c1 100644
--- a/testdata/testoutput3
+++ b/testdata/testoutput3
@@ -1,4 +1,4 @@
-PCRE version 2.08 31-Aug-1999
+PCRE version 3.0 01-Feb-2000
/(?<!bar)foo/
foo
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index c8af6cf..586cbbd 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -1,4 +1,4 @@
-PCRE version 2.08 31-Aug-1999
+PCRE version 3.0 01-Feb-2000
/^[\w]+/
*** Failers
@@ -81,18 +81,18 @@ No match
0: école
/\w/IS
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
Starting character set: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P
Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z
/\w/ISLfr
-Identifying subpattern count = 0
+Capturing subpattern count = 0
No options
No first char
-No req char
+No need char
Starting character set: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P
Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z
À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å