summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-02-24 21:40:08 +0000
committer nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2007-02-24 21:40:08 +0000
commit 776635721f3417106a59041ad7e77c4c815e1bd0 (patch)
tree 1bbd6ba83e3faa1b6cebf413d2e080e51f95e483
parent c8cb607ab7e12e185e86a8b23d413b7f9536f24c (diff)
download pcre-776635721f3417106a59041ad7e77c4c815e1bd0.tar.gz
Load pcre-4.1 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@65 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- ChangeLog 44
-rw-r--r-- Makefile.in 12
-rw-r--r-- README 10
-rwxr-xr-x configure 36
-rw-r--r-- configure.in 24
-rw-r--r-- dftables.c 6
-rw-r--r-- doc/html/pcreapi.html 12
-rw-r--r-- doc/pcre.txt 16
-rw-r--r-- doc/pcreapi.3 12
-rw-r--r-- internal.h 8
-rw-r--r-- pcre.c 112
-rw-r--r-- pcretest.c 7
-rw-r--r-- testdata/testinput1 3
-rw-r--r-- testdata/testinput2 4
-rw-r--r-- testdata/testoutput1 6
-rw-r--r-- testdata/testoutput2 72
-rw-r--r-- testdata/testoutput3 2
-rw-r--r-- testdata/testoutput4 2
-rw-r--r-- testdata/testoutput5 2
19 files changed, 256 insertions, 134 deletions
diff --git a/ChangeLog b/ChangeLog
index 9c99cf3..66bac98 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,8 +1,48 @@
ChangeLog for PCRE
------------------
-Version 4.00 17-Feb-03
-----------------------
+Version 4.1 12-Mar-03
+---------------------
+
+1. Compiling with gcc -pedantic found a couple of places where casts were
+needed, and a string in dftables.c that was longer than standard compilers are
+required to support.
+
+2. Compiling with Sun's compiler found a few more places where the code could
+be tidied up in order to avoid warnings.
+
+3. The variables for cross-compiling were called HOST_CC and HOST_CFLAGS; the
+first of these names is deprecated in the latest Autoconf in favour of the name
+CC_FOR_BUILD, because "host" is typically used to mean the system on which the
+compiled code will be run. I can't find a reference for HOST_CFLAGS, but by
+analogy I have changed it to CFLAGS_FOR_BUILD.
+
+4. Added -no-undefined to the linking command in the Makefile, because this is
+apparently helpful for Windows. To make it work, also added "-L. -lpcre" to the
+linking step for the pcreposix library.
+
+5. PCRE was failing to diagnose the case of two named groups with the same
+name.
+
+6. A problem with one of PCRE's optimizations was discovered. PCRE remembers a
+literal character that is needed in the subject for a match, and scans along to
+ensure that it is present before embarking on the full matching process. This
+saves time in cases of nested unlimited repeats that are never going to match.
+Problem: the scan can take a lot of time if the subject is very long (e.g.
+megabytes), thus penalizing straightforward matches. It is now done only if the
+amount of subject to be scanned is less than 1000 bytes.
+
+7. A lesser problem with the same optimization is that it was recording the
+first character of an anchored pattern as "needed", thus provoking a search
+right along the subject, even when the first match of the pattern was going to
+fail. The "needed" character is now not set for anchored patterns, unless it
+follows something in the pattern that is of non-fixed length. Thus, it still
+fulfils its original purpose of finding quick non-matches in cases of nested
+unlimited repeats, but isn't used for simple anchored patterns such as /^abc/.
+
+
+Version 4.0 17-Feb-03
+---------------------
1. If a comment in an extended regex that started immediately after a meta-item
extended to the end of string, PCRE compiled incorrect data. This could lead to
diff --git a/Makefile.in b/Makefile.in
index 5d621b8..510d8f1 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -56,9 +56,9 @@ MANDIR = @mandir@
CC = @CC@
CFLAGS = @CFLAGS@
-HOST_CC = @HOST_CC@
-HOST_CFLAGS = @HOST_CFLAGS@
-UTF8 = @UTF8@
+CC_FOR_BUILD = @CC_FOR_BUILD@
+CFLAGS_FOR_BUILD = @CFLAGS_FOR_BUILD@
+UTF8 = @UTF8@
NEWLINE = @NEWLINE@
POSIX_MALLOC_THRESHOLD = @POSIX_MALLOC_THRESHOLD@
LINK_SIZE = @LINK_SIZE@
@@ -72,7 +72,7 @@ INSTALL_DATA = @INSTALL_DATA@
LIBTOOL = @LIBTOOL@
LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) -c $(CFLAGS) -I. -I$(top_srcdir) $(NEWLINE) $(LINK_SIZE) $(MATCH_LIMIT)
-LINK = $(LIBTOOL) --mode=link $(CC) $(CFLAGS) -I. -I$(top_srcdir)
+LINK = $(LIBTOOL) --mode=link $(CC) -no-undefined $(CFLAGS) -I. -I$(top_srcdir)
# These are the version numbers for the shared libraries
@@ -101,7 +101,7 @@ libpcre.la: $(OBJ)
libpcreposix.la: pcreposix.o
-rm -f libpcreposix.la
- $(LINK) -rpath $(LIBDIR) -version-info \
+ $(LINK) -rpath $(LIBDIR) -L. -lpcre -version-info \
'$(PCREPOSIXLIBVERSION)' -o libpcreposix.la pcreposix.lo
pcre.o: $(top_srcdir)/chartables.c $(top_srcdir)/pcre.c \
@@ -140,7 +140,7 @@ $(top_srcdir)/chartables.c: dftables
dftables.o: $(top_srcdir)/dftables.c $(top_srcdir)/maketables.c \
$(top_srcdir)/internal.h pcre.h config.h Makefile
- $(HOST_CC) -c $(HOST_CFLAGS) -I. $(top_srcdir)/dftables.c
+ $(CC_FOR_BUILD) -c $(CFLAGS_FOR_BUILD) -I. $(top_srcdir)/dftables.c
dftables: dftables.o
$(LINK) -o dftables dftables.o
diff --git a/README b/README
index a1e04cb..6cef77c 100644
--- a/README
+++ b/README
@@ -125,6 +125,16 @@ included in makefiles for programs that use PCRE, saving the programmer from
having to remember too many details.
+Cross-compiling PCRE on a Unix-like system
+------------------------------------------
+
+PCRE needs to compile and run an auxiliary program as part of the building
+process. Obviously, if the real compilation is for some other system, it can't
+use the same CC and CFLAGS values when it is doing this. For cross compilation,
+therefore, you must set CC_FOR_BUILD to the local host's compiler, and you can
+set flags in CFLAGS_FOR_BUILD if you need to.
+
+
Shared libraries on Unix-like systems
-------------------------------------
diff --git a/configure b/configure
index d5fad7f..83e26ed 100755
--- a/configure
+++ b/configure
@@ -466,7 +466,7 @@ ac_includes_default="\
# include <unistd.h>
#endif"
-ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA build build_cpu build_vendor build_os host host_cpu host_vendor host_os LN_S ECHO RANLIB ac_ct_RANLIB STRIP ac_ct_STRIP CPP EGREP LIBTOOL HAVE_MEMMOVE HAVE_STRERROR HOST_CC HOST_CFLAGS NEWLINE LINK_SIZE MATCH_LIMIT UTF8 PCRE_MAJOR PCRE_MINOR PCRE_DATE PCRE_VERSION PCRE_LIB_VERSION PCRE_POSIXLIB_VERSION POSIX_MALLOC_THRESHOLD POSIX_OBJ POSIX_LOBJ POSIX_LIB LIBOBJS LTLIBOBJS'
+ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA build build_cpu build_vendor build_os host host_cpu host_vendor host_os LN_S ECHO RANLIB ac_ct_RANLIB STRIP ac_ct_STRIP CPP EGREP LIBTOOL CC_FOR_BUILD CFLAGS_FOR_BUILD HAVE_MEMMOVE HAVE_STRERROR LINK_SIZE MATCH_LIMIT NEWLINE PCRE_MAJOR PCRE_MINOR PCRE_DATE PCRE_VERSION PCRE_LIB_VERSION PCRE_POSIXLIB_VERSION POSIX_MALLOC_THRESHOLD UTF8 POSIX_OBJ POSIX_LOBJ POSIX_LIB LIBOBJS LTLIBOBJS'
ac_subst_files=''
# Initialize some variables set by options.
@@ -939,12 +939,6 @@ if test "$ac_init_help" = "long"; then
# The list generated by autoconf has been trimmed to remove many
# options that are totally irrelevant to PCRE (e.g. relating to X),
# or are not supported by its Makefile.
- # The list generated by autoconf has been trimmed to remove many
- # options that are totally irrelevant to PCRE (e.g. relating to X),
- # or are not supported by its Makefile.
- # The list generated by autoconf has been trimmed to remove many
- # options that are totally irrelevant to PCRE (e.g. relating to X),
- # or are not supported by its Makefile.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures this package to adapt to many kinds of systems.
@@ -1438,14 +1432,12 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
PCRE_MAJOR=4
-PCRE_MINOR=0
-PCRE_DATE=17-Feb-2003
+PCRE_MINOR=1
+PCRE_DATE=12-Mar-2003
PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR}
POSIX_MALLOC_THRESHOLD=-DPOSIX_MALLOC_THRESHOLD=10
-HOST_CC=${HOST_CC:-'$(CC)'}
-HOST_CFLAGS=${HOST_CFLAGS:-'$(CFLAGS)'}
PCRE_LIB_VERSION=0:1:0
@@ -4007,7 +3999,7 @@ test "x$enable_libtool_lock" != xno && enable_libtool_lock=yes
case $host in
*-*-irix6*)
# Find out which ABI we are using.
- echo '#line 4011 "configure"' > conftest.$ac_ext
+ echo '#line 4009 "configure"' > conftest.$ac_ext
if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
(eval $ac_compile) 2>&5
ac_status=$?
@@ -4547,7 +4539,7 @@ chmod -w .
save_CFLAGS="$CFLAGS"
CFLAGS="$CFLAGS -o out/conftest2.$ac_objext"
compiler_c_o=no
-if { (eval echo configure:4551: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then
+if { (eval echo configure:4549: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then
# The compiler can only warn and ignore the option if not recognized
# So say no if there are warnings
if test -s out/conftest.err; then
@@ -5102,7 +5094,7 @@ else
;;
darwin* | rhapsody*)
- # This patch put in by hand by PH (19-Feb-2003) for Darwin 1.3.
+ # This patch put in by hand by PH (12-Mar-2003) for Darwin 1.3.
case "$host_os" in
rhapsody* | darwin1.[[012]])
allow_undefined_flag='-undefined suppress'
@@ -6217,7 +6209,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<EOF
-#line 6211 "configure"
+#line 6209 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -6315,7 +6307,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<EOF
-#line 6309 "configure"
+#line 6307 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -7045,6 +7037,10 @@ LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+CC_FOR_BUILD=${CC_FOR_BUILD:-'$(CC)'}
+CFLAGS_FOR_BUILD=${CFLAGS_FOR_BUILD:-'$(CFLAGS)'}
+
+
echo "$as_me:$LINENO: checking for ANSI C header files" >&5
echo $ECHO_N "checking for ANSI C header files... $ECHO_C" >&6
if test "${ac_cv_header_stdc+set}" = set; then
@@ -8342,14 +8338,13 @@ s,@ac_ct_STRIP@,$ac_ct_STRIP,;t t
s,@CPP@,$CPP,;t t
s,@EGREP@,$EGREP,;t t
s,@LIBTOOL@,$LIBTOOL,;t t
+s,@CC_FOR_BUILD@,$CC_FOR_BUILD,;t t
+s,@CFLAGS_FOR_BUILD@,$CFLAGS_FOR_BUILD,;t t
s,@HAVE_MEMMOVE@,$HAVE_MEMMOVE,;t t
s,@HAVE_STRERROR@,$HAVE_STRERROR,;t t
-s,@HOST_CC@,$HOST_CC,;t t
-s,@HOST_CFLAGS@,$HOST_CFLAGS,;t t
-s,@NEWLINE@,$NEWLINE,;t t
s,@LINK_SIZE@,$LINK_SIZE,;t t
s,@MATCH_LIMIT@,$MATCH_LIMIT,;t t
-s,@UTF8@,$UTF8,;t t
+s,@NEWLINE@,$NEWLINE,;t t
s,@PCRE_MAJOR@,$PCRE_MAJOR,;t t
s,@PCRE_MINOR@,$PCRE_MINOR,;t t
s,@PCRE_DATE@,$PCRE_DATE,;t t
@@ -8357,6 +8352,7 @@ s,@PCRE_VERSION@,$PCRE_VERSION,;t t
s,@PCRE_LIB_VERSION@,$PCRE_LIB_VERSION,;t t
s,@PCRE_POSIXLIB_VERSION@,$PCRE_POSIXLIB_VERSION,;t t
s,@POSIX_MALLOC_THRESHOLD@,$POSIX_MALLOC_THRESHOLD,;t t
+s,@UTF8@,$UTF8,;t t
s,@POSIX_OBJ@,$POSIX_OBJ,;t t
s,@POSIX_LOBJ@,$POSIX_LOBJ,;t t
s,@POSIX_LIB@,$POSIX_LIB,;t t
diff --git a/configure.in b/configure.in
index 77bdaa9..574ad60 100644
--- a/configure.in
+++ b/configure.in
@@ -17,15 +17,13 @@ dnl digits for minor numbers less than 10. There are unlikely to be
dnl that many releases anyway.
PCRE_MAJOR=4
-PCRE_MINOR=0
-PCRE_DATE=17-Feb-2003
+PCRE_MINOR=1
+PCRE_DATE=12-Mar-2003
PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR}
dnl Default values for miscellaneous macros
POSIX_MALLOC_THRESHOLD=-DPOSIX_MALLOC_THRESHOLD=10
-HOST_CC=${HOST_CC:-'$(CC)'}
-HOST_CFLAGS=${HOST_CFLAGS:-'$(CFLAGS)'}
dnl Provide versioning information for libtool shared libraries that
dnl are built by default on Unix systems.
@@ -40,6 +38,16 @@ AC_PROG_INSTALL
AC_LIBTOOL_WIN32_DLL
AC_PROG_LIBTOOL
+dnl This is the compiler for compiling a program to run on the local host
+dnl while building. It needs to be different from CC when cross-compiling.
+dnl There is a macro called AC_PROG_CC_FOR_BUILD in the GNU archive for
+dnl figuring this out automatically, but I could not get it to work and I
+dnl ran out of time. Therefore, these values will have to be set manually
+dnl by people who are cross-compiling.
+
+CC_FOR_BUILD=${CC_FOR_BUILD:-'$(CC)'}
+CFLAGS_FOR_BUILD=${CFLAGS_FOR_BUILD:-'$(CFLAGS)'}
+
dnl Checks for header files.
AC_HEADER_STDC
@@ -112,14 +120,13 @@ AC_PROG_LIBTOOL
dnl "Export" these variables
+AC_SUBST(CC_FOR_BUILD)
+AC_SUBST(CFLAGS_FOR_BUILD)
AC_SUBST(HAVE_MEMMOVE)
AC_SUBST(HAVE_STRERROR)
-AC_SUBST(HOST_CC)
-AC_SUBST(HOST_CFLAGS)
-AC_SUBST(NEWLINE)
AC_SUBST(LINK_SIZE)
AC_SUBST(MATCH_LIMIT)
-AC_SUBST(UTF8)
+AC_SUBST(NEWLINE)
AC_SUBST(PCRE_MAJOR)
AC_SUBST(PCRE_MINOR)
AC_SUBST(PCRE_DATE)
@@ -127,6 +134,7 @@ AC_SUBST(PCRE_VERSION)
AC_SUBST(PCRE_LIB_VERSION)
AC_SUBST(PCRE_POSIXLIB_VERSION)
AC_SUBST(POSIX_MALLOC_THRESHOLD)
+AC_SUBST(UTF8)
dnl Stuff to make Win32 work better
diff --git a/dftables.c b/dftables.c
index fe4ffcd..9aa7b77 100644
--- a/dftables.c
+++ b/dftables.c
@@ -55,13 +55,17 @@ int main(void)
int i;
const unsigned char *tables = pcre_maketables();
+/* There are two printf() calls here, because gcc in pedantic mode complains
+about the very long string otherwise. */
+
printf(
"/*************************************************\n"
"* Perl-Compatible Regular Expressions *\n"
"*************************************************/\n\n"
"/* This file is automatically written by the dftables auxiliary \n"
"program. If you edit it by hand, you might like to edit the Makefile to \n"
- "prevent its ever being regenerated.\n\n"
+ "prevent its ever being regenerated.\n\n");
+printf(
"This file is #included in the compilation of pcre.c to build the default\n"
"character tables which are used when no tables are passed to the compile\n"
"function. */\n\n"
diff --git a/doc/html/pcreapi.html b/doc/html/pcreapi.html
index 3bc71b1..9a479e8 100644
--- a/doc/html/pcreapi.html
+++ b/doc/html/pcreapi.html
@@ -633,11 +633,13 @@ fourth argument should point to an <b>unsigned char *</b> variable.
</PRE>
</P>
<P>
-For a non-anchored pattern, return the value of the rightmost literal byte
-which must exist in any matched string, other than at its start. The fourth
-argument should point to an <b>int</b> variable. If there is no such byte, or if
-the pattern is anchored, -1 is returned. For example, for the pattern
-/a\d+z\d+/ the returned value is 'z'.
+Return the value of the rightmost literal byte that must exist in any matched
+string, other than at its start, if such a byte has been recorded. The fourth
+argument should point to an <b>int</b> variable. If there is no such byte, -1 is
+returned. For anchored patterns, a last literal byte is recorded only if it
+follows something of variable length. For example, for the pattern
+/^a\d+z\d+/ the returned value is "z", but for /^a\dz\d/ the returned value
+is -1.
</P>
<P>
<pre>
diff --git a/doc/pcre.txt b/doc/pcre.txt
index 07a1dd7..af147c0 100644
--- a/doc/pcre.txt
+++ b/doc/pcre.txt
@@ -819,12 +819,14 @@ INFORMATION ABOUT A PATTERN
PCRE_INFO_LASTLITERAL
- For a non-anchored pattern, return the value of the right-
- most literal byte which must exist in any matched string,
- other than at its start. The fourth argument should point to
- an int variable. If there is no such byte, or if the pattern
- is anchored, -1 is returned. For example, for the pattern
- /a\d+z\d+/ the returned value is 'z'.
+ Return the value of the rightmost literal byte that must
+ exist in any matched string, other than at its start, if
+ such a byte has been recorded. The fourth argument should
+ point to an int variable. If there is no such byte, -1 is
+ returned. For anchored patterns, a last literal byte is
+ recorded only if it follows something of variable length.
+ For example, for the pattern /^a\d+z\d+/ the returned value
+ is "z", but for /^a\dz\d/ the returned value is -1.
PCRE_INFO_NAMECOUNT
PCRE_INFO_NAMEENTRYSIZE
@@ -1127,6 +1129,7 @@ MATCHING A PATTERN
there are no capturing subpatterns, the return value from a
successful match is 1, indicating that just the first pair
of offsets has been set.
+
Some convenience functions are provided for extracting the
captured substrings as separate strings. These are described
in the following section.
@@ -1230,7 +1233,6 @@ EXTRACTING CAPTURED SUBSTRINGS BY NUMBER
int pcre_get_substring_list(const char *subject,
int *ovector, int stringcount, const char ***listptr);
-
Captured substrings can be accessed directly by using the
offsets returned by pcre_exec() in ovector. For convenience,
the functions pcre_copy_substring(), pcre_get_substring(),
diff --git a/doc/pcreapi.3 b/doc/pcreapi.3
index b808aba..fbd3d5d 100644
--- a/doc/pcreapi.3
+++ b/doc/pcreapi.3
@@ -520,11 +520,13 @@ fourth argument should point to an \fBunsigned char *\fR variable.
PCRE_INFO_LASTLITERAL
-For a non-anchored pattern, return the value of the rightmost literal byte
-which must exist in any matched string, other than at its start. The fourth
-argument should point to an \fBint\fR variable. If there is no such byte, or if
-the pattern is anchored, -1 is returned. For example, for the pattern
-/a\\d+z\\d+/ the returned value is 'z'.
+Return the value of the rightmost literal byte that must exist in any matched
+string, other than at its start, if such a byte has been recorded. The fourth
+argument should point to an \fBint\fR variable. If there is no such byte, -1 is
+returned. For anchored patterns, a last literal byte is recorded only if it
+follows something of variable length. For example, for the pattern
+/^a\\d+z\\d+/ the returned value is "z", but for /^a\\dz\\d/ the returned value
+is -1.
PCRE_INFO_NAMECOUNT
PCRE_INFO_NAMEENTRYSIZE
diff --git a/internal.h b/internal.h
index d7a47df..aa05fc0 100644
--- a/internal.h
+++ b/internal.h
@@ -214,10 +214,11 @@ time, run time or study time, respectively. */
#define REQ_UNSET (-2)
#define REQ_NONE (-1)
-/* Flags added to firstchar or reqchar */
+/* Flags added to firstbyte or reqbyte; a "non-literal" item is either a
+variable-length repeat, or anything other than literal characters. */
#define REQ_CASELESS 0x0100 /* indicates caselessness */
-#define REQ_EOL 0x0200 /* indicates reqchar followed by $ */
+#define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
/* Miscellaneous definitions */
@@ -570,6 +571,7 @@ typedef struct compile_data {
int name_entry_size; /* Size of each entry */
int top_backref; /* Maximum back reference */
unsigned int backref_map; /* Bitmap of low back refs */
+ int req_varyopt; /* "After variable item" flag for reqbyte */
} compile_data;
/* Structure for maintaining a chain of pointers to the currently incomplete
@@ -597,7 +599,7 @@ doing the matching, so that they are thread-safe. */
typedef struct match_data {
int match_call_count; /* As it says */
- int match_limit; /* As it says */
+ unsigned long int match_limit;/* As it says */
int *offset_vector; /* Offset vector */
int offset_end; /* One past the end */
int offset_max; /* The maximum usable for return data */
diff --git a/pcre.c b/pcre.c
index 0018613..458d0c6 100644
--- a/pcre.c
+++ b/pcre.c
@@ -68,7 +68,6 @@ compile time. */
#define BRASTACK_SIZE 200
-
/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */
@@ -84,6 +83,12 @@ test output would be different, which just complicates things.) */
#define MAXLIT 250
+/* The maximum remaining length of subject we are prepared to search for a
+req_byte match. */
+
+#define REQ_BYTE_MAX 1000
+
+
/* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
the definition is next to the definition of the opcodes in internal.h. */
@@ -1138,6 +1143,10 @@ Returns: pointer to the opcode for the bracket, or NULL if not found
static const uschar *
find_bracket(const uschar *code, BOOL utf8, int number)
{
+#ifndef SUPPORT_UTF8
+utf8 = utf8; /* Stop pedantic compilers complaining */
+#endif
+
for (;;)
{
register int c = *code;
@@ -1453,7 +1462,7 @@ int length;
int greedy_default, greedy_non_default;
int firstbyte, reqbyte;
int zeroreqbyte, zerofirstbyte;
-int req_caseopt;
+int req_caseopt, reqvary, tempreqvary;
int condcount = 0;
int options = *optionsptr;
register int c;
@@ -1699,7 +1708,7 @@ for (;; ptr++)
posix_class *= 3;
for (i = 0; i < 3; i++)
{
- BOOL isblank = strncmp(ptr, "blank", 5) == 0;
+ BOOL isblank = strncmp((char *)ptr, "blank", 5) == 0;
int taboffset = posix_class_maps[posix_class + i];
if (taboffset < 0) break;
if (local_negate)
@@ -1949,7 +1958,7 @@ for (;; ptr++)
else
{
zerofirstbyte = firstbyte;
- reqbyte = class_lastchar | req_caseopt;
+ reqbyte = class_lastchar | req_caseopt | cd->req_varyopt;
}
*code++ = OP_CHARS;
*code++ = 1;
@@ -2053,10 +2062,14 @@ for (;; ptr++)
if (repeat_min == 0)
{
- firstbyte = zerofirstbyte; /* Adjust for zero repeat */
- reqbyte = zeroreqbyte; /* Ditto */
+ firstbyte = zerofirstbyte; /* Adjust for zero repeat */
+ reqbyte = zeroreqbyte; /* Ditto */
}
+ /* Remember whether this is a variable length repeat */
+
+ reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
+
op_type = 0; /* Default single-char op codes */
possessive_quantifier = FALSE; /* Default not possessive quantifier */
@@ -2142,7 +2155,7 @@ for (;; ptr++)
if (code == previous + 2) /* There was only one character */
{
code = previous; /* Abolish the previous item */
- if (repeat_min > 1) reqbyte = c | req_caseopt;
+ if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
}
else
{
@@ -2501,10 +2514,13 @@ for (;; ptr++)
PUT(tempcode, 1, len);
}
- /* In all case we no longer have a previous item. */
+ /* In all cases we no longer have a previous item. We also set the
+ "follows varying string" flag for subsequently encountered reqbytes if
+ it isn't already set and we have just passed a varying length item. */
END_REPEAT:
previous = NULL;
+ cd->req_varyopt |= reqvary;
break;
@@ -2553,7 +2569,8 @@ for (;; ptr++)
else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
{
- int condref = *(++ptr) - '0';
+ int condref; /* Don't amalgamate; some compilers */
+ condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
if (condref == 0)
{
@@ -2619,19 +2636,24 @@ for (;; ptr++)
if (*(++ptr) == '<') /* Definition */
{
int i, namelen;
- const uschar *name = ++ptr;
uschar *slot = cd->name_table;
+ const uschar *name; /* Don't amalgamate; some compilers */
+ name = ++ptr; /* grumble at autoincrement in declaration */
while (*ptr++ != '>');
namelen = ptr - name - 1;
for (i = 0; i < cd->names_found; i++)
{
- int c = memcmp(name, slot+2, namelen + 1);
+ int c = memcmp(name, slot+2, namelen);
if (c == 0)
{
- *errorptr = ERR43;
- goto FAILED;
+ if (slot[2+namelen] == 0)
+ {
+ *errorptr = ERR43;
+ goto FAILED;
+ }
+ c = -1; /* Current name is substring */
}
if (c < 0)
{
@@ -2661,7 +2683,7 @@ for (;; ptr++)
for (i = 0; i < cd->names_found; i++)
{
- if (strncmp(name, slot+2, namelen) == 0) break;
+ if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
slot += cd->name_entry_size;
}
if (i >= cd->names_found)
@@ -2839,6 +2861,7 @@ for (;; ptr++)
previous = (bravalue >= OP_ONCE)? code : NULL;
*code = bravalue;
tempcode = code;
+ tempreqvary = cd->req_varyopt; /* Save value before bracket */
if (!compile_regex(
newoptions, /* The complete new option state */
@@ -2917,12 +2940,14 @@ for (;; ptr++)
}
/* If firstbyte was previously set, convert the subpattern's firstbyte
- into reqbyte if there wasn't one. */
+ into reqbyte if there wasn't one, using the vary flag that was in
+ existence beforehand. */
- else if (subfirstbyte >= 0 && subreqbyte < 0) subreqbyte = subfirstbyte;
+ else if (subfirstbyte >= 0 && subreqbyte < 0)
+ subreqbyte = subfirstbyte | tempreqvary;
- /* If the subpattern set a required char (or set a first char that isn't
- really the first char - see above), set it. */
+ /* If the subpattern set a required byte (or set a first byte that isn't
+ really the first byte - see above), set it. */
if (subreqbyte >= 0) reqbyte = subreqbyte;
}
@@ -3140,7 +3165,8 @@ for (;; ptr++)
if (firstbyte == REQ_UNSET)
{
zerofirstbyte = firstbyte = previous[2] | req_caseopt;
- zeroreqbyte = (t - 1 == previous + 2)? reqbyte : t[-1] | req_caseopt;
+ zeroreqbyte = (t - 1 == previous + 2)?
+ reqbyte : t[-1] | req_caseopt | cd->req_varyopt;
}
/* If there was a previous first byte, leave it alone, and don't change
@@ -3150,14 +3176,14 @@ for (;; ptr++)
else
{
zerofirstbyte = firstbyte;
- zeroreqbyte = t[-1] | req_caseopt;
+ zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt;
}
}
/* In all cases (we know length > 1), the new required byte is the last
byte of the string. */
- reqbyte = code[-1] | req_caseopt;
+ reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
}
else /* End of UTF-8 coding */
@@ -3180,8 +3206,9 @@ for (;; ptr++)
else
{
zerofirstbyte = firstbyte = previous[2] | req_caseopt;
- zeroreqbyte = (length > 2)? (code[-2] | req_caseopt) : reqbyte;
- reqbyte = code[-1] | req_caseopt;
+ zeroreqbyte = (length > 2)?
+ (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte;
+ reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
}
}
@@ -3190,8 +3217,9 @@ for (;; ptr++)
else
{
zerofirstbyte = firstbyte;
- zeroreqbyte = (length == 1)? reqbyte : code[-2] | req_caseopt;
- reqbyte = code[-1] | req_caseopt;
+ zeroreqbyte = (length == 1)? reqbyte :
+ code[-2] | req_caseopt | cd->req_varyopt;
+ reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
}
}
@@ -3308,7 +3336,9 @@ for (;;)
}
/* If this is not the first branch, the first char and reqbyte have to
- match the values from all the previous branches. */
+ match the values from all the previous branches, except that if the previous
+ value for reqbyte didn't have REQ_VARY set, it can still match, and we set
+ REQ_VARY for the regex. */
else
{
@@ -3330,7 +3360,9 @@ for (;;)
/* Now ensure that the reqbytes match */
- if (reqbyte != branchreqbyte) reqbyte = REQ_NONE;
+ if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
+ reqbyte = REQ_NONE;
+ else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
}
/* If lookbehind, check that this branch matches a fixed-length string,
@@ -4168,7 +4200,8 @@ while ((c = *(++ptr)) != 0)
ptr += 3;
if (*ptr == '<')
{
- const uschar *p = ++ptr;
+ const uschar *p; /* Don't amalgamate; some compilers */
+ p = ++ptr; /* grumble at autoincrement in declaration */
while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
if (*ptr != '>')
{
@@ -4599,6 +4632,7 @@ compile_block.name_entry_size = max_name_size + 3;
compile_block.name_table = (uschar *)re + sizeof(real_pcre);
codestart = compile_block.name_table + re->name_entry_size * re->name_count;
compile_block.start_code = codestart;
+compile_block.req_varyopt = 0;
/* Set up a starting, non-extracting bracket, then compile the expression. On
error, *errorptr will be set non-NULL, so we don't need to look at the result
@@ -4672,13 +4706,12 @@ if ((options & PCRE_ANCHORED) == 0)
}
}
-/* Save the last required character if any. Remove caseless flag for
-non-caseable chars. */
+/* For an anchored pattern, we use the "required byte" only if it follows a
+variable length item in the regex. Remove the caseless flag for non-caseable
+chars. */
-if ((re->options & PCRE_ANCHORED) != 0 && reqbyte < 0 && firstbyte >= 0)
- reqbyte = firstbyte;
-
-if (reqbyte >= 0)
+if (reqbyte >= 0 &&
+ ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
{
int ch = reqbyte & 255;
re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
@@ -5263,7 +5296,7 @@ for (;;)
(pcre_free)(new_recursive.offset_save);
return MATCH_NOMATCH;
}
- break;
+ /* Control never reaches here */
/* "Once" brackets are like assertion brackets except that after a match,
the point in the subject string is not moved back. Thus there can never be
@@ -7370,9 +7403,14 @@ do
optimization can save a huge amount of backtracking in patterns with nested
unlimited repeats that aren't going to match. Writing separate code for
cased/caseless versions makes it go faster, as does using an autoincrement
- and backing off on a match. */
+ and backing off on a match.
+
+ HOWEVER: when the subject string is very, very long, searching to its end can
+ take a long time, and give bad performance on quite ordinary patterns. This
+ showed up when somebody was matching /^C/ on a 32-megabyte string... so we
+ don't do this when the string is sufficiently long. */
- if (req_byte >= 0)
+ if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
{
register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
diff --git a/pcretest.c b/pcretest.c
index 1b79669..2edfdc3 100644
--- a/pcretest.c
+++ b/pcretest.c
@@ -407,7 +407,8 @@ while (argc > 1 && argv[op][0] == '-')
else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
- ((size_offsets = get_value(argv[op+1], &endptr)), *endptr == 0))
+ ((size_offsets = get_value((unsigned char *)argv[op+1], &endptr)),
+ *endptr == 0))
{
op++;
argc--;
@@ -1017,7 +1018,7 @@ while (!done)
uschar *pp = name;
while (isalnum(*p)) *pp++ = *p++;
*pp = 0;
- n = pcre_get_stringnumber(re, name);
+ n = pcre_get_stringnumber(re, (char *)name);
if (n < 0)
fprintf(outfile, "no parentheses with name \"%s\"\n", name);
else copystrings |= 1 << n;
@@ -1070,7 +1071,7 @@ while (!done)
uschar *pp = name;
while (isalnum(*p)) *pp++ = *p++;
*pp = 0;
- n = pcre_get_stringnumber(re, name);
+ n = pcre_get_stringnumber(re, (char *)name);
if (n < 0)
fprintf(outfile, "no parentheses with name \"%s\"\n", name);
else getstrings |= 1 << n;
diff --git a/testdata/testinput1 b/testdata/testinput1
index 02211ac..22bf6b1 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -3820,4 +3820,7 @@
/\M/
M
+/(a+)*b/
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+
/ End of testinput1 /
diff --git a/testdata/testinput2 b/testdata/testinput2
index d46546a..2748c6a 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -1225,4 +1225,8 @@
zzaa\CZ
zzaa\CA
+/(?P<x>eks)(?P<x>eccs)/
+
+/(?P<abc>abc(?P<def>def)(?P<abc>xyz))/
+
/ End of testinput2 /
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index a01bb93..b5b01ad 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -1,4 +1,4 @@
-PCRE version 4.0 17-Feb-2003
+PCRE version 4.1 12-Mar-2003
/the quick brown fox/
the quick brown fox
@@ -6243,5 +6243,9 @@ No match
M
0: M
+/(a+)*b/
+ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+No match
+
/ End of testinput1 /
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 3f38522..f9bb2ae 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -1,4 +1,4 @@
-PCRE version 4.0 17-Feb-2003
+PCRE version 4.1 12-Mar-2003
/(a)b|/
Capturing subpattern count = 1
@@ -28,7 +28,7 @@ No match
Capturing subpattern count = 0
Options: anchored
No first char
-Need char = 'c'
+No need char
abc
0: abc
\Aabc
@@ -68,7 +68,7 @@ No need char
Capturing subpattern count = 0
Options: anchored
No first char
-Need char = 'c'
+No need char
abc
0: abc
*** Failers
@@ -302,7 +302,7 @@ Need char = 'x'
Capturing subpattern count = 0
Options: anchored
No first char
-Need char = 'x'
+No need char
the quick brown fox
0: the quick brown fox
*** Failers
@@ -504,7 +504,7 @@ No need char
Capturing subpattern count = 0
Options: anchored caseless
No first char
-Need char = '4'
+No need char
/(^b|(?i)^d)/
Capturing subpattern count = 1
@@ -651,7 +651,7 @@ No match
Capturing subpattern count = 0
Options: anchored multiline
No first char
-Need char = 'c'
+No need char
/^abc/m
Capturing subpattern count = 0
@@ -663,7 +663,7 @@ Need char = 'c'
Capturing subpattern count = 5
Options: anchored
No first char
-Need char = 'a'
+No need char
aaaaabbbbbcccccdef
0: aaaaabbbbbcccccdef
1: aaaaabbbbbcccccdef
@@ -837,7 +837,7 @@ Capturing subpattern count = 1
Max back reference = 1
Options: anchored
No first char
-Need char = 'a'
+No need char
aaaaaa
0: aaaaaa
1: aa
@@ -1027,7 +1027,7 @@ copy substring 1 failed -6
Capturing subpattern count = 3
Options: anchored
No first char
-Need char = 'f'
+No need char
adef\G1\G2\G3\G4\L
0: adef
1: a
@@ -1065,7 +1065,7 @@ get substring 4 failed -7
Capturing subpattern count = 0
Options: anchored
No first char
-Need char = 'f'
+No need char
abc\00def\L\C0
0: abc\x00def
0C abc (7)
@@ -1254,7 +1254,7 @@ Need char = 's'
Capturing subpattern count = 0
Options: anchored
No first char
-Need char = 's'
+No need char
ississippi
0: iss
0+ issippi
@@ -1304,7 +1304,7 @@ Need char = 'i'
Capturing subpattern count = 0
Options: anchored
No first char
-Need char = 's'
+No need char
Mississippi
0: Mis
0+ sissippi
@@ -1313,7 +1313,7 @@ Need char = 's'
Capturing subpattern count = 0
Options: anchored
No first char
-Need char = 10
+No need char
ab\nab\ncd
0: ab\x0a
0+ ab\x0acd
@@ -2420,7 +2420,7 @@ Need char = 's'
Capturing subpattern count = 2
Options: anchored
No first char
-Need char = 'a'
+No need char
aba
0: aba
1: a
@@ -2430,7 +2430,7 @@ Need char = 'a'
Capturing subpattern count = 2
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbaa
0: aabbaa
1: aa
@@ -2440,7 +2440,7 @@ Need char = 'a'
Capturing subpattern count = 2
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbaa
0: aabbaa
1: aa
@@ -2450,7 +2450,7 @@ Need char = 'a'
Capturing subpattern count = 2
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbaa
0: aabbaa
1: aa
@@ -2460,7 +2460,7 @@ Need char = 'a'
Capturing subpattern count = 1
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbaa
0: aabbaa
1: bb
@@ -2469,7 +2469,7 @@ Need char = 'a'
Capturing subpattern count = 3
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbaa
0: aabbaa
1: aa
@@ -2480,7 +2480,7 @@ Need char = 'a'
Capturing subpattern count = 2
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbaa
0: aabbaa
1: bb
@@ -2490,7 +2490,7 @@ Need char = 'a'
Capturing subpattern count = 1
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbaa
0: aabbaa
1: bb
@@ -2499,7 +2499,7 @@ Need char = 'a'
Capturing subpattern count = 1
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbbaa
0: aabbbaa
1: bbb
@@ -2508,7 +2508,7 @@ Need char = 'a'
Capturing subpattern count = 1
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbbaa
0: aabbbaa
1: bbb
@@ -2517,7 +2517,7 @@ Need char = 'a'
Capturing subpattern count = 1
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbaa
0: aabbaa
1: b
@@ -2526,7 +2526,7 @@ Need char = 'a'
Capturing subpattern count = 1
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbbaa
0: aabbbaa
1: bb
@@ -2535,7 +2535,7 @@ Need char = 'a'
Capturing subpattern count = 3
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbbaa
0: aabbbaa
1: aa
@@ -2546,7 +2546,7 @@ Need char = 'a'
Capturing subpattern count = 3
Options: anchored
No first char
-Need char = 'a'
+No need char
aabbbbaa
0: aabbbbaa
1: aa
@@ -3047,7 +3047,7 @@ Need char = 'b'
Capturing subpattern count = 5
Options: anchored
No first char
-Need char = 'a'
+No need char
/^x(?U)a+b/D
------------------------------------------------------------------
@@ -3735,7 +3735,7 @@ Need char = 'c'
Capturing subpattern count = 0
Options: anchored
No first char
-Need char = 'c'
+No need char
/(?C)a|b/S
Capturing subpattern count = 0
@@ -3917,7 +3917,7 @@ No match
Capturing subpattern count = 2
Options: anchored
No first char
-Need char = 'z'
+No need char
xyz
0: xyz
1: xyz
@@ -3977,7 +3977,7 @@ Failed: reference to non-existent subpattern at offset 4
Capturing subpattern count = 1
Options: anchored
No first char
-Need char = 'f'
+No need char
abcdefabc
0: abcdefabc
1: abc
@@ -3986,7 +3986,7 @@ Need char = 'f'
Capturing subpattern count = 1
Options: anchored
No first char
-Need char = '='
+No need char
a=a
0: a=a
1: a
@@ -4001,7 +4001,7 @@ Need char = '='
Capturing subpattern count = 2
Options: anchored
No first char
-Need char = '='
+No need char
a=a
0: a=a
1: a
@@ -4408,6 +4408,12 @@ Need char = 'a'
2: aa
2C aa (2)
+/(?P<x>eks)(?P<x>eccs)/
+Failed: two named groups have the same name at offset 16
+
+/(?P<abc>abc(?P<def>def)(?P<abc>xyz))/
+Failed: two named groups have the same name at offset 31
+
/ End of testinput2 /
Capturing subpattern count = 0
No options
diff --git a/testdata/testoutput3 b/testdata/testoutput3
index 6fdb681..4ec3489 100644
--- a/testdata/testoutput3
+++ b/testdata/testoutput3
@@ -1,4 +1,4 @@
-PCRE version 4.0 17-Feb-2003
+PCRE version 4.1 12-Mar-2003
/^[\w]+/
*** Failers
diff --git a/testdata/testoutput4 b/testdata/testoutput4
index 82c5e2a..18b4036 100644
--- a/testdata/testoutput4
+++ b/testdata/testoutput4
@@ -1,4 +1,4 @@
-PCRE version 4.0 17-Feb-2003
+PCRE version 4.1 12-Mar-2003
/-- Do not use the \x{} construct except with patterns that have the --/
/-- /8 option set, because PCRE doesn't recognize them as UTF-8 unless --/
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 3491576..c8daba0 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -1,4 +1,4 @@
-PCRE version 4.0 17-Feb-2003
+PCRE version 4.1 12-Mar-2003
/\x{100}/8DM
Memory allocation (code space): 11