diff options
author | nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-02-24 21:40:08 +0000 |
---|---|---|
committer | nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-02-24 21:40:08 +0000 |
commit | 776635721f3417106a59041ad7e77c4c815e1bd0 (patch) | |
tree | 1bbd6ba83e3faa1b6cebf413d2e080e51f95e483 | |
parent | c8cb607ab7e12e185e86a8b23d413b7f9536f24c (diff) | |
download | pcre-776635721f3417106a59041ad7e77c4c815e1bd0.tar.gz |
Load pcre-4.1 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@65 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 44 | ||||
-rw-r--r-- | Makefile.in | 12 | ||||
-rw-r--r-- | README | 10 | ||||
-rwxr-xr-x | configure | 36 | ||||
-rw-r--r-- | configure.in | 24 | ||||
-rw-r--r-- | dftables.c | 6 | ||||
-rw-r--r-- | doc/html/pcreapi.html | 12 | ||||
-rw-r--r-- | doc/pcre.txt | 16 | ||||
-rw-r--r-- | doc/pcreapi.3 | 12 | ||||
-rw-r--r-- | internal.h | 8 | ||||
-rw-r--r-- | pcre.c | 112 | ||||
-rw-r--r-- | pcretest.c | 7 | ||||
-rw-r--r-- | testdata/testinput1 | 3 | ||||
-rw-r--r-- | testdata/testinput2 | 4 | ||||
-rw-r--r-- | testdata/testoutput1 | 6 | ||||
-rw-r--r-- | testdata/testoutput2 | 72 | ||||
-rw-r--r-- | testdata/testoutput3 | 2 | ||||
-rw-r--r-- | testdata/testoutput4 | 2 | ||||
-rw-r--r-- | testdata/testoutput5 | 2 |
19 files changed, 256 insertions, 134 deletions
@@ -1,8 +1,48 @@ ChangeLog for PCRE ------------------ -Version 4.00 17-Feb-03 ----------------------- +Version 4.1 12-Mar-03 +--------------------- + +1. Compiling with gcc -pedantic found a couple of places where casts were +needed, and a string in dftables.c that was longer than standard compilers are +required to support. + +2. Compiling with Sun's compiler found a few more places where the code could +be tidied up in order to avoid warnings. + +3. The variables for cross-compiling were called HOST_CC and HOST_CFLAGS; the +first of these names is deprecated in the latest Autoconf in favour of the name +CC_FOR_BUILD, because "host" is typically used to mean the system on which the +compiled code will be run. I can't find a reference for HOST_CFLAGS, but by +analogy I have changed it to CFLAGS_FOR_BUILD. + +4. Added -no-undefined to the linking command in the Makefile, because this is +apparently helpful for Windows. To make it work, also added "-L. -lpcre" to the +linking step for the pcreposix library. + +5. PCRE was failing to diagnose the case of two named groups with the same +name. + +6. A problem with one of PCRE's optimizations was discovered. PCRE remembers a +literal character that is needed in the subject for a match, and scans along to +ensure that it is present before embarking on the full matching process. This +saves time in cases of nested unlimited repeats that are never going to match. +Problem: the scan can take a lot of time if the subject is very long (e.g. +megabytes), thus penalizing straightforward matches. It is now done only if the +amount of subject to be scanned is less than 1000 bytes. + +7. A lesser problem with the same optimization is that it was recording the +first character of an anchored pattern as "needed", thus provoking a search +right along the subject, even when the first match of the pattern was going to +fail. The "needed" character is now not set for anchored patterns, unless it +follows something in the pattern that is of non-fixed length. Thus, it still +fulfils its original purpose of finding quick non-matches in cases of nested +unlimited repeats, but isn't used for simple anchored patterns such as /^abc/. + + +Version 4.0 17-Feb-03 +--------------------- 1. If a comment in an extended regex that started immediately after a meta-item extended to the end of string, PCRE compiled incorrect data. This could lead to diff --git a/Makefile.in b/Makefile.in index 5d621b8..510d8f1 100644 --- a/Makefile.in +++ b/Makefile.in @@ -56,9 +56,9 @@ MANDIR = @mandir@ CC = @CC@ CFLAGS = @CFLAGS@ -HOST_CC = @HOST_CC@ -HOST_CFLAGS = @HOST_CFLAGS@ -UTF8 = @UTF8@ +CC_FOR_BUILD = @CC_FOR_BUILD@ +CFLAGS_FOR_BUILD = @CFLAGS_FOR_BUILD@ +UTF8 = @UTF8@ NEWLINE = @NEWLINE@ POSIX_MALLOC_THRESHOLD = @POSIX_MALLOC_THRESHOLD@ LINK_SIZE = @LINK_SIZE@ @@ -72,7 +72,7 @@ INSTALL_DATA = @INSTALL_DATA@ LIBTOOL = @LIBTOOL@ LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) -c $(CFLAGS) -I. -I$(top_srcdir) $(NEWLINE) $(LINK_SIZE) $(MATCH_LIMIT) -LINK = $(LIBTOOL) --mode=link $(CC) $(CFLAGS) -I. -I$(top_srcdir) +LINK = $(LIBTOOL) --mode=link $(CC) --no-undefined $(CFLAGS) -I. -I$(top_srcdir) # These are the version numbers for the shared libraries @@ -101,7 +101,7 @@ libpcre.la: $(OBJ) libpcreposix.la: pcreposix.o -rm -f libpcreposix.la - $(LINK) -rpath $(LIBDIR) -version-info \ + $(LINK) -rpath $(LIBDIR) -L. -lpcre -version-info \ '$(PCREPOSIXLIBVERSION)' -o libpcreposix.la pcreposix.lo pcre.o: $(top_srcdir)/chartables.c $(top_srcdir)/pcre.c \ @@ -140,7 +140,7 @@ $(top_srcdir)/chartables.c: dftables dftables.o: $(top_srcdir)/dftables.c $(top_srcdir)/maketables.c \ $(top_srcdir)/internal.h pcre.h config.h Makefile - $(HOST_CC) -c $(HOST_CFLAGS) -I. $(top_srcdir)/dftables.c + $(CC_FOR_BUILD) -c $(CFLAGS_FOR_BUILD) -I. $(top_srcdir)/dftables.c dftables: dftables.o $(LINK) -o dftables dftables.o @@ -125,6 +125,16 @@ included in makefiles for programs that use PCRE, saving the programmer from having to remember too many details. +Cross-compiling PCRE on a Unix-like system +------------------------------------------ + +PCRE needs to compile and run an auxiliary program as part of the building +process. Obviously, if the real compilation is for some other system, it can't +use the same CC and CFLAGS values when it is doing this. For cross compilation, +therefore, you must set CC_FOR_BUILD to the local host's compiler, and you can +set flags in CFLAGS_FOR_BUILD if you need to. + + Shared libraries on Unix-like systems ------------------------------------- @@ -466,7 +466,7 @@ ac_includes_default="\ # include <unistd.h> #endif" -ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA build build_cpu build_vendor build_os host host_cpu host_vendor host_os LN_S ECHO RANLIB ac_ct_RANLIB STRIP ac_ct_STRIP CPP EGREP LIBTOOL HAVE_MEMMOVE HAVE_STRERROR HOST_CC HOST_CFLAGS NEWLINE LINK_SIZE MATCH_LIMIT UTF8 PCRE_MAJOR PCRE_MINOR PCRE_DATE PCRE_VERSION PCRE_LIB_VERSION PCRE_POSIXLIB_VERSION POSIX_MALLOC_THRESHOLD POSIX_OBJ POSIX_LOBJ POSIX_LIB LIBOBJS LTLIBOBJS' +ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA build build_cpu build_vendor build_os host host_cpu host_vendor host_os LN_S ECHO RANLIB ac_ct_RANLIB STRIP ac_ct_STRIP CPP EGREP LIBTOOL CC_FOR_BUILD CFLAGS_FOR_BUILD HAVE_MEMMOVE HAVE_STRERROR LINK_SIZE MATCH_LIMIT NEWLINE PCRE_MAJOR PCRE_MINOR PCRE_DATE PCRE_VERSION PCRE_LIB_VERSION PCRE_POSIXLIB_VERSION POSIX_MALLOC_THRESHOLD UTF8 POSIX_OBJ POSIX_LOBJ POSIX_LIB LIBOBJS LTLIBOBJS' ac_subst_files='' # Initialize some variables set by options. @@ -939,12 +939,6 @@ if test "$ac_init_help" = "long"; then # The list generated by autoconf has been trimmed to remove many # options that are totally irrelevant to PCRE (e.g. relating to X), # or are not supported by its Makefile. - # The list generated by autoconf has been trimmed to remove many - # options that are totally irrelevant to PCRE (e.g. relating to X), - # or are not supported by its Makefile. - # The list generated by autoconf has been trimmed to remove many - # options that are totally irrelevant to PCRE (e.g. relating to X), - # or are not supported by its Makefile. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF \`configure' configures this package to adapt to many kinds of systems. @@ -1438,14 +1432,12 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu PCRE_MAJOR=4 -PCRE_MINOR=0 -PCRE_DATE=17-Feb-2003 +PCRE_MINOR=1 +PCRE_DATE=12-Mar-2003 PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR} POSIX_MALLOC_THRESHOLD=-DPOSIX_MALLOC_THRESHOLD=10 -HOST_CC=${HOST_CC:-'$(CC)'} -HOST_CFLAGS=${HOST_CFLAGS:-'$(CFLAGS)'} PCRE_LIB_VERSION=0:1:0 @@ -4007,7 +3999,7 @@ test "x$enable_libtool_lock" != xno && enable_libtool_lock=yes case $host in *-*-irix6*) # Find out which ABI we are using. - echo '#line 4011 "configure"' > conftest.$ac_ext + echo '#line 4009 "configure"' > conftest.$ac_ext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>&5 ac_status=$? @@ -4547,7 +4539,7 @@ chmod -w . save_CFLAGS="$CFLAGS" CFLAGS="$CFLAGS -o out/conftest2.$ac_objext" compiler_c_o=no -if { (eval echo configure:4551: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then +if { (eval echo configure:4549: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.$ac_objext; then # The compiler can only warn and ignore the option if not recognized # So say no if there are warnings if test -s out/conftest.err; then @@ -5102,7 +5094,7 @@ else ;; darwin* | rhapsody*) - # This patch put in by hand by PH (19-Feb-2003) for Darwin 1.3. + # This patch put in by hand by PH (12-Mar-2003) for Darwin 1.3. case "$host_os" in rhapsody* | darwin1.[[012]]) allow_undefined_flag='-undefined suppress' @@ -6217,7 +6209,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<EOF -#line 6211 "configure" +#line 6209 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -6315,7 +6307,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<EOF -#line 6309 "configure" +#line 6307 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -7045,6 +7037,10 @@ LIBTOOL='$(SHELL) $(top_builddir)/libtool' +CC_FOR_BUILD=${CC_FOR_BUILD:-'$(CC)'} +CFLAGS_FOR_BUILD=${CFLAGS_FOR_BUILD:-'$(CFLAGS)'} + + echo "$as_me:$LINENO: checking for ANSI C header files" >&5 echo $ECHO_N "checking for ANSI C header files... $ECHO_C" >&6 if test "${ac_cv_header_stdc+set}" = set; then @@ -8342,14 +8338,13 @@ s,@ac_ct_STRIP@,$ac_ct_STRIP,;t t s,@CPP@,$CPP,;t t s,@EGREP@,$EGREP,;t t s,@LIBTOOL@,$LIBTOOL,;t t +s,@CC_FOR_BUILD@,$CC_FOR_BUILD,;t t +s,@CFLAGS_FOR_BUILD@,$CFLAGS_FOR_BUILD,;t t s,@HAVE_MEMMOVE@,$HAVE_MEMMOVE,;t t s,@HAVE_STRERROR@,$HAVE_STRERROR,;t t -s,@HOST_CC@,$HOST_CC,;t t -s,@HOST_CFLAGS@,$HOST_CFLAGS,;t t -s,@NEWLINE@,$NEWLINE,;t t s,@LINK_SIZE@,$LINK_SIZE,;t t s,@MATCH_LIMIT@,$MATCH_LIMIT,;t t -s,@UTF8@,$UTF8,;t t +s,@NEWLINE@,$NEWLINE,;t t s,@PCRE_MAJOR@,$PCRE_MAJOR,;t t s,@PCRE_MINOR@,$PCRE_MINOR,;t t s,@PCRE_DATE@,$PCRE_DATE,;t t @@ -8357,6 +8352,7 @@ s,@PCRE_VERSION@,$PCRE_VERSION,;t t s,@PCRE_LIB_VERSION@,$PCRE_LIB_VERSION,;t t s,@PCRE_POSIXLIB_VERSION@,$PCRE_POSIXLIB_VERSION,;t t s,@POSIX_MALLOC_THRESHOLD@,$POSIX_MALLOC_THRESHOLD,;t t +s,@UTF8@,$UTF8,;t t s,@POSIX_OBJ@,$POSIX_OBJ,;t t s,@POSIX_LOBJ@,$POSIX_LOBJ,;t t s,@POSIX_LIB@,$POSIX_LIB,;t t diff --git a/configure.in b/configure.in index 77bdaa9..574ad60 100644 --- a/configure.in +++ b/configure.in @@ -17,15 +17,13 @@ dnl digits for minor numbers less than 10. There are unlikely to be dnl that many releases anyway. PCRE_MAJOR=4 -PCRE_MINOR=0 -PCRE_DATE=17-Feb-2003 +PCRE_MINOR=1 +PCRE_DATE=12-Mar-2003 PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR} dnl Default values for miscellaneous macros POSIX_MALLOC_THRESHOLD=-DPOSIX_MALLOC_THRESHOLD=10 -HOST_CC=${HOST_CC:-'$(CC)'} -HOST_CFLAGS=${HOST_CFLAGS:-'$(CFLAGS)'} dnl Provide versioning information for libtool shared libraries that dnl are built by default on Unix systems. @@ -40,6 +38,16 @@ AC_PROG_INSTALL AC_LIBTOOL_WIN32_DLL AC_PROG_LIBTOOL +dnl This is the compiler for compiling a program to run on the local host +dnl while building. It needs to be different from CC when cross-compiling. +dnl There is a macro called AC_PROG_CC_FOR_BUILD in the GNU archive for +dnl figuring this out automatically, but I could not get it to work and I +dnl ran out of time. Therefore, these values will have to be set manually +dnl by people who are cross-compiling. + +CC_FOR_BUILD=${CC_FOR_BUILD:-'$(CC)'} +CFLAGS_FOR_BUILD=${CFLAGS_FOR_BUILD:-'$(CFLAGS)'} + dnl Checks for header files. AC_HEADER_STDC @@ -112,14 +120,13 @@ AC_PROG_LIBTOOL dnl "Export" these variables +AC_SUBST(CC_FOR_BUILD) +AC_SUBST(CFLAGS_FOR_BUILD) AC_SUBST(HAVE_MEMMOVE) AC_SUBST(HAVE_STRERROR) -AC_SUBST(HOST_CC) -AC_SUBST(HOST_CFLAGS) -AC_SUBST(NEWLINE) AC_SUBST(LINK_SIZE) AC_SUBST(MATCH_LIMIT) -AC_SUBST(UTF8) +AC_SUBST(NEWLINE) AC_SUBST(PCRE_MAJOR) AC_SUBST(PCRE_MINOR) AC_SUBST(PCRE_DATE) @@ -127,6 +134,7 @@ AC_SUBST(PCRE_VERSION) AC_SUBST(PCRE_LIB_VERSION) AC_SUBST(PCRE_POSIXLIB_VERSION) AC_SUBST(POSIX_MALLOC_THRESHOLD) +AC_SUBST(UTF8) dnl Stuff to make Win32 work better @@ -55,13 +55,17 @@ int main(void) int i; const unsigned char *tables = pcre_maketables(); +/* There are two printf() calls here, because gcc in pedantic mode complains +about the very long string otherwise. */ + printf( "/*************************************************\n" "* Perl-Compatible Regular Expressions *\n" "*************************************************/\n\n" "/* This file is automatically written by the dftables auxiliary \n" "program. If you edit it by hand, you might like to edit the Makefile to \n" - "prevent its ever being regenerated.\n\n" + "prevent its ever being regenerated.\n\n"); +printf( "This file is #included in the compilation of pcre.c to build the default\n" "character tables which are used when no tables are passed to the compile\n" "function. */\n\n" diff --git a/doc/html/pcreapi.html b/doc/html/pcreapi.html index 3bc71b1..9a479e8 100644 --- a/doc/html/pcreapi.html +++ b/doc/html/pcreapi.html @@ -633,11 +633,13 @@ fourth argument should point to an <b>unsigned char *</b> variable. </PRE> </P> <P> -For a non-anchored pattern, return the value of the rightmost literal byte -which must exist in any matched string, other than at its start. The fourth -argument should point to an <b>int</b> variable. If there is no such byte, or if -the pattern is anchored, -1 is returned. For example, for the pattern -/a\d+z\d+/ the returned value is 'z'. +Return the value of the rightmost literal byte that must exist in any matched +string, other than at its start, if such a byte has been recorded. The fourth +argument should point to an <b>int</b> variable. If there is no such byte, -1 is +returned. For anchored patterns, a last literal byte is recorded only if it +follows something of variable length. For example, for the pattern +/^a\d+z\d+/ the returned value is "z", but for /^a\dz\d/ the returned value +is -1. </P> <P> <pre> diff --git a/doc/pcre.txt b/doc/pcre.txt index 07a1dd7..af147c0 100644 --- a/doc/pcre.txt +++ b/doc/pcre.txt @@ -819,12 +819,14 @@ INFORMATION ABOUT A PATTERN PCRE_INFO_LASTLITERAL - For a non-anchored pattern, return the value of the right- - most literal byte which must exist in any matched string, - other than at its start. The fourth argument should point to - an int variable. If there is no such byte, or if the pattern - is anchored, -1 is returned. For example, for the pattern - /a\d+z\d+/ the returned value is 'z'. + Return the value of the rightmost literal byte that must + exist in any matched string, other than at its start, if + such a byte has been recorded. The fourth argument should + point to an int variable. If there is no such byte, -1 is + returned. For anchored patterns, a last literal byte is + recorded only if it follows something of variable length. + For example, for the pattern /^a\d+z\d+/ the returned value + is "z", but for /^a\dz\d/ the returned value is -1. PCRE_INFO_NAMECOUNT PCRE_INFO_NAMEENTRYSIZE @@ -1127,6 +1129,7 @@ MATCHING A PATTERN there are no capturing subpatterns, the return value from a successful match is 1, indicating that just the first pair of offsets has been set. + Some convenience functions are provided for extracting the captured substrings as separate strings. These are described in the following section. @@ -1230,7 +1233,6 @@ EXTRACTING CAPTURED SUBSTRINGS BY NUMBER int pcre_get_substring_list(const char *subject, int *ovector, int stringcount, const char ***listptr); - Captured substrings can be accessed directly by using the offsets returned by pcre_exec() in ovector. For convenience, the functions pcre_copy_substring(), pcre_get_substring(), diff --git a/doc/pcreapi.3 b/doc/pcreapi.3 index b808aba..fbd3d5d 100644 --- a/doc/pcreapi.3 +++ b/doc/pcreapi.3 @@ -520,11 +520,13 @@ fourth argument should point to an \fBunsigned char *\fR variable. PCRE_INFO_LASTLITERAL -For a non-anchored pattern, return the value of the rightmost literal byte -which must exist in any matched string, other than at its start. The fourth -argument should point to an \fBint\fR variable. If there is no such byte, or if -the pattern is anchored, -1 is returned. For example, for the pattern -/a\\d+z\\d+/ the returned value is 'z'. +Return the value of the rightmost literal byte that must exist in any matched +string, other than at its start, if such a byte has been recorded. The fourth +argument should point to an \fBint\fR variable. If there is no such byte, -1 is +returned. For anchored patterns, a last literal byte is recorded only if it +follows something of variable length. For example, for the pattern +/^a\\d+z\\d+/ the returned value is "z", but for /^a\\dz\\d/ the returned value +is -1. PCRE_INFO_NAMECOUNT PCRE_INFO_NAMEENTRYSIZE @@ -214,10 +214,11 @@ time, run time or study time, respectively. */ #define REQ_UNSET (-2) #define REQ_NONE (-1) -/* Flags added to firstchar or reqchar */ +/* Flags added to firstbyte or reqbyte; a "non-literal" item is either a +variable-length repeat, or a anything other than literal characters. */ #define REQ_CASELESS 0x0100 /* indicates caselessness */ -#define REQ_EOL 0x0200 /* indicates reqchar followed by $ */ +#define REQ_VARY 0x0200 /* reqbyte followed non-literal item */ /* Miscellaneous definitions */ @@ -570,6 +571,7 @@ typedef struct compile_data { int name_entry_size; /* Size of each entry */ int top_backref; /* Maximum back reference */ unsigned int backref_map; /* Bitmap of low back refs */ + int req_varyopt; /* "After variable item" flag for reqbyte */ } compile_data; /* Structure for maintaining a chain of pointers to the currently incomplete @@ -597,7 +599,7 @@ doing the matching, so that they are thread-safe. */ typedef struct match_data { int match_call_count; /* As it says */ - int match_limit; /* As it says */ + unsigned long int match_limit;/* As it says */ int *offset_vector; /* Offset vector */ int offset_end; /* One past the end */ int offset_max; /* The maximum usable for return data */ @@ -68,7 +68,6 @@ compile time. */ #define BRASTACK_SIZE 200 - /* Maximum number of ints of offset to save on the stack for recursive calls. If the offset vector is bigger, malloc is used. This should be a multiple of 3, because the offset vector is always a multiple of 3 long. */ @@ -84,6 +83,12 @@ test output would be different, which just complicates things.) */ #define MAXLIT 250 +/* The maximum remaining length of subject we are prepared to search for a +req_byte match. */ + +#define REQ_BYTE_MAX 1000 + + /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that the definition is next to the definition of the opcodes in internal.h. */ @@ -1138,6 +1143,10 @@ Returns: pointer to the opcode for the bracket, or NULL if not found static const uschar * find_bracket(const uschar *code, BOOL utf8, int number) { +#ifndef SUPPORT_UTF8 +utf8 = utf8; /* Stop pedantic compilers complaining */ +#endif + for (;;) { register int c = *code; @@ -1453,7 +1462,7 @@ int length; int greedy_default, greedy_non_default; int firstbyte, reqbyte; int zeroreqbyte, zerofirstbyte; -int req_caseopt; +int req_caseopt, reqvary, tempreqvary; int condcount = 0; int options = *optionsptr; register int c; @@ -1699,7 +1708,7 @@ for (;; ptr++) posix_class *= 3; for (i = 0; i < 3; i++) { - BOOL isblank = strncmp(ptr, "blank", 5) == 0; + BOOL isblank = strncmp((char *)ptr, "blank", 5) == 0; int taboffset = posix_class_maps[posix_class + i]; if (taboffset < 0) break; if (local_negate) @@ -1949,7 +1958,7 @@ for (;; ptr++) else { zerofirstbyte = firstbyte; - reqbyte = class_lastchar | req_caseopt; + reqbyte = class_lastchar | req_caseopt | cd->req_varyopt; } *code++ = OP_CHARS; *code++ = 1; @@ -2053,10 +2062,14 @@ for (;; ptr++) if (repeat_min == 0) { - firstbyte = zerofirstbyte; /* Adjust for zero repeat */ - reqbyte = zeroreqbyte; /* Ditto */ + firstbyte = zerofirstbyte; /* Adjust for zero repeat */ + reqbyte = zeroreqbyte; /* Ditto */ } + /* Remember whether this is a variable length repeat */ + + reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; + op_type = 0; /* Default single-char op codes */ possessive_quantifier = FALSE; /* Default not possessive quantifier */ @@ -2142,7 +2155,7 @@ for (;; ptr++) if (code == previous + 2) /* There was only one character */ { code = previous; /* Abolish the previous item */ - if (repeat_min > 1) reqbyte = c | req_caseopt; + if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt; } else { @@ -2501,10 +2514,13 @@ for (;; ptr++) PUT(tempcode, 1, len); } - /* In all case we no longer have a previous item. */ + /* In all case we no longer have a previous item. We also set the + "follows varying string" flag for subsequently encountered reqbytes if + it isn't already set and we have just passed a varying length item. */ END_REPEAT: previous = NULL; + cd->req_varyopt |= reqvary; break; @@ -2553,7 +2569,8 @@ for (;; ptr++) else if ((cd->ctypes[ptr[1]] & ctype_digit) != 0) { - int condref = *(++ptr) - '0'; + int condref; /* Don't amalgamate; some compilers */ + condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */ while (*(++ptr) != ')') condref = condref*10 + *ptr - '0'; if (condref == 0) { @@ -2619,19 +2636,24 @@ for (;; ptr++) if (*(++ptr) == '<') /* Definition */ { int i, namelen; - const uschar *name = ++ptr; uschar *slot = cd->name_table; + const uschar *name; /* Don't amalgamate; some compilers */ + name = ++ptr; /* grumble at autoincrement in declaration */ while (*ptr++ != '>'); namelen = ptr - name - 1; for (i = 0; i < cd->names_found; i++) { - int c = memcmp(name, slot+2, namelen + 1); + int c = memcmp(name, slot+2, namelen); if (c == 0) { - *errorptr = ERR43; - goto FAILED; + if (slot[2+namelen] == 0) + { + *errorptr = ERR43; + goto FAILED; + } + c = -1; /* Current name is substring */ } if (c < 0) { @@ -2661,7 +2683,7 @@ for (;; ptr++) for (i = 0; i < cd->names_found; i++) { - if (strncmp(name, slot+2, namelen) == 0) break; + if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; slot += cd->name_entry_size; } if (i >= cd->names_found) @@ -2839,6 +2861,7 @@ for (;; ptr++) previous = (bravalue >= OP_ONCE)? code : NULL; *code = bravalue; tempcode = code; + tempreqvary = cd->req_varyopt; /* Save value before bracket */ if (!compile_regex( newoptions, /* The complete new option state */ @@ -2917,12 +2940,14 @@ for (;; ptr++) } /* If firstbyte was previously set, convert the subpattern's firstbyte - into reqbyte if there wasn't one. */ + into reqbyte if there wasn't one, using the vary flag that was in + existence beforehand. */ - else if (subfirstbyte >= 0 && subreqbyte < 0) subreqbyte = subfirstbyte; + else if (subfirstbyte >= 0 && subreqbyte < 0) + subreqbyte = subfirstbyte | tempreqvary; - /* If the subpattern set a required char (or set a first char that isn't - really the first char - see above), set it. */ + /* If the subpattern set a required byte (or set a first byte that isn't + really the first byte - see above), set it. */ if (subreqbyte >= 0) reqbyte = subreqbyte; } @@ -3140,7 +3165,8 @@ for (;; ptr++) if (firstbyte == REQ_UNSET) { zerofirstbyte = firstbyte = previous[2] | req_caseopt; - zeroreqbyte = (t - 1 == previous + 2)? reqbyte : t[-1] | req_caseopt; + zeroreqbyte = (t - 1 == previous + 2)? + reqbyte : t[-1] | req_caseopt | cd->req_varyopt; } /* If there was a previous first byte, leave it alone, and don't change @@ -3150,14 +3176,14 @@ for (;; ptr++) else { zerofirstbyte = firstbyte; - zeroreqbyte = t[-1] | req_caseopt; + zeroreqbyte = t[-1] | req_caseopt | cd->req_varyopt; } } /* In all cases (we know length > 1), the new required byte is the last byte of the string. */ - reqbyte = code[-1] | req_caseopt; + reqbyte = code[-1] | req_caseopt | cd->req_varyopt; } else /* End of UTF-8 coding */ @@ -3180,8 +3206,9 @@ for (;; ptr++) else { zerofirstbyte = firstbyte = previous[2] | req_caseopt; - zeroreqbyte = (length > 2)? (code[-2] | req_caseopt) : reqbyte; - reqbyte = code[-1] | req_caseopt; + zeroreqbyte = (length > 2)? + (code[-2] | req_caseopt | cd->req_varyopt) : reqbyte; + reqbyte = code[-1] | req_caseopt | cd->req_varyopt; } } @@ -3190,8 +3217,9 @@ for (;; ptr++) else { zerofirstbyte = firstbyte; - zeroreqbyte = (length == 1)? reqbyte : code[-2] | req_caseopt; - reqbyte = code[-1] | req_caseopt; + zeroreqbyte = (length == 1)? reqbyte : + code[-2] | req_caseopt | cd->req_varyopt; + reqbyte = code[-1] | req_caseopt | cd->req_varyopt; } } @@ -3308,7 +3336,9 @@ for (;;) } /* If this is not the first branch, the first char and reqbyte have to - match the values from all the previous branches. */ + match the values from all the previous branches, except that if the previous + value for reqbyte didn't have REQ_VARY set, it can still match, and we set + REQ_VARY for the regex. */ else { @@ -3330,7 +3360,9 @@ for (;;) /* Now ensure that the reqbytes match */ - if (reqbyte != branchreqbyte) reqbyte = REQ_NONE; + if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) + reqbyte = REQ_NONE; + else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */ } /* If lookbehind, check that this branch matches a fixed-length string, @@ -4168,7 +4200,8 @@ while ((c = *(++ptr)) != 0) ptr += 3; if (*ptr == '<') { - const uschar *p = ++ptr; + const uschar *p; /* Don't amalgamate; some compilers */ + p = ++ptr; /* grumble at autoincrement in declaration */ while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++; if (*ptr != '>') { @@ -4599,6 +4632,7 @@ compile_block.name_entry_size = max_name_size + 3; compile_block.name_table = (uschar *)re + sizeof(real_pcre); codestart = compile_block.name_table + re->name_entry_size * re->name_count; compile_block.start_code = codestart; +compile_block.req_varyopt = 0; /* Set up a starting, non-extracting bracket, then compile the expression. On error, *errorptr will be set non-NULL, so we don't need to look at the result @@ -4672,13 +4706,12 @@ if ((options & PCRE_ANCHORED) == 0) } } -/* Save the last required character if any. Remove caseless flag for -non-caseable chars. */ +/* For an anchored pattern, we use the "required byte" only if it follows a +variable length item in the regex. Remove the caseless flag for non-caseable +chars. */ -if ((re->options & PCRE_ANCHORED) != 0 && reqbyte < 0 && firstbyte >= 0) - reqbyte = firstbyte; - -if (reqbyte >= 0) +if (reqbyte >= 0 && + ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0)) { int ch = reqbyte & 255; re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && @@ -5263,7 +5296,7 @@ for (;;) (pcre_free)(new_recursive.offset_save); return MATCH_NOMATCH; } - break; + /* Control never reaches here */ /* "Once" brackets are like assertion brackets except that after a match, the point in the subject string is not moved back. Thus there can never be @@ -7370,9 +7403,14 @@ do optimization can save a huge amount of backtracking in patterns with nested unlimited repeats that aren't going to match. Writing separate code for cased/caseless versions makes it go faster, as does using an autoincrement - and backing off on a match. */ + and backing off on a match. + + HOWEVER: when the subject string is very, very long, searching to its end can + take a long time, and give bad performance on quite ordinary patterns. This + showed up when somebody was matching /^C/ on a 32-megabyte string... so we + don't do this when the string is sufficiently long. */ - if (req_byte >= 0) + if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) { register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0); @@ -407,7 +407,8 @@ while (argc > 1 && argv[op][0] == '-') else if (strcmp(argv[op], "-i") == 0) showinfo = 1; else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1; else if (strcmp(argv[op], "-o") == 0 && argc > 2 && - ((size_offsets = get_value(argv[op+1], &endptr)), *endptr == 0)) + ((size_offsets = get_value((unsigned char *)argv[op+1], &endptr)), + *endptr == 0)) { op++; argc--; @@ -1017,7 +1018,7 @@ while (!done) uschar *pp = name; while (isalnum(*p)) *pp++ = *p++; *pp = 0; - n = pcre_get_stringnumber(re, name); + n = pcre_get_stringnumber(re, (char *)name); if (n < 0) fprintf(outfile, "no parentheses with name \"%s\"\n", name); else copystrings |= 1 << n; @@ -1070,7 +1071,7 @@ while (!done) uschar *pp = name; while (isalnum(*p)) *pp++ = *p++; *pp = 0; - n = pcre_get_stringnumber(re, name); + n = pcre_get_stringnumber(re, (char *)name); if (n < 0) fprintf(outfile, "no parentheses with name \"%s\"\n", name); else getstrings |= 1 << n; diff --git a/testdata/testinput1 b/testdata/testinput1 index 02211ac..22bf6b1 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -3820,4 +3820,7 @@ /\M/ M +/(a+)*b/ + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + / End of testinput1 / diff --git a/testdata/testinput2 b/testdata/testinput2 index d46546a..2748c6a 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -1225,4 +1225,8 @@ zzaa\CZ zzaa\CA +/(?P<x>eks)(?P<x>eccs)/ + +/(?P<abc>abc(?P<def>def)(?P<abc>xyz))/ + / End of testinput2 / diff --git a/testdata/testoutput1 b/testdata/testoutput1 index a01bb93..b5b01ad 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -1,4 +1,4 @@ -PCRE version 4.0 17-Feb-2003 +PCRE version 4.1 12-Mar-2003 /the quick brown fox/ the quick brown fox @@ -6243,5 +6243,9 @@ No match M 0: M +/(a+)*b/ + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +No match + / End of testinput1 / diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 3f38522..f9bb2ae 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -1,4 +1,4 @@ -PCRE version 4.0 17-Feb-2003 +PCRE version 4.1 12-Mar-2003 /(a)b|/ Capturing subpattern count = 1 @@ -28,7 +28,7 @@ No match Capturing subpattern count = 0 Options: anchored No first char -Need char = 'c' +No need char abc 0: abc \Aabc @@ -68,7 +68,7 @@ No need char Capturing subpattern count = 0 Options: anchored No first char -Need char = 'c' +No need char abc 0: abc *** Failers @@ -302,7 +302,7 @@ Need char = 'x' Capturing subpattern count = 0 Options: anchored No first char -Need char = 'x' +No need char the quick brown fox 0: the quick brown fox *** Failers @@ -504,7 +504,7 @@ No need char Capturing subpattern count = 0 Options: anchored caseless No first char -Need char = '4' +No need char /(^b|(?i)^d)/ Capturing subpattern count = 1 @@ -651,7 +651,7 @@ No match Capturing subpattern count = 0 Options: anchored multiline No first char -Need char = 'c' +No need char /^abc/m Capturing subpattern count = 0 @@ -663,7 +663,7 @@ Need char = 'c' Capturing subpattern count = 5 Options: anchored No first char -Need char = 'a' +No need char aaaaabbbbbcccccdef 0: aaaaabbbbbcccccdef 1: aaaaabbbbbcccccdef @@ -837,7 +837,7 @@ Capturing subpattern count = 1 Max back reference = 1 Options: anchored No first char -Need char = 'a' +No need char aaaaaa 0: aaaaaa 1: aa @@ -1027,7 +1027,7 @@ copy substring 1 failed -6 Capturing subpattern count = 3 Options: anchored No first char -Need char = 'f' +No need char adef\G1\G2\G3\G4\L 0: adef 1: a @@ -1065,7 +1065,7 @@ get substring 4 failed -7 Capturing subpattern count = 0 Options: anchored No first char -Need char = 'f' +No need char abc\00def\L\C0 0: abc\x00def 0C abc (7) @@ -1254,7 +1254,7 @@ Need char = 's' Capturing subpattern count = 0 Options: anchored No first char -Need char = 's' +No need char ississippi 0: iss 0+ issippi @@ -1304,7 +1304,7 @@ Need char = 'i' Capturing subpattern count = 0 Options: anchored No first char -Need char = 's' +No need char Mississippi 0: Mis 0+ sissippi @@ -1313,7 +1313,7 @@ Need char = 's' Capturing subpattern count = 0 Options: anchored No first char -Need char = 10 +No need char ab\nab\ncd 0: ab\x0a 0+ ab\x0acd @@ -2420,7 +2420,7 @@ Need char = 's' Capturing subpattern count = 2 Options: anchored No first char -Need char = 'a' +No need char aba 0: aba 1: a @@ -2430,7 +2430,7 @@ Need char = 'a' Capturing subpattern count = 2 Options: anchored No first char -Need char = 'a' +No need char aabbaa 0: aabbaa 1: aa @@ -2440,7 +2440,7 @@ Need char = 'a' Capturing subpattern count = 2 Options: anchored No first char -Need char = 'a' +No need char aabbaa 0: aabbaa 1: aa @@ -2450,7 +2450,7 @@ Need char = 'a' Capturing subpattern count = 2 Options: anchored No first char -Need char = 'a' +No need char aabbaa 0: aabbaa 1: aa @@ -2460,7 +2460,7 @@ Need char = 'a' Capturing subpattern count = 1 Options: anchored No first char -Need char = 'a' +No need char aabbaa 0: aabbaa 1: bb @@ -2469,7 +2469,7 @@ Need char = 'a' Capturing subpattern count = 3 Options: anchored No first char -Need char = 'a' +No need char aabbaa 0: aabbaa 1: aa @@ -2480,7 +2480,7 @@ Need char = 'a' Capturing subpattern count = 2 Options: anchored No first char -Need char = 'a' +No need char aabbaa 0: aabbaa 1: bb @@ -2490,7 +2490,7 @@ Need char = 'a' Capturing subpattern count = 1 Options: anchored No first char -Need char = 'a' +No need char aabbaa 0: aabbaa 1: bb @@ -2499,7 +2499,7 @@ Need char = 'a' Capturing subpattern count = 1 Options: anchored No first char -Need char = 'a' +No need char aabbbaa 0: aabbbaa 1: bbb @@ -2508,7 +2508,7 @@ Need char = 'a' Capturing subpattern count = 1 Options: anchored No first char -Need char = 'a' +No need char aabbbaa 0: aabbbaa 1: bbb @@ -2517,7 +2517,7 @@ Need char = 'a' Capturing subpattern count = 1 Options: anchored No first char -Need char = 'a' +No need char aabbaa 0: aabbaa 1: b @@ -2526,7 +2526,7 @@ Need char = 'a' Capturing subpattern count = 1 Options: anchored No first char -Need char = 'a' +No need char aabbbaa 0: aabbbaa 1: bb @@ -2535,7 +2535,7 @@ Need char = 'a' Capturing subpattern count = 3 Options: anchored No first char -Need char = 'a' +No need char aabbbaa 0: aabbbaa 1: aa @@ -2546,7 +2546,7 @@ Need char = 'a' Capturing subpattern count = 3 Options: anchored No first char -Need char = 'a' +No need char aabbbbaa 0: aabbbbaa 1: aa @@ -3047,7 +3047,7 @@ Need char = 'b' Capturing subpattern count = 5 Options: anchored No first char -Need char = 'a' +No need char /^x(?U)a+b/D ------------------------------------------------------------------ @@ -3735,7 +3735,7 @@ Need char = 'c' Capturing subpattern count = 0 Options: anchored No first char -Need char = 'c' +No need char /(?C)a|b/S Capturing subpattern count = 0 @@ -3917,7 +3917,7 @@ No match Capturing subpattern count = 2 Options: anchored No first char -Need char = 'z' +No need char xyz 0: xyz 1: xyz @@ -3977,7 +3977,7 @@ Failed: reference to non-existent subpattern at offset 4 Capturing subpattern count = 1 Options: anchored No first char -Need char = 'f' +No need char abcdefabc 0: abcdefabc 1: abc @@ -3986,7 +3986,7 @@ Need char = 'f' Capturing subpattern count = 1 Options: anchored No first char -Need char = '=' +No need char a=a 0: a=a 1: a @@ -4001,7 +4001,7 @@ Need char = '=' Capturing subpattern count = 2 Options: anchored No first char -Need char = '=' +No need char a=a 0: a=a 1: a @@ -4408,6 +4408,12 @@ Need char = 'a' 2: aa 2C aa (2) +/(?P<x>eks)(?P<x>eccs)/ +Failed: two named groups have the same name at offset 16 + +/(?P<abc>abc(?P<def>def)(?P<abc>xyz))/ +Failed: two named groups have the same name at offset 31 + / End of testinput2 / Capturing subpattern count = 0 No options diff --git a/testdata/testoutput3 b/testdata/testoutput3 index 6fdb681..4ec3489 100644 --- a/testdata/testoutput3 +++ b/testdata/testoutput3 @@ -1,4 +1,4 @@ -PCRE version 4.0 17-Feb-2003 +PCRE version 4.1 12-Mar-2003 /^[\w]+/ *** Failers diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 82c5e2a..18b4036 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -1,4 +1,4 @@ -PCRE version 4.0 17-Feb-2003 +PCRE version 4.1 12-Mar-2003 /-- Do not use the \x{} construct except with patterns that have the --/ /-- /8 option set, because PCRE doesn't recognize them as UTF-8 unless --/ diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 3491576..c8daba0 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -1,4 +1,4 @@ -PCRE version 4.0 17-Feb-2003 +PCRE version 4.1 12-Mar-2003 /\x{100}/8DM Memory allocation (code space): 11 |