diff options
author | nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-02-24 21:39:13 +0000 |
---|---|---|
committer | nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-02-24 21:39:13 +0000 |
commit | b72ae7c414f315e8915948fbea7b391a490fa946 (patch) | |
tree | f900e0a0a759e700bf22637d9db64878bf433be1 | |
parent | 7301eeae8c520c3a24e15bbcbb4b5b5343646e2c (diff) | |
download | pcre-b72ae7c414f315e8915948fbea7b391a490fa946.tar.gz |
Load pcre-2.08 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@39 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 18 | ||||
-rw-r--r-- | Makefile | 36 | ||||
-rw-r--r-- | README | 31 | ||||
-rw-r--r-- | dll.mk | 60 | ||||
-rw-r--r-- | internal.h | 3 | ||||
-rw-r--r-- | pcre.3 | 5 | ||||
-rw-r--r-- | pcre.3.html | 5 | ||||
-rw-r--r-- | pcre.3.txt | 4 | ||||
-rw-r--r-- | pcre.c | 13 | ||||
-rw-r--r-- | pcre.def | 19 | ||||
-rw-r--r-- | pcre.h | 25 | ||||
-rw-r--r-- | pcretest.c | 47 | ||||
-rw-r--r-- | testinput2 | 3 | ||||
-rw-r--r-- | testinput3 | 27 | ||||
-rw-r--r-- | testoutput1 | 2 | ||||
-rw-r--r-- | testoutput2 | 23 | ||||
-rw-r--r-- | testoutput3 | 59 | ||||
-rw-r--r-- | testoutput4 | 2 |
18 files changed, 338 insertions, 44 deletions
@@ -2,6 +2,24 @@ ChangeLog for PCRE ------------------ +Version 2.08 31-Aug-99 +---------------------- + +1. When startoffset was not zero and the pattern began with ".*", PCRE was not +trying to match at the startoffset position, but instead was moving forward to +the next newline as if a previous match had failed. + +2. pcretest was not making use of PCRE_NOTEMPTY when repeating for /g and /G, +and could get into a loop if a null string was matched other than at the start +of the subject. + +3. Added definitions of PCRE_MAJOR and PCRE_MINOR to pcre.h so the version can +be distinguished at compile time, and for completeness also added PCRE_DATE. + +5. Added Paul Sokolovsky's minor changes to make it easy to compile a Win32 DLL +in GnuWin32 environments. + + Version 2.07 29-Jul-99 ---------------------- @@ -1,8 +1,18 @@ # Make file for PCRE (Perl-Compatible Regular Expression) library. -# If you are using a Unix system, see below. +# If you are using a Unix system, see below. I am a Unix person, so that is +# the stuff I really know about. PCRE is developed on a Unix box. -############################################################################## +# To build mingw32 DLL uncomment the next two lines. This addition for mingw32 +# was contributed by Paul Sokolovsky <Paul.Sokolovsky@technologist.com>. I +# (Philip Hazel) don't know anything about it! There are some additional +# targets at the bottom of this Makefile. +# +# include dll.mk +# DLL_LDFLAGS=-s + + +######## NON-UNIX ############ NON-UNIX ############## NON-UNIX ############## # If you want to compile PCRE for a non-Unix system, note that it consists # entirely of code written in Standard C, and so should compile successfully # using normal compiling commands to do the following: @@ -25,7 +35,7 @@ # testinput2. 
-############################################################################## +######## UNIX ################## UNIX ################## UNIX ################ # On a Unix system: # # Edit CC, CFLAGS, and RANLIB for your system. @@ -60,6 +70,7 @@ MANDIR = /usr/local/man ############################################################################## + OBJ = maketables.o get.o study.o pcre.o all: libpcre.a libpcreposix.a pcretest pgrep @@ -125,4 +136,23 @@ clean:; -rm -f *.o *.a pcretest pgrep runtest: all ./RunTest +######## MINGW32 ############### MINGW32 ############### MINGW32 ############# + +# This addition for mingw32 was contributed by Paul Sokolovsky +# <Paul.Sokolovsky@technologist.com>. I (PH) don't know anything about it! + +dll: _dll libpcre.dll.a pgrep_d pcretest_d + +_dll: + $(MAKE) CFLAGS=-DSTATIC pcre.dll + +pcre.dll: $(OBJ) pcreposix.o pcre.def +libpcre.dll.a: pcre.def + +pgrep_d: libpcre.dll.a pgrep.o + $(CC) $(CFLAGS) -L. -o pgrep pgrep.o -lpcre.dll + +pcretest_d: libpcre.dll.a pcretest.o + $(PURIFY) $(CC) $(CFLAGS) -L. -o pcretest pcretest.o -lpcre.dll + # End @@ -70,6 +70,8 @@ The distribution should contain the following files: testoutput2 test results corresponding to testinput2 testoutput3 test results corresponding to testinput3 testoutput4 test results corresponding to testinput4 + dll.mk for Win32 DLL + pcre.def ditto To build PCRE on a Unix system, first edit Makefile for your system. It is a fairly simple make file, and there are some comments near the top, after the @@ -86,6 +88,16 @@ and on Macintoshes, but I don't know the details as I don't use those systems. It should be straightforward to build PCRE on any system that has a Standard C compiler. +Some help in building a Win32 DLL of PCRE in GnuWin32 environments was +contributed by Paul.Sokolovsky@technologist.com. These environments are +Mingw32 (http://www.xraylith.wisc.edu/~khan/software/gnu-win32/) and +CygWin (http://sourceware.cygnus.com/cygwin/). 
Paul comments: + + For CygWin, set CFLAGS=-mno-cygwin, and do 'make dll'. You'll get + pcre.dll (containing pcreposix also), libpcre.dll.a, and dynamically + linked pgrep and pcretest. If you have /bin/sh, run RunTest (three + main tests go ok, locale not supported). + To test PCRE, run the RunTest script in the pcre directory. This can also be run by "make runtest". It runs the pcretest test program (which is documented below) on each of the testinput files in turn, and compares the output with the @@ -220,13 +232,18 @@ others which set PCRE options that do not correspond to anything in Perl: /A, /E, and /X set PCRE_ANCHORED, PCRE_DOLLAR_ENDONLY, and PCRE_EXTRA respectively. Searching for all possible matches within each subject string can be requested -by the /g or /G modifier. The /g modifier behaves similarly to the way it does -in Perl. After finding a match, PCRE is called again to search the remainder of -the subject string. The difference between /g and /G is that the former uses -the start_offset argument to pcre_exec() to start searching at a new point -within the entire string, whereas the latter passes over a shortened substring. -This makes a difference to the matching process if the pattern begins with a -lookbehind assertion (including \b or \B). +by the /g or /G modifier. After finding a match, PCRE is called again to search +the remainder of the subject string. The difference between /g and /G is that +the former uses the startoffset argument to pcre_exec() to start searching at +a new point within the entire string (which is in effect what Perl does), +whereas the latter passes over a shortened substring. This makes a difference +to the matching process if the pattern begins with a lookbehind assertion +(including \b or \B). + +If any call to pcre_exec() in a /g or /G sequence matches an empty string, the +next call is done with the PCRE_NOTEMPTY flag set so that it cannot match an +empty string again.
This imitates the way Perl handles such cases when using +the /g modifier or the split() function. There are a number of other modifiers for controlling the way pcretest operates. @@ -0,0 +1,60 @@ +# dll.mk - auxiliary Makefile to easily build DLLs for mingw32 target +# ver. 0.6 of 1999-03-25 +# +# Homepage of this makefile - http://www.is.lg.ua/~paul/devel/ +# Homepage of original mingw32 project - +# http://www.fu.is.saga-u.ac.jp/~colin/gcc.html +# +# How to use: +# This makefile can: +# 1. Create an automatic .def file from list of objects +# 2. Create .dll from objects and .def file, either automatic, or your +# hand-written (maybe) file, which must have same basename as dll +# WARNING! There MUST be an object whose name matches the dll's name. Make sux. +# 3. Create import library from .def (as for .dll, only its name required, +# not dll itself) +# By convention implibs for dll have .dll.a suffix, e.g. libstuff.dll.a +# Why not just libstuff.a? 'Cos that's name for static lib, ok? +# Process divided into 3 phases because: +# 1. Pre-existent .def possible +# 2. Generating the implib is quite time-consuming +# +# Variables: +# DLL_LDLIBS - libs for linking dll +# DLL_LDFLAGS - flags for linking dll +# +# By using $(DLL_SUFFIX) instead of 'dll', e.g. stuff.$(DLL_SUFFIX) +# you may help porting makefiles to other platforms +# +# Put this file in your make's include path (e.g. main include dir, for +# more information see include section in make doc). Put the line +# "include dll.mk" at the beginning of your own Makefile.
Specify dependences, e.g.: +# +# Do all stuff in one step +# libstuff.dll.a: $(OBJECTS) stuff.def +# stuff.def: $(OBJECTS) +# +# Steps separated, pre-provided .def, link with user32 +# +# DLL_LDLIBS=-luser32 +# stuff.dll: $(OBJECTS) +# libstuff.dll.a: $(OBJECTS) + + +DLLWRAP=dllwrap +DLLTOOL=dlltool + +DLL_SUFFIX=dll + +.SUFFIXES: .o .$(DLL_SUFFIX) + +_%.def: %.o + $(DLLTOOL) --export-all --output-def $@ $^ + +%.$(DLL_SUFFIX): %.o + $(DLLWRAP) --dllname $(notdir $@) --driver-name $(CC) --def $*.def -o $@ $(filter %.o,$^) $(DLL_LDFLAGS) $(DLL_LDLIBS) + +lib%.$(DLL_SUFFIX).a:%.def + $(DLLTOOL) --dllname $(notdir $*.dll) --def $< --output-lib $@ + +# End @@ -3,9 +3,6 @@ *************************************************/ -#define PCRE_VERSION "2.07 29-Jul-1999" - - /* This is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. See the file Tech.Notes for some information on the internals. @@ -330,8 +330,9 @@ is applied to a string not beginning with "a" or "b", it matches the empty string at the start of the subject. With PCRE_NOTEMPTY set, this match is not valid, so PCRE searches further into the string for occurrences of "a" or "b". Perl has no direct equivalent of this option, but it makes a special case of -a pattern match of the empty string within its \fBsplit()\fR function. Using -PCRE_NOTEMPTY it is possible to emulate this behaviour. +a pattern match of the empty string within its \fBsplit()\fR function, or when +using the /g modifier. Using PCRE_NOTEMPTY it is possible to emulate this +behaviour. The subject string is passed as a pointer in \fIsubject\fR, a length in \fIlength\fR, and a starting offset in \fIstartoffset\fR. 
Unlike the pattern diff --git a/pcre.3.html b/pcre.3.html index 464714f..3417579 100644 --- a/pcre.3.html +++ b/pcre.3.html @@ -437,8 +437,9 @@ is applied to a string not beginning with "a" or "b", it matches the empty string at the start of the subject. With PCRE_NOTEMPTY set, this match is not valid, so PCRE searches further into the string for occurrences of "a" or "b". Perl has no direct equivalent of this option, but it makes a special case of -a pattern match of the empty string within its <B>split()</B> function. Using -PCRE_NOTEMPTY it is possible to emulate this behaviour. +a pattern match of the empty string within its <B>split()</B> function, or when +using the /g modifier. Using PCRE_NOTEMPTY it is possible to emulate this +behaviour. </P> <P> The subject string is passed as a pointer in <I>subject</I>, a length in @@ -365,8 +365,8 @@ MATCHING A PATTERN further into the string for occurrences of "a" or "b". Perl has no direct equivalent of this option, but it makes a special case of a pattern match of the empty string within - its split() function. Using PCRE_NOTEMPTY it is possible to - emulate this behaviour. + its split() function, or when using the /g modifier. Using + PCRE_NOTEMPTY it is possible to emulate this behaviour. The subject string is passed as a pointer in subject, a length in length, and a starting offset in startoffset. @@ -148,10 +148,13 @@ tables. */ * Return version string * *************************************************/ +#define STRING(a) # a +#define XSTRING(s) STRING(s) + const char * pcre_version(void) { -return PCRE_VERSION; +return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE); } @@ -3088,7 +3091,7 @@ for (;;) case OP_OPT: ims = ecode[1]; ecode += 2; - DPRINTF(("ims set to %02x\n", ims)); + DPRINTF(("ims set to %02lx\n", ims)); break; /* Assertion brackets. 
Check the alternative branches in turn - the @@ -3195,7 +3198,7 @@ for (;;) if (ecode[3] == OP_OPT) { ims = (ims & ~PCRE_IMS) | ecode[4]; - DPRINTF(("ims set to %02x at group repeat\n", ims)); + DPRINTF(("ims set to %02lx at group repeat\n", ims)); } if (*ecode == OP_KETRMIN) @@ -3289,7 +3292,7 @@ for (;;) the group. */ ims = original_ims; - DPRINTF(("ims reset to %02x\n", ims)); + DPRINTF(("ims reset to %02lx\n", ims)); /* For a non-repeating ket, just continue at this level. This also happens for a repeating ket if no characters were matched in the group. @@ -4337,7 +4340,7 @@ do else if (startline) { - if (start_match > match_block.start_subject) + if (start_match > match_block.start_subject + start_offset) { while (start_match < end_subject && start_match[-1] != '\n') start_match++; diff --git a/pcre.def b/pcre.def new file mode 100644 index 0000000..0e8cf3f --- /dev/null +++ b/pcre.def @@ -0,0 +1,19 @@ +EXPORTS + +pcre_malloc DATA +pcre_free DATA + +pcre_compile +pcre_copy_substring +pcre_exec +pcre_get_substring +pcre_get_substring_list +pcre_info +pcre_maketables +pcre_study +pcre_version + +regcomp +regexec +regerror +regfree @@ -7,6 +7,22 @@ #ifndef _PCRE_H #define _PCRE_H +#define PCRE_MAJOR 2 +#define PCRE_MINOR 08 +#define PCRE_DATE 31-Aug-1999 + +/* Win32 uses DLL by default */ + +#ifdef _WIN32 +# ifdef STATIC +# define PCRE_DL_IMPORT +# else +# define PCRE_DL_IMPORT __declspec(dllimport) +# endif +#else +# define PCRE_DL_IMPORT +#endif + /* Have to include stdlib.h in order to ensure that size_t is defined; it is needed here for malloc. */ @@ -49,10 +65,13 @@ typedef void pcre; typedef void pcre_extra; /* Store get and free functions. These can be set to alternative malloc/free -functions if required. */ +functions if required. Some magic is required for Win32 DLL; it is null on +other OS. 
*/ + +PCRE_DL_IMPORT extern void *(*pcre_malloc)(size_t); +PCRE_DL_IMPORT extern void (*pcre_free)(void *); -extern void *(*pcre_malloc)(size_t); -extern void (*pcre_free)(void *); +#undef PCRE_DL_IMPORT /* Functions */ @@ -704,6 +704,7 @@ while (!done) int copystrings = 0; int getstrings = 0; int getlist = 0; + int gmatched = 0; int start_offset = 0; int offsets[45]; int size_offsets = sizeof(offsets)/sizeof(int); @@ -849,7 +850,7 @@ while (!done) else #endif /* !defined NOPOSIX */ - for (;;) + for (;; gmatched++) /* Loop for /g or /G */ { if (timeit) { @@ -858,7 +859,7 @@ while (!done) clock_t start_time = clock(); for (i = 0; i < LOOPREPEAT; i++) count = pcre_exec(re, extra, (char *)bptr, len, - (do_g? start_offset : 0), options, offsets, size_offsets); + start_offset, options, offsets, size_offsets); time_taken = clock() - start_time; fprintf(outfile, "Execute time %.3f milliseconds\n", ((double)time_taken * 1000.0)/ @@ -866,7 +867,7 @@ while (!done) } count = pcre_exec(re, extra, (char *)bptr, len, - (do_g? start_offset : 0), options, offsets, size_offsets); + start_offset, options, offsets, size_offsets); if (count == 0) { @@ -874,6 +875,8 @@ while (!done) count = size_offsets/3; } + /* Matched */ + if (count >= 0) { int i; @@ -888,7 +891,6 @@ while (!done) fprintf(outfile, "\n"); if (i == 0) { - start_offset = offsets[1]; if (do_showrest) { fprintf(outfile, " 0+ "); @@ -946,26 +948,45 @@ while (!done) free((void *)stringlist); } } - } + + /* Failed to match */ + else { - if (start_offset == 0) + if (gmatched == 0) { if (count == -1) fprintf(outfile, "No match\n"); else fprintf(outfile, "Error %d\n", count); } - start_offset = -1; + break; /* Out of the /g loop */ } - if ((!do_g && !do_G) || start_offset <= 0) break; - if (do_G) + /* If not /g or /G we are done */ + + if (!do_g && !do_G) break; + + /* If we have matched an empty string, set PCRE_NOTEMPTY for the next + match. This mimics what Perl's /g option does. 
*/ + + if (offsets[1] == offsets[0]) + options |= PCRE_NOTEMPTY; + else + options &= ~PCRE_NOTEMPTY; + + /* For /g, update the start offset, leaving the rest alone */ + + if (do_g) start_offset = offsets[1]; + + /* For /G, update the pointer and length */ + + else { - bptr += start_offset; - len -= start_offset; + bptr += offsets[1]; + len -= offsets[1]; } - } - } + } /* End of loop for /g and /G */ + } /* End of loop for data lines */ CONTINUE: @@ -583,4 +583,7 @@ *** Failers \Nabc +/.*?/g+ + abc + / End of test input / @@ -1662,4 +1662,31 @@ *** Failers 123999foo +/<a[\s]+href[\s]*=[\s]* # find <a href= + ([\"\'])? # find single or double quote + (?(1) (.*?)\1 | ([^\s]+)) # if quote found, match up to next matching + # quote, otherwise match up to next space +/isx + <a href=abcd xyz + <a href=\"abcd xyz pqr\" cats + <a href=\'abcd xyz pqr\' cats + +/<a\s+href\s*=\s* # find <a href= + (["'])? # find single or double quote + (?(1) (.*?)\1 | (\S+)) # if quote found, match up to next matching + # quote, otherwise match up to next space +/isx + <a href=abcd xyz + <a href=\"abcd xyz pqr\" cats + <a href = \'abcd xyz pqr\' cats + +/<a\s+href(?>\s*)=(?>\s*) # find <a href= + (["'])? 
# find single or double quote + (?(1) (.*?)\1 | (\S+)) # if quote found, match up to next matching + # quote, otherwise match up to next space +/isx + <a href=abcd xyz + <a href=\"abcd xyz pqr\" cats + <a href = \'abcd xyz pqr\' cats + / End of test input / diff --git a/testoutput1 b/testoutput1 index a4fc4c0..3c1eb60 100644 --- a/testoutput1 +++ b/testoutput1 @@ -1,4 +1,4 @@ -PCRE version 2.07 29-Jul-1999 +PCRE version 2.08 31-Aug-1999 /the quick brown fox/ the quick brown fox diff --git a/testoutput2 b/testoutput2 index e1bdf27..5950a91 100644 --- a/testoutput2 +++ b/testoutput2 @@ -1,4 +1,4 @@ -PCRE version 2.07 29-Jul-1999 +PCRE version 2.08 31-Aug-1999 /(a)b|/ Identifying subpattern count = 1 @@ -1595,6 +1595,27 @@ No req char \Nabc No match +/.*?/g+ +Identifying subpattern count = 0 +No options +First char at start or follows \n +No req char + abc + 0: + 0+ abc + 0: a + 0+ bc + 0: + 0+ bc + 0: b + 0+ c + 0: + 0+ c + 0: c + 0+ + 0: + 0+ + / End of test input / Identifying subpattern count = 0 No options diff --git a/testoutput3 b/testoutput3 index d891d7e..d997659 100644 --- a/testoutput3 +++ b/testoutput3 @@ -1,4 +1,4 @@ -PCRE version 2.07 29-Jul-1999 +PCRE version 2.08 31-Aug-1999 /(?<!bar)foo/ foo @@ -2868,5 +2868,62 @@ No match 123999foo No match +/<a[\s]+href[\s]*=[\s]* # find <a href= + ([\"\'])? # find single or double quote + (?(1) (.*?)\1 | ([^\s]+)) # if quote found, match up to next matching + # quote, otherwise match up to next space +/isx + <a href=abcd xyz + 0: <a href=abcd + 1: <unset> + 2: <unset> + 3: abcd + <a href=\"abcd xyz pqr\" cats + 0: <a href="abcd xyz pqr" + 1: " + 2: abcd xyz pqr + <a href=\'abcd xyz pqr\' cats + 0: <a href='abcd xyz pqr' + 1: ' + 2: abcd xyz pqr + +/<a\s+href\s*=\s* # find <a href= + (["'])? 
# find single or double quote + (?(1) (.*?)\1 | (\S+)) # if quote found, match up to next matching + # quote, otherwise match up to next space +/isx + <a href=abcd xyz + 0: <a href=abcd + 1: <unset> + 2: <unset> + 3: abcd + <a href=\"abcd xyz pqr\" cats + 0: <a href="abcd xyz pqr" + 1: " + 2: abcd xyz pqr + <a href = \'abcd xyz pqr\' cats + 0: <a href = 'abcd xyz pqr' + 1: ' + 2: abcd xyz pqr + +/<a\s+href(?>\s*)=(?>\s*) # find <a href= + (["'])? # find single or double quote + (?(1) (.*?)\1 | (\S+)) # if quote found, match up to next matching + # quote, otherwise match up to next space +/isx + <a href=abcd xyz + 0: <a href=abcd + 1: <unset> + 2: <unset> + 3: abcd + <a href=\"abcd xyz pqr\" cats + 0: <a href="abcd xyz pqr" + 1: " + 2: abcd xyz pqr + <a href = \'abcd xyz pqr\' cats + 0: <a href = 'abcd xyz pqr' + 1: ' + 2: abcd xyz pqr + / End of test input / diff --git a/testoutput4 b/testoutput4 index bb41319..c8af6cf 100644 --- a/testoutput4 +++ b/testoutput4 @@ -1,4 +1,4 @@ -PCRE version 2.07 29-Jul-1999 +PCRE version 2.08 31-Aug-1999 /^[\w]+/ *** Failers |