diff options
author | nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-02-24 21:40:59 +0000 |
---|---|---|
committer | nigel <nigel@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2007-02-24 21:40:59 +0000 |
commit | f82b62380bd773b22a4a5d28d1a403ffd54c5392 (patch) | |
tree | d8fd1e5c25d0e781ca46b6b570beedaa15a81019 | |
parent | 477806cfbeb607865593eb63f0216d854a2bbf6f (diff) | |
download | pcre-f82b62380bd773b22a4a5d28d1a403ffd54c5392.tar.gz |
Load pcre-6.2 into code/trunk.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@81 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 29 | ||||
-rw-r--r-- | Makefile.in | 12 | ||||
-rw-r--r-- | RunGrepTest.in | 5 | ||||
-rwxr-xr-x | RunTest.in | 5 | ||||
-rwxr-xr-x | configure | 4 | ||||
-rw-r--r-- | configure.in | 4 | ||||
-rw-r--r-- | dftables.c | 2 | ||||
-rw-r--r-- | doc/html/pcrecpp.html | 114 | ||||
-rw-r--r-- | doc/pcre.txt | 165 | ||||
-rw-r--r-- | doc/pcrecpp.3 | 94 | ||||
-rw-r--r-- | pcre_compile.c | 59 | ||||
-rw-r--r-- | pcrecpp.cc | 6 | ||||
-rw-r--r-- | pcrecpp.h.in | 211 | ||||
-rw-r--r-- | pcrecpp_unittest.cc | 228 | ||||
-rw-r--r-- | testdata/grepoutput | 2 | ||||
-rw-r--r-- | testdata/testinput2 | 21 | ||||
-rw-r--r-- | testdata/testoutput1 | 2 | ||||
-rw-r--r-- | testdata/testoutput2 | 225 | ||||
-rw-r--r-- | testdata/testoutput3 | 2 | ||||
-rw-r--r-- | testdata/testoutput4 | 2 | ||||
-rw-r--r-- | testdata/testoutput5 | 2 | ||||
-rw-r--r-- | testdata/testoutput6 | 2 | ||||
-rw-r--r-- | testdata/testoutput7 | 2 | ||||
-rw-r--r-- | testdata/testoutput8 | 2 | ||||
-rw-r--r-- | testdata/testoutput9 | 2 |
25 files changed, 1090 insertions, 112 deletions
@@ -1,6 +1,35 @@ ChangeLog for PCRE ------------------ +Version 6.2 01-Aug-05 +--------------------- + + 1. There was no test for integer overflow of quantifier values. A construction + such as {1111111111111111} would give undefined results. What is worse, if + a minimum quantifier for a parenthesized subpattern overflowed and became + negative, the calculation of the memory size went wrong. This could have + led to memory overwriting. + + 2. Building PCRE using VPATH was broken. Hopefully it is now fixed. + + 3. Added "b" to the 2nd argument of fopen() in dftables.c, for non-Unix-like + operating environments where this matters. + + 4. Applied Giuseppe Maxia's patch to add additional features for controlling + PCRE options from within the C++ wrapper. + + 5. Named capturing subpatterns were not being correctly counted when a pattern + was compiled. This caused two problems: (a) If there were more than 100 + such subpatterns, the calculation of the memory needed for the whole + compiled pattern went wrong, leading to an overflow error. (b) Numerical + back references of the form \12, where the number was greater than 9, were + not recognized as back references, even though there were sufficient + previous subpatterns. + + 6. Two minor patches to pcrecpp.cc in order to allow it to compile on older + versions of gcc, e.g. 2.95.4. + + Version 6.1 21-Jun-05 --------------------- diff --git a/Makefile.in b/Makefile.in index 74121ea..39a7abe 100644 --- a/Makefile.in +++ b/Makefile.in @@ -211,7 +211,7 @@ libpcrecpp.la: libpcre.la $(CPPOBJ) # directory, not the source directory. 
pcre_chartables.@OBJEXT@: pcre_chartables.c - @$(LTCOMPILE) $(top_srcdir)/pcre_chartables.c + @$(LTCOMPILE) pcre_chartables.c pcre_compile.@OBJEXT@: Makefile config.h pcre.h \ $(top_srcdir)/pcre_internal.h $(top_srcdir)/pcre_compile.c @@ -329,23 +329,23 @@ pcre_stringpiece.@OBJEXT@: $(top_srcdir)/pcre_stringpiece.cc pcre_stringpiece.h pcretest.@OBJEXT@: $(top_srcdir)/pcretest.c $(top_srcdir)/pcre_internal.h \ pcre.h config.h Makefile - $(CC) -c $(CFLAGS) -I. $(UTF8) $(UCP) $(LINK_SIZE) $(top_srcdir)/pcretest.c + $(CC) -c $(CFLAGS) -I. -I$(top_srcdir) $(UTF8) $(UCP) $(LINK_SIZE) $(top_srcdir)/pcretest.c pcrecpp_unittest.@OBJEXT@: $(top_srcdir)/pcrecpp_unittest.cc pcrecpp.h \ pcre_stringpiece.h pcre.h config.h Makefile - $(CXX) -c $(CXXFLAGS) -I. $(UTF8) $(UCP) $(LINK_SIZE) $(top_srcdir)/pcrecpp_unittest.cc + $(CXX) -c $(CXXFLAGS) -I. -I$(top_srcdir) $(UTF8) $(UCP) $(LINK_SIZE) $(top_srcdir)/pcrecpp_unittest.cc pcre_stringpiece_unittest.@OBJEXT@: $(top_srcdir)/pcre_stringpiece_unittest.cc \ pcre_stringpiece.h config.h Makefile - $(CXX) -c $(CXXFLAGS) -I. $(UTF8) $(UCP) $(LINK_SIZE) $(top_srcdir)/pcre_stringpiece_unittest.cc + $(CXX) -c $(CXXFLAGS) -I. -I$(top_srcdir) $(UTF8) $(UCP) $(LINK_SIZE) $(top_srcdir)/pcre_stringpiece_unittest.cc pcre_scanner_unittest.@OBJEXT@: $(top_srcdir)/pcre_scanner_unittest.cc \ $(top_srcdir)/pcre_scanner.h pcrecpp.h pcre_stringpiece.h \ pcre.h config.h Makefile - $(CXX) -c $(CXXFLAGS) -I. $(UTF8) $(UCP) $(LINK_SIZE) $(top_srcdir)/pcre_scanner_unittest.cc + $(CXX) -c $(CXXFLAGS) -I. -I$(top_srcdir) $(UTF8) $(UCP) $(LINK_SIZE) $(top_srcdir)/pcre_scanner_unittest.cc pcregrep.@OBJEXT@: $(top_srcdir)/pcregrep.c pcre.h Makefile config.h - $(CC) -c $(CFLAGS) -I. $(UTF8) $(UCP) $(PCREGREP_OSTYPE) $(top_srcdir)/pcregrep.c + $(CC) -c $(CFLAGS) -I. -I$(top_srcdir) $(UTF8) $(UCP) $(PCREGREP_OSTYPE) $(top_srcdir)/pcregrep.c # Some Windows-specific targets for MinGW. Do not use for Cygwin. 
diff --git a/RunGrepTest.in b/RunGrepTest.in index 57ac403..109267e 100644 --- a/RunGrepTest.in +++ b/RunGrepTest.in @@ -10,7 +10,10 @@ echo "Testing pcregrep" # supported by pcregrep. cf=diff -testdata=@top_srcdir@/testdata +if [ ! -d testdata ] ; then + ln -s @top_srcdir@/testdata testdata +fi +testdata=./testdata ./pcregrep -V 2>testtry @@ -6,7 +6,10 @@ # Run PCRE tests cf=diff -testdata=@top_srcdir@/testdata +if [ ! -d testdata ] ; then + ln -s @top_srcdir@/testdata testdata +fi +testdata=./testdata # Select which tests to run; if no selection, run all @@ -1505,8 +1505,8 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu PCRE_MAJOR=6 -PCRE_MINOR=1 -PCRE_DATE=21-Jun-2005 +PCRE_MINOR=2 +PCRE_DATE=01-Aug-2005 PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR} diff --git a/configure.in b/configure.in index 4f8c6ee..47fd8d5 100644 --- a/configure.in +++ b/configure.in @@ -21,8 +21,8 @@ dnl digits for minor numbers less than 10. There are unlikely to be dnl that many releases anyway. PCRE_MAJOR=6 -PCRE_MINOR=1 -PCRE_DATE=21-Jun-2005 +PCRE_MINOR=2 +PCRE_DATE=01-Aug-2005 PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR} dnl Default values for miscellaneous macros @@ -66,7 +66,7 @@ if (argc != 2) return 1; } -f = fopen(argv[1], "w"); +f = fopen(argv[1], "wb"); if (f == NULL) { fprintf(stderr, "dftables: failed to open %s for writing\n", argv[1]); diff --git a/doc/html/pcrecpp.html b/doc/html/pcrecpp.html index 3f597b1..1d5acb7 100644 --- a/doc/html/pcrecpp.html +++ b/doc/html/pcrecpp.html @@ -18,10 +18,11 @@ man page, in case the conversion went wrong. 
<li><a name="TOC3" href="#SEC3">MATCHING INTERFACE</a> <li><a name="TOC4" href="#SEC4">PARTIAL MATCHES</a> <li><a name="TOC5" href="#SEC5">UTF-8 AND THE MATCHING INTERFACE</a> -<li><a name="TOC6" href="#SEC6">SCANNING TEXT INCREMENTALLY</a> -<li><a name="TOC7" href="#SEC7">PARSING HEX/OCTAL/C-RADIX NUMBERS</a> -<li><a name="TOC8" href="#SEC8">REPLACING PARTS OF STRINGS</a> -<li><a name="TOC9" href="#SEC9">AUTHOR</a> +<li><a name="TOC6" href="#SEC6">PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE</a> +<li><a name="TOC7" href="#SEC7">SCANNING TEXT INCREMENTALLY</a> +<li><a name="TOC8" href="#SEC8">PARSING HEX/OCTAL/C-RADIX NUMBERS</a> +<li><a name="TOC9" href="#SEC9">REPLACING PARTS OF STRINGS</a> +<li><a name="TOC10" href="#SEC10">AUTHOR</a> </ul> <br><a name="SEC1" href="#TOC1">SYNOPSIS OF C++ WRAPPER</a><br> <P> @@ -31,9 +32,10 @@ man page, in case the conversion went wrong. </P> <br><a name="SEC2" href="#TOC1">DESCRIPTION</a><br> <P> -The C++ wrapper for PCRE was provided by Google Inc. This brief man page was -constructed from the notes in the <i>pcrecpp.h</i> file, which should be -consulted for further details. +The C++ wrapper for PCRE was provided by Google Inc. Some additional +functionality was added by Giuseppe Maxia. This brief man page was constructed +from the notes in the <i>pcrecpp.h</i> file, which should be consulted for +further details. </P> <br><a name="SEC3" href="#TOC1">MATCHING INTERFACE</a><br> <P> @@ -148,7 +150,97 @@ NOTE: The UTF8 flag is ignored if pcre was not configured with the --enable-utf8 flag. </PRE> </P> -<br><a name="SEC6" href="#TOC1">SCANNING TEXT INCREMENTALLY</a><br> +<br><a name="SEC6" href="#TOC1">PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE</a><br> +<P> +PCRE defines some modifiers to change the behavior of the regular expression +engine. The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle to +pass such modifiers to a RE class. 
Currently, the following modifiers are +supported: +<pre> + modifier description Perl corresponding + + PCRE_CASELESS case insensitive match /i + PCRE_MULTILINE multiple lines match /m + PCRE_DOTALL dot matches newlines /s + PCRE_DOLLAR_ENDONLY $ matches only at end N/A + PCRE_EXTRA strict escape parsing N/A + PCRE_EXTENDED ignore whitespaces /x + PCRE_UTF8 handles UTF8 chars built-in + PCRE_UNGREEDY reverses * and *? N/A + PCRE_NO_AUTO_CAPTURE disables capturing parens N/A (*) +</pre> +(*) Both Perl and PCRE allow non capturing parentheses by means of the +"?:" modifier within the pattern itself. e.g. (?:ab|cd) does not +capture, while (ab|cd) does. +</P> +<P> +For a full account on how each modifier works, please check the +PCRE API reference page. +</P> +<P> +For each modifier, there are two member functions whose name is made +out of the modifier in lowercase, without the "PCRE_" prefix. For +instance, PCRE_CASELESS is handled by +<pre> + bool caseless() +</pre> +which returns true if the modifier is set, and +<pre> + RE_Options & set_caseless(bool) +</pre> +which sets or unsets the modifier. Moreover, PCRE_CONFIG_MATCH_LIMIT can be +accessed through the <b>set_match_limit()</b> and <b>match_limit()</b> member +functions. Setting <i>match_limit</i> to a non-zero value will limit the +execution of pcre to keep it from doing bad things like blowing the stack or +taking an eternity to return a result. A value of 5000 is good enough to stop +stack blowup in a 2MB thread stack. Setting <i>match_limit</i> to zero disables +match limiting. +</P> +<P> +Normally, to pass one or more modifiers to a RE class, you declare +a <i>RE_Options</i> object, set the appropriate options, and pass this +object to a RE constructor. Example: +<pre> + RE_Options opt; + opt.set_caseless(true); + if (RE("HELLO", opt).PartialMatch("hello world")) ... +</pre> +RE_Options has two constructors. The default constructor takes no arguments and +creates a set of flags that are off by default. 
The optional parameter +<i>option_flags</i> is to facilitate transfer of legacy code from C programs. +This lets you do +<pre> + RE(pattern, + RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str); +</pre> +However, new code is better off doing +<pre> + RE(pattern, + RE_Options().set_caseless(true).set_multiline(true)) + .PartialMatch(str); +</pre> +If you are going to pass one of the most used modifiers, there are some +convenience functions that return a RE_Options class with the +appropriate modifier already set: <b>CASELESS()</b>, <b>UTF8()</b>, +<b>MULTILINE()</b>, <b>DOTALL</b>(), and <b>EXTENDED()</b>. +</P> +<P> +If you need to set several options at once, and you don't want to go through +the pains of declaring a RE_Options object and setting several options, there +is a parallel method that gives you such ability on the fly. You can concatenate +several <b>set_xxxxx()</b> member functions, since each of them returns a +reference to its class object. For example, to pass PCRE_CASELESS, +PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one statement, you may write: +<pre> + RE(" ^ xyz \\s+ .* blah$", + RE_Options() + .set_caseless(true) + .set_extended(true) + .set_multiline(true)).PartialMatch(sometext); + +</PRE> +</P> +<br><a name="SEC7" href="#TOC1">SCANNING TEXT INCREMENTALLY</a><br> <P> The "Consume" operation may be useful if you want to repeatedly match regular expressions at the front of a string and skip over @@ -181,7 +273,7 @@ could extract all words from a string by repeatedly calling pcrecpp::RE("(\\w+)").FindAndConsume(&input, &word) </PRE> </P> -<br><a name="SEC7" href="#TOC1">PARSING HEX/OCTAL/C-RADIX NUMBERS</a><br> +<br><a name="SEC8" href="#TOC1">PARSING HEX/OCTAL/C-RADIX NUMBERS</a><br> <P> By default, if you pass a pointer to a numeric value, the corresponding text is interpreted as a base-10 number. You can @@ -199,7 +291,7 @@ prefixes, but defaults to base-10. </pre> will leave 64 in a, b, c, and d. 
</P> -<br><a name="SEC8" href="#TOC1">REPLACING PARTS OF STRINGS</a><br> +<br><a name="SEC9" href="#TOC1">REPLACING PARTS OF STRINGS</a><br> <P> You can replace the first match of "pattern" in "str" with "rewrite". Within "rewrite", backslash-escaped digits (\1 to \9) can be @@ -231,7 +323,7 @@ The non-matching portions of "text" are ignored. Returns true iff a match occurred and the extraction happened successfully; if no match occurs, the string is left unaffected. </P> -<br><a name="SEC9" href="#TOC1">AUTHOR</a><br> +<br><a name="SEC10" href="#TOC1">AUTHOR</a><br> <P> The C++ wrapper was contributed by Google Inc. <br> diff --git a/doc/pcre.txt b/doc/pcre.txt index 735f504..801bde2 100644 --- a/doc/pcre.txt +++ b/doc/pcre.txt @@ -4352,15 +4352,16 @@ SYNOPSIS OF C++ WRAPPER DESCRIPTION - The C++ wrapper for PCRE was provided by Google Inc. This brief man - page was constructed from the notes in the pcrecpp.h file, which should - be consulted for further details. + The C++ wrapper for PCRE was provided by Google Inc. Some additional + functionality was added by Giuseppe Maxia. This brief man page was con- + structed from the notes in the pcrecpp.h file, which should be con- + sulted for further details. MATCHING INTERFACE - The "FullMatch" operation checks that supplied text matches a supplied - pattern exactly. If pointer arguments are supplied, it copies matched + The "FullMatch" operation checks that supplied text matches a supplied + pattern exactly. If pointer arguments are supplied, it copies matched sub-strings that match sub-patterns into them. Example: successful match @@ -4374,10 +4375,10 @@ MATCHING INTERFACE Example: creating a temporary RE object: pcrecpp::RE("h.*o").FullMatch("hello"); - You can pass in a "const char*" or a "string" for "text". The examples - below tend to use a const char*. You can, as in the different examples - above, store the RE object explicitly in a variable or use a temporary - RE object. 
The examples below use one mode or the other arbitrarily. + You can pass in a "const char*" or a "string" for "text". The examples + below tend to use a const char*. You can, as in the different examples + above, store the RE object explicitly in a variable or use a temporary + RE object. The examples below use one mode or the other arbitrarily. Either could correctly be used for any of these examples. You must supply extra pointer arguments to extract matched subpieces. @@ -4403,7 +4404,7 @@ MATCHING INTERFACE Example: fails because string cannot be stored in integer !pcrecpp::RE("(.*)").FullMatch("ruby", &i); - The provided pointer arguments can be pointers to any scalar numeric + The provided pointer arguments can be pointers to any scalar numeric type, or one of: string (matched piece is copied to string) @@ -4411,7 +4412,7 @@ MATCHING INTERFACE T (where "bool T::ParseFrom(const char*, int)" exists) NULL (the corresponding matched sub-pattern is not copied) - The function returns true iff all of the following conditions are sat- + The function returns true iff all of the following conditions are sat- isfied: a. "text" matches "pattern" exactly; @@ -4425,14 +4426,14 @@ MATCHING INTERFACE number of sub-patterns, "i"th captured sub-pattern is ignored. - The matching interface supports at most 16 arguments per call. If you - need more, consider using the more general interface + The matching interface supports at most 16 arguments per call. If you + need more, consider using the more general interface pcrecpp::RE::DoMatch. See pcrecpp.h for the signature for DoMatch. PARTIAL MATCHES - You can use the "PartialMatch" operation when you want the pattern to + You can use the "PartialMatch" operation when you want the pattern to match any substring of the text. Example: simple search for a string: @@ -4447,13 +4448,13 @@ PARTIAL MATCHES UTF-8 AND THE MATCHING INTERFACE - By default, pattern and text are plain text, one byte per character. 
- The UTF8 flag, passed to the constructor, causes both pattern and + By default, pattern and text are plain text, one byte per character. + The UTF8 flag, passed to the constructor, causes both pattern and string to be treated as UTF-8 text, still a byte stream but potentially - multiple bytes per character. In practice, the text is likelier to be - UTF-8 than the pattern, but the match returned may depend on the UTF8 - flag, so always use it when matching UTF8 text. For example, "." will - match one byte normally but with UTF8 set may match up to three bytes + multiple bytes per character. In practice, the text is likelier to be + UTF-8 than the pattern, but the match returned may depend on the UTF8 + flag, so always use it when matching UTF8 text. For example, "." will + match one byte normally but with UTF8 set may match up to three bytes of a multi-byte character. Example: @@ -4470,12 +4471,98 @@ UTF-8 AND THE MATCHING INTERFACE --enable-utf8 flag. +PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE + + PCRE defines some modifiers to change the behavior of the regular + expression engine. The C++ wrapper defines an auxiliary class, + RE_Options, as a vehicle to pass such modifiers to a RE class. Cur- + rently, the following modifiers are supported: + + modifier description Perl corresponding + + PCRE_CASELESS case insensitive match /i + PCRE_MULTILINE multiple lines match /m + PCRE_DOTALL dot matches newlines /s + PCRE_DOLLAR_ENDONLY $ matches only at end N/A + PCRE_EXTRA strict escape parsing N/A + PCRE_EXTENDED ignore whitespaces /x + PCRE_UTF8 handles UTF8 chars built-in + PCRE_UNGREEDY reverses * and *? N/A + PCRE_NO_AUTO_CAPTURE disables capturing parens N/A (*) + + (*) Both Perl and PCRE allow non capturing parentheses by means of the + "?:" modifier within the pattern itself. e.g. (?:ab|cd) does not cap- + ture, while (ab|cd) does. + + For a full account on how each modifier works, please check the PCRE + API reference page. 
+ + For each modifier, there are two member functions whose name is made + out of the modifier in lowercase, without the "PCRE_" prefix. For + instance, PCRE_CASELESS is handled by + + bool caseless() + + which returns true if the modifier is set, and + + RE_Options & set_caseless(bool) + + which sets or unsets the modifier. Moreover, PCRE_CONFIG_MATCH_LIMIT + can be accessed through the set_match_limit() and match_limit() member + functions. Setting match_limit to a non-zero value will limit the exe- + cution of pcre to keep it from doing bad things like blowing the stack + or taking an eternity to return a result. A value of 5000 is good + enough to stop stack blowup in a 2MB thread stack. Setting match_limit + to zero disables match limiting. + + Normally, to pass one or more modifiers to a RE class, you declare a + RE_Options object, set the appropriate options, and pass this object to + a RE constructor. Example: + + RE_Options opt; + opt.set_caseless(true); + if (RE("HELLO", opt).PartialMatch("hello world")) ... + + RE_Options has two constructors. The default constructor takes no argu- + ments and creates a set of flags that are off by default. The optional + parameter option_flags is to facilitate transfer of legacy code from C + programs. This lets you do + + RE(pattern, + RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str); + + However, new code is better off doing + + RE(pattern, + RE_Options().set_caseless(true).set_multiline(true)) + .PartialMatch(str); + + If you are going to pass one of the most used modifiers, there are some + convenience functions that return a RE_Options class with the appropri- + ate modifier already set: CASELESS(), UTF8(), MULTILINE(), DOTALL(), + and EXTENDED(). + + If you need to set several options at once, and you don't want to go + through the pains of declaring a RE_Options object and setting several + options, there is a parallel method that gives you such ability on the + fly. 
You can concatenate several set_xxxxx() member functions, since + each of them returns a reference to its class object. For example, to + pass PCRE_CASELESS, PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one + statement, you may write: + + RE(" ^ xyz \\s+ .* blah$", + RE_Options() + .set_caseless(true) + .set_extended(true) + .set_multiline(true)).PartialMatch(sometext); + + SCANNING TEXT INCREMENTALLY - The "Consume" operation may be useful if you want to repeatedly match + The "Consume" operation may be useful if you want to repeatedly match regular expressions at the front of a string and skip over them as they - match. This requires use of the "StringPiece" type, which represents a - sub-range of a real string. Like RE, StringPiece is defined in the + match. This requires use of the "StringPiece" type, which represents a + sub-range of a real string. Like RE, StringPiece is defined in the pcrecpp namespace. Example: read lines of the form "var = value" from a string. @@ -4489,11 +4576,11 @@ SCANNING TEXT INCREMENTALLY ...; } - Each successful call to "Consume" will set "var/value", and also + Each successful call to "Consume" will set "var/value", and also advance "input" so it points past the matched text. - The "FindAndConsume" operation is similar to "Consume" but does not - anchor your match at the beginning of the string. For example, you + The "FindAndConsume" operation is similar to "Consume" but does not + anchor your match at the beginning of the string. For example, you could extract all words from a string by repeatedly calling pcrecpp::RE("(\\w+)").FindAndConsume(&input, &word) @@ -4502,10 +4589,10 @@ SCANNING TEXT INCREMENTALLY PARSING HEX/OCTAL/C-RADIX NUMBERS By default, if you pass a pointer to a numeric value, the corresponding - text is interpreted as a base-10 number. You can instead wrap the + text is interpreted as a base-10 number. 
You can instead wrap the pointer with a call to one of the operators Hex(), Octal(), or CRadix() - to interpret the text in another base. The CRadix operator interprets - C-style "0" (base-8) and "0x" (base-16) prefixes, but defaults to + to interpret the text in another base. The CRadix operator interprets + C-style "0" (base-8) and "0x" (base-16) prefixes, but defaults to base-10. Example: @@ -4520,30 +4607,30 @@ PARSING HEX/OCTAL/C-RADIX NUMBERS REPLACING PARTS OF STRINGS - You can replace the first match of "pattern" in "str" with "rewrite". - Within "rewrite", backslash-escaped digits (\1 to \9) can be used to - insert text matching corresponding parenthesized group from the pat- + You can replace the first match of "pattern" in "str" with "rewrite". + Within "rewrite", backslash-escaped digits (\1 to \9) can be used to + insert text matching corresponding parenthesized group from the pat- tern. \0 in "rewrite" refers to the entire matching text. For example: string s = "yabba dabba doo"; pcrecpp::RE("b+").Replace("d", &s); - will leave "s" containing "yada dabba doo". The result is true if the + will leave "s" containing "yada dabba doo". The result is true if the pattern matches and a replacement occurs, false otherwise. - GlobalReplace is like Replace except that it replaces all occurrences - of the pattern in the string with the rewrite. Replacements are not + GlobalReplace is like Replace except that it replaces all occurrences + of the pattern in the string with the rewrite. Replacements are not subject to re-matching. For example: string s = "yabba dabba doo"; pcrecpp::RE("b+").GlobalReplace("d", &s); - will leave "s" containing "yada dada doo". It returns the number of + will leave "s" containing "yada dada doo". It returns the number of replacements made. - Extract is like Replace, except that if the pattern matches, "rewrite" - is copied into "out" (an additional argument) with substitutions. The - non-matching portions of "text" are ignored. 
Returns true iff a match + Extract is like Replace, except that if the pattern matches, "rewrite" + is copied into "out" (an additional argument) with substitutions. The + non-matching portions of "text" are ignored. Returns true iff a match occurred and the extraction happened successfully; if no match occurs, the string is left unaffected. diff --git a/doc/pcrecpp.3 b/doc/pcrecpp.3 index abf7334..78ac564 100644 --- a/doc/pcrecpp.3 +++ b/doc/pcrecpp.3 @@ -11,9 +11,10 @@ PCRE - Perl-compatible regular expressions. .SH DESCRIPTION .rs .sp -The C++ wrapper for PCRE was provided by Google Inc. This brief man page was -constructed from the notes in the \fIpcrecpp.h\fP file, which should be -consulted for further details. +The C++ wrapper for PCRE was provided by Google Inc. Some additional +functionality was added by Giuseppe Maxia. This brief man page was constructed +from the notes in the \fIpcrecpp.h\fP file, which should be consulted for +further details. . . .SH "MATCHING INTERFACE" @@ -130,6 +131,93 @@ NOTE: The UTF8 flag is ignored if pcre was not configured with the --enable-utf8 flag. . . +.SH "PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE" +.rs +.sp +PCRE defines some modifiers to change the behavior of the regular expression +engine. The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle to +pass such modifiers to a RE class. Currently, the following modifiers are +supported: +.sp + modifier description Perl corresponding +.sp + PCRE_CASELESS case insensitive match /i + PCRE_MULTILINE multiple lines match /m + PCRE_DOTALL dot matches newlines /s + PCRE_DOLLAR_ENDONLY $ matches only at end N/A + PCRE_EXTRA strict escape parsing N/A + PCRE_EXTENDED ignore whitespaces /x + PCRE_UTF8 handles UTF8 chars built-in + PCRE_UNGREEDY reverses * and *? N/A + PCRE_NO_AUTO_CAPTURE disables capturing parens N/A (*) +.sp +(*) Both Perl and PCRE allow non capturing parentheses by means of the +"?:" modifier within the pattern itself. e.g. 
(?:ab|cd) does not +capture, while (ab|cd) does. +.P +For a full account on how each modifier works, please check the +PCRE API reference page. +.P +For each modifier, there are two member functions whose name is made +out of the modifier in lowercase, without the "PCRE_" prefix. For +instance, PCRE_CASELESS is handled by +.sp + bool caseless() +.sp +which returns true if the modifier is set, and +.sp + RE_Options & set_caseless(bool) +.sp +which sets or unsets the modifier. Moreover, PCRE_CONFIG_MATCH_LIMIT can be +accessed through the \fBset_match_limit()\fR and \fBmatch_limit()\fR member +functions. Setting \fImatch_limit\fR to a non-zero value will limit the +execution of pcre to keep it from doing bad things like blowing the stack or +taking an eternity to return a result. A value of 5000 is good enough to stop +stack blowup in a 2MB thread stack. Setting \fImatch_limit\fR to zero disables +match limiting. +.P +Normally, to pass one or more modifiers to a RE class, you declare +a \fIRE_Options\fR object, set the appropriate options, and pass this +object to a RE constructor. Example: +.sp + RE_Options opt; + opt.set_caseless(true); + if (RE("HELLO", opt).PartialMatch("hello world")) ... +.sp +RE_Options has two constructors. The default constructor takes no arguments and +creates a set of flags that are off by default. The optional parameter +\fIoption_flags\fR is to facilitate transfer of legacy code from C programs. +This lets you do +.sp + RE(pattern, + RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str); +.sp +However, new code is better off doing +.sp + RE(pattern, + RE_Options().set_caseless(true).set_multiline(true)) + .PartialMatch(str); +.sp +If you are going to pass one of the most used modifiers, there are some +convenience functions that return a RE_Options class with the +appropriate modifier already set: \fBCASELESS()\fR, \fBUTF8()\fR, +\fBMULTILINE()\fR, \fBDOTALL\fR(), and \fBEXTENDED()\fR. 
+.P +If you need to set several options at once, and you don't want to go through +the pains of declaring a RE_Options object and setting several options, there +is a parallel method that gives you such ability on the fly. You can concatenate +several \fBset_xxxxx()\fR member functions, since each of them returns a +reference to its class object. For example, to pass PCRE_CASELESS, +PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one statement, you may write: +.sp + RE(" ^ xyz \e\es+ .* blah$", + RE_Options() + .set_caseless(true) + .set_extended(true) + .set_multiline(true)).PartialMatch(sometext); +.sp +. +. .SH "SCANNING TEXT INCREMENTALLY" .rs .sp diff --git a/pcre_compile.c b/pcre_compile.c index c592a49..2289952 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -698,7 +698,18 @@ read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr) int min = 0; int max = -1; +/* Read the minimum value and do a paranoid check: a negative value indicates +an integer overflow. */ + while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; +if (min < 0 || min > 65535) + { + *errorcodeptr = ERR5; + return p; + } + +/* Read the maximum value if there is one, and again do a paranoid check on its size. +Also, max must not be less than min. */ if (*p == '}') max = min; else { @@ -706,6 +717,11 @@ if (*p == '}') max = min; else { max = 0; while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; + if (max < 0 || max > 65535) + { + *errorcodeptr = ERR5; + return p; + } if (max < min) { *errorcodeptr = ERR4; @@ -714,16 +730,11 @@ if (*p == '}') max = min; else } } -/* Do paranoid checks, then fill in the required variables, and pass back the -pointer to the terminating '}'. */ +/* Fill in the required variables, and pass back the pointer to the terminating +'}'.
*/ -if (min > 65535 || max > 65535) - *errorcodeptr = ERR5; -else - { - *minp = min; - *maxp = max; - } +*minp = min; +*maxp = max; return p; } @@ -3856,6 +3867,7 @@ BOOL utf8; BOOL class_utf8; #endif BOOL inescq = FALSE; +BOOL capturing; unsigned int brastackptr = 0; size_t size; uschar *code; @@ -4410,6 +4422,7 @@ while ((c = *(++ptr)) != 0) case '(': branch_newextra = 0; bracket_length = 1 + LINK_SIZE; + capturing = FALSE; /* Handle special forms of bracket, which all start (? */ @@ -4497,6 +4510,9 @@ while ((c = *(++ptr)) != 0) case 'P': ptr += 3; + + /* Handle the definition of a named subpattern */ + if (*ptr == '<') { const uschar *p; /* Don't amalgamate; some compilers */ @@ -4509,9 +4525,12 @@ while ((c = *(++ptr)) != 0) } name_count++; if (ptr - p > max_name_size) max_name_size = (ptr - p); + capturing = TRUE; /* Named parentheses are always capturing */ break; } + /* Handle back references and recursive calls to named subpatterns */ + if (*ptr == '=' || *ptr == '>') { while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0); @@ -4695,18 +4714,24 @@ while ((c = *(++ptr)) != 0) continue; } - /* If options were terminated by ':' control comes here. Fall through - to handle the group below. */ + /* If options were terminated by ':' control comes here. This is a + non-capturing group with an options change. There is nothing more that + needs to be done because "capturing" is already set FALSE by default; + we can just fall through. */ + } } - /* Extracting brackets must be counted so we can process escapes in a - Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to - need an additional 3 bytes of store per extracting bracket. However, if - PCRE_NO_AUTO)CAPTURE is set, unadorned brackets become non-capturing, so we - must leave the count alone (it will aways be zero). */ + /* Ordinary parentheses, not followed by '?', are capturing unless + PCRE_NO_AUTO_CAPTURE is set. 
*/ + + else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0; + + /* Capturing brackets must be counted so we can process escapes in a + Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need + an additional 3 bytes of memory per capturing bracket. */ - else if ((options & PCRE_NO_AUTO_CAPTURE) == 0) + if (capturing) { bracount++; if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3; @@ -36,6 +36,7 @@ #include <assert.h> #include <errno.h> #include <string> +#include <algorithm> #include "config.h" // We need this to compile the proper dll on windows/msys. This is copied // from pcre_internal.h. It would probably be better just to include that. @@ -97,8 +98,7 @@ RE::~RE() { pcre* RE::Compile(Anchor anchor) { // First, convert RE_Options into pcre options int pcre_options = 0; - if (options_.utf8()) - pcre_options |= PCRE_UTF8; + pcre_options = options_.all_options(); // Special treatment for anchoring. This is needed because at // runtime pcre only provides an option for anchoring at the @@ -378,7 +378,7 @@ bool RE::Extract(const StringPiece& rewrite, int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize); if (matches == 0) return false; - out->clear(); + out->erase(); return Rewrite(out, rewrite, text, vec, matches); } diff --git a/pcrecpp.h.in b/pcrecpp.h.in index c0d3050..c1da0c8 100644 --- a/pcrecpp.h.in +++ b/pcrecpp.h.in @@ -28,6 +28,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author: Sanjay Ghemawat +// Support for PCRE_XXX modifiers added by Giuseppe Maxia, July 2005 #ifndef _PCRE_REGEXP_H #define _PCRE_REGEXP_H @@ -159,6 +160,90 @@ // --enable-utf8 flag. // // ----------------------------------------------------------------------- +// PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE +// +// PCRE defines some modifiers to change the behavior of the regular +// expression engine. +// The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle +// to pass such modifiers to a RE class. 
+// +// Currently, the following modifiers are supported +// +// modifier description Perl corresponding +// +// PCRE_CASELESS case insensitive match /i +// PCRE_MULTILINE multiple lines match /m +// PCRE_DOTALL dot matches newlines /s +// PCRE_DOLLAR_ENDONLY $ matches only at end N/A +// PCRE_EXTRA strict escape parsing N/A +// PCRE_EXTENDED ignore whitespaces /x +// PCRE_UTF8 handles UTF8 chars built-in +// PCRE_UNGREEDY reverses * and *? N/A +// PCRE_NO_AUTO_CAPTURE disables matching parens N/A (*) +// +// (For a full account on how each modifier works, please check the +// PCRE API reference manual). +// +// (*) Both Perl and PCRE allow non matching parentheses by means of the +// "?:" modifier within the pattern itself. e.g. (?:ab|cd) does not +// capture, while (ab|cd) does. +// +// For each modifier, there are two member functions whose name is made +// out of the modifier in lowercase, without the "PCRE_" prefix. For +// instance, PCRE_CASELESS is handled by +// bool caseless(), +// which returns true if the modifier is set, and +// RE_Options & set_caseless(bool), +// which sets or unsets the modifier. +// +// Moreover, PCRE_CONFIG_MATCH_LIMIT can be accessed through the +// set_match_limit() and match_limit() member functions. +// Setting match_limit to a non-zero value will limit the executation of +// pcre to keep it from doing bad things like blowing the stack or taking +// an eternity to return a result. A value of 5000 is good enough to stop +// stack blowup in a 2MB thread stack. Setting match_limit to zero will +// disable match limiting. +// +// Normally, to pass one or more modifiers to a RE class, you declare +// a RE_Options object, set the appropriate options, and pass this +// object to a RE constructor. Example: +// +// RE_options opt; +// opt.set_caseless(true); +// +// if (RE("HELLO", opt).PartialMatch("hello world")) ... +// +// RE_options has two constructors. 
The default constructor takes no +// arguments and creates a set of flags that are off by default. +// +// The optional parameter 'option_flags' is to facilitate transfer +// of legacy code from C programs. This lets you do +// RE(pattern, RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str); +// +// But new code is better off doing +// RE(pattern, +// RE_Options().set_caseless(true).set_multiline(true)).PartialMatch(str); +// (See below) +// +// If you are going to pass one of the most used modifiers, there are some +// convenience functions that return a RE_Options class with the +// appropriate modifier already set: +// CASELESS(), UTF8(), MULTILINE(), DOTALL(), EXTENDED() +// +// If you need to set several options at once, and you don't want to go +// through the pains of declaring a RE_Options object and setting several +// options, there is a parallel method that give you such ability on the +// fly. You can concatenate several set_xxxxx member functions, since each +// of them returns a reference to its class object. 
e.g.: to pass +// PCRE_CASELESS, PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one +// statement, you may write +// +// RE(" ^ xyz \\s+ .* blah$", RE_Options() +// .set_caseless(true) +// .set_extended(true) +// .set_multiline(true)).PartialMatch(sometext); +// +// ----------------------------------------------------------------------- // SCANNING TEXT INCREMENTALLY // // The "Consume" operation may be useful if you want to repeatedly @@ -245,6 +330,13 @@ namespace pcrecpp { +#define PCRE_SET_OR_CLEAR(b, o) \ + if (b) all_options_ |= (o); else all_options_ &= ~(o); \ + return *this + +#define PCRE_IS_SET(o) \ + (all_options_ & o) == o + // We convert user-passed pointers into special Arg objects class Arg; extern Arg no_arg; @@ -252,44 +344,128 @@ extern Arg no_arg; /***** Compiling regular expressions: the RE class *****/ // RE_Options allow you to set options to be passed along to pcre, -// along with other options we put on top of pcre. Only UTF and -// match_limit are supported now. Setting match_limit -// to a non-zero value will limit the executation of pcre to -// keep it from doing bad things like blowing the stack or taking -// an eternity to return a result. A value of 5000 is good enough -// to stop stack blowup in a 2MB thread stack. -// Setting match_limit to zero will disable match limiting. +// along with other options we put on top of pcre. +// Only 9 modifiers, plus match_limit are supported now. class RE_Options { public: // constructor - RE_Options() : match_limit_(0), utf8_(false) {} + RE_Options() : match_limit_(0), all_options_(0) {} + + // alternative constructor. 
+ // To facilitate transfer of legacy code from C programs + // + // This lets you do + // RE(pattern, RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str); + // But new code is better off doing + // RE(pattern, + // RE_Options().set_caseless(true).set_multiline(true)).PartialMatch(str); + RE_Options(int option_flags) : match_limit_(0), all_options_ (option_flags) {} // we're fine with the default destructor, copy constructor, etc. // accessors and mutators int match_limit() const { return match_limit_; }; - void set_match_limit(int limit) { + RE_Options &set_match_limit(int limit) { match_limit_ = limit; + return *this; + } + + bool caseless() const { + return PCRE_IS_SET(PCRE_CASELESS); + } + RE_Options &set_caseless(bool x) { + PCRE_SET_OR_CLEAR(x, PCRE_CASELESS); + } + + bool multiline() const { + return PCRE_IS_SET(PCRE_MULTILINE); + } + RE_Options &set_multiline(bool x) { + PCRE_SET_OR_CLEAR(x, PCRE_MULTILINE); } - bool utf8() const { return utf8_; } - void set_utf8(bool u) { - utf8_ = u; + bool dotall() const { + return PCRE_IS_SET(PCRE_DOTALL); + } + RE_Options &set_dotall(bool x) { + PCRE_SET_OR_CLEAR(x,PCRE_DOTALL); + } + + bool extended() const { + return PCRE_IS_SET(PCRE_EXTENDED); + } + RE_Options &set_extended(bool x) { + PCRE_SET_OR_CLEAR(x,PCRE_EXTENDED); + } + + bool dollar_endonly() const { + return PCRE_IS_SET(PCRE_DOLLAR_ENDONLY); + } + RE_Options &set_dollar_endonly(bool x) { + PCRE_SET_OR_CLEAR(x,PCRE_DOLLAR_ENDONLY); + } + + bool extra() const { + return PCRE_IS_SET( PCRE_EXTRA); + } + RE_Options &set_extra(bool x) { + PCRE_SET_OR_CLEAR(x, PCRE_EXTRA); + } + + bool ungreedy() const { + return PCRE_IS_SET(PCRE_UNGREEDY); + } + RE_Options &set_ungreedy(bool x) { + PCRE_SET_OR_CLEAR(x, PCRE_UNGREEDY); + } + + bool utf8() const { + return PCRE_IS_SET(PCRE_UTF8); + } + RE_Options &set_utf8(bool x) { + PCRE_SET_OR_CLEAR(x, PCRE_UTF8); + } + + bool no_auto_capture() const { + return PCRE_IS_SET(PCRE_NO_AUTO_CAPTURE); + } + RE_Options 
&set_no_auto_capture(bool x) { + PCRE_SET_OR_CLEAR(x, PCRE_NO_AUTO_CAPTURE); + } + + RE_Options &set_all_options(int opt) { + all_options_ = opt; + return *this; + } + int all_options() const { + return all_options_ ; } // TODO: add other pcre flags private: int match_limit_; - bool utf8_; + int all_options_; }; // These functions return some common RE_Options static inline RE_Options UTF8() { - RE_Options options; - options.set_utf8(true); - return options; + return RE_Options().set_utf8(true); } +static inline RE_Options CASELESS() { + return RE_Options().set_caseless(true); +} +static inline RE_Options MULTILINE() { + return RE_Options().set_multiline(true); +} + +static inline RE_Options DOTALL() { + return RE_Options().set_dotall(true); +} + +static inline RE_Options EXTENDED() { + return RE_Options().set_extended(true); +} // Interface for regular expression matching. Also corresponds to a // pre-compiled regular expression. An "RE" object is safe for @@ -600,8 +776,11 @@ MAKE_INTEGER_PARSER(long long, longlong); MAKE_INTEGER_PARSER(unsigned long long, ulonglong); #endif +#undef PCRE_IS_SET +#undef PCRE_SET_OR_CLEAR #undef MAKE_INTEGER_PARSER } // namespace pcrecpp + #endif /* _PCRE_REGEXP_H */ diff --git a/pcrecpp_unittest.cc b/pcrecpp_unittest.cc index 000c12e..6a03744 100644 --- a/pcrecpp_unittest.cc +++ b/pcrecpp_unittest.cc @@ -43,6 +43,8 @@ using pcrecpp::Hex; using pcrecpp::Octal; using pcrecpp::CRadix; +static bool VERBOSE_TEST = false; + // CHECK dies with a fatal error if condition is not true. It is *not* // controlled by NDEBUG, so the check will be executed regardless of // compilation mode. Therefore, it is safe to do things like: @@ -363,6 +365,227 @@ static void TestRecursion(int size, const char *pattern, int match_limit) { re.FullMatch(domain); } +// +// Options tests contributed by +// Giuseppe Maxia, CTO, Stardata s.r.l. 
+// July 2005 +// +static void GetOneOptionResult( + const char *option_name, + const char *regex, + const char *str, + RE_Options options, + bool full, + string expected) { + + printf("Testing Option <%s>\n", option_name); + if(VERBOSE_TEST) + printf("/%s/ finds \"%s\" within \"%s\" \n", + regex, + expected.c_str(), + str); + string captured(""); + if (full) + RE(regex,options).FullMatch(str, &captured); + else + RE(regex,options).PartialMatch(str, &captured); + CHECK_EQ(captured, expected); +} + +static void TestOneOption( + const char *option_name, + const char *regex, + const char *str, + RE_Options options, + bool full, + bool assertive = true) { + + printf("Testing Option <%s>\n", option_name); + if (VERBOSE_TEST) + printf("'%s' %s /%s/ \n", + str, + (assertive? "matches" : "doesn't match"), + regex); + if (assertive) { + if (full) + CHECK(RE(regex,options).FullMatch(str)); + else + CHECK(RE(regex,options).PartialMatch(str)); + } else { + if (full) + CHECK(!RE(regex,options).FullMatch(str)); + else + CHECK(!RE(regex,options).PartialMatch(str)); + } +} + +static void Test_CASELESS() { + RE_Options options; + RE_Options options2; + + options.set_caseless(true); + TestOneOption("CASELESS (class)", "HELLO", "hello", options, false); + TestOneOption("CASELESS (class2)", "HELLO", "hello", options2.set_caseless(true), false); + TestOneOption("CASELESS (class)", "^[A-Z]+$", "Hello", options, false); + + TestOneOption("CASELESS (function)", "HELLO", "hello", pcrecpp::CASELESS(), false); + TestOneOption("CASELESS (function)", "^[A-Z]+$", "Hello", pcrecpp::CASELESS(), false); + options.set_caseless(false); + TestOneOption("no CASELESS", "HELLO", "hello", options, false, false); +} + +static void Test_MULTILINE() { + RE_Options options; + RE_Options options2; + const char *str = "HELLO\n" "cruel\n" "world\n"; + + options.set_multiline(true); + TestOneOption("MULTILINE (class)", "^cruel$", str, options, false); + TestOneOption("MULTILINE (class2)", "^cruel$", str, 
options2.set_multiline(true), false); + TestOneOption("MULTILINE (function)", "^cruel$", str, pcrecpp::MULTILINE(), false); + options.set_multiline(false); + TestOneOption("no MULTILINE", "^cruel$", str, options, false, false); +} + +static void Test_DOTALL() { + RE_Options options; + RE_Options options2; + const char *str = "HELLO\n" "cruel\n" "world"; + + options.set_dotall(true); + TestOneOption("DOTALL (class)", "HELLO.*world", str, options, true); + TestOneOption("DOTALL (class2)", "HELLO.*world", str, options2.set_dotall(true), true); + TestOneOption("DOTALL (function)", "HELLO.*world", str, pcrecpp::DOTALL(), true); + options.set_dotall(false); + TestOneOption("no DOTALL", "HELLO.*world", str, options, true, false); +} + +static void Test_DOLLAR_ENDONLY() { + RE_Options options; + RE_Options options2; + const char *str = "HELLO world\n"; + + TestOneOption("no DOLLAR_ENDONLY", "world$", str, options, false); + options.set_dollar_endonly(true); + TestOneOption("DOLLAR_ENDONLY 1", "world$", str, options, false, false); + TestOneOption("DOLLAR_ENDONLY 2", "world$", str, options2.set_dollar_endonly(true), false, false); +} + +static void Test_EXTRA() { + RE_Options options; + const char *str = "HELLO"; + + options.set_extra(true); + TestOneOption("EXTRA 1", "\\HELL\\O", str, options, true, false ); + TestOneOption("EXTRA 2", "\\HELL\\O", str, RE_Options().set_extra(true), true, false ); + options.set_extra(false); + TestOneOption("no EXTRA", "\\HELL\\O", str, options, true ); +} + +static void Test_EXTENDED() { + RE_Options options; + RE_Options options2; + const char *str = "HELLO world"; + + options.set_extended(true); + TestOneOption("EXTENDED (class)", "HELLO world", str, options, false, false); + TestOneOption("EXTENDED (class2)", "HELLO world", str, options2.set_extended(true), false, false); + TestOneOption("EXTENDED (class)", + "^ HE L{2} O " + "\\s+ " + "\\w+ $ ", + str, + options, + false); + + TestOneOption("EXTENDED (function)", "HELLO world", str, 
pcrecpp::EXTENDED(), false, false); + TestOneOption("EXTENDED (function)", + "^ HE L{2} O " + "\\s+ " + "\\w+ $ ", + str, + pcrecpp::EXTENDED(), + false); + + options.set_extended(false); + TestOneOption("no EXTENDED", "HELLO world", str, options, false); +} + +static void Test_NO_AUTO_CAPTURE() { + RE_Options options; + const char *str = "HELLO world"; + string captured; + + printf("Testing Option <no NO_AUTO_CAPTURE>\n"); + if (VERBOSE_TEST) + printf("parentheses capture text\n"); + RE re("(world|universe)$", options); + CHECK(re.Extract("\\1", str , &captured)); + CHECK_EQ(captured, "world"); + options.set_no_auto_capture(true); + printf("testing Option <NO_AUTO_CAPTURE>\n"); + if (VERBOSE_TEST) + printf("parentheses do not capture text\n"); + re.Extract("\\1",str, &captured ); + CHECK_EQ(captured, "world"); +} + +static void Test_UNGREEDY() { + RE_Options options; + const char *str = "HELLO, 'this' is the 'world'"; + + options.set_ungreedy(true); + GetOneOptionResult("UNGREEDY 1", "('.*')", str, options, false, "'this'" ); + GetOneOptionResult("UNGREEDY 2", "('.*')", str, RE_Options().set_ungreedy(true), false, "'this'" ); + GetOneOptionResult("UNGREEDY", "('.*?')", str, options, false, "'this' is the 'world'" ); + + options.set_ungreedy(false); + GetOneOptionResult("no UNGREEDY", "('.*')", str, options, false, "'this' is the 'world'" ); + GetOneOptionResult("no UNGREEDY", "('.*?')", str, options, false, "'this'" ); +} + +static void Test_all_options() { + const char *str = "HELLO\n" "cruel\n" "world"; + RE_Options options; + options.set_all_options(PCRE_CASELESS | PCRE_DOTALL); + + TestOneOption("all_options (CASELESS|DOTALL)", "^hello.*WORLD", str , options, false); + options.set_all_options(0); + TestOneOption("all_options (0)", "^hello.*WORLD", str , options, false, false); + options.set_all_options(PCRE_MULTILINE | PCRE_EXTENDED); + + TestOneOption("all_options (MULTILINE|EXTENDED)", " ^ c r u e l $ ", str, options, false); + TestOneOption("all_options 
(MULTILINE|EXTENDED) with constructor", + " ^ c r u e l $ ", + str, + RE_Options(PCRE_MULTILINE | PCRE_EXTENDED), + false); + + TestOneOption("all_options (MULTILINE|EXTENDED) with concatenation", + " ^ c r u e l $ ", + str, + RE_Options() + .set_multiline(true) + .set_extended(true), + false); + + options.set_all_options(0); + TestOneOption("all_options (0)", "^ c r u e l $", str, options, false, false); + +} + +static void TestOptions() { + printf("Testing Options\n"); + Test_CASELESS(); + Test_MULTILINE(); + Test_DOTALL(); + Test_DOLLAR_ENDONLY(); + Test_EXTENDED(); + Test_NO_AUTO_CAPTURE(); + Test_UNGREEDY(); + Test_EXTRA(); + Test_all_options(); +} int main(int argc, char** argv) { // Treat any flag as --help @@ -807,6 +1030,11 @@ int main(int argc, char** argv) { TestRecursion(bytes, "ab.", matchlimit); TestRecursion(bytes, "abc.", matchlimit); + // Test Options + if (getenv("VERBOSE_TEST") != NULL) + VERBOSE_TEST = true; + TestOptions(); + // Done printf("OK\n"); diff --git a/testdata/grepoutput b/testdata/grepoutput index 20a6f79..27ab7e4 100644 --- a/testdata/grepoutput +++ b/testdata/grepoutput @@ -1,4 +1,4 @@ -pcregrep version 4.0 07-Jun-2005 using PCRE version 6.1 21-Jun-2005 +pcregrep version 4.0 07-Jun-2005 using PCRE version 6.2 01-Aug-2005 ---------------------------- Test 1 ------------------------------ PATTERN at the start of a line. In the middle of a line, PATTERN appears. 
diff --git a/testdata/testinput2 b/testdata/testinput2 index dcb5609..befb65a 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -1428,5 +1428,26 @@ AbCd ** Failers abcd + +/a{11111111111111111111}/ + +/(){64294967295}/ + +/(){2,4294967295}/ + +"(?i:a)(?i:b)(?i:c)(?i:d)(?i:e)(?i:f)(?i:g)(?i:h)(?i:i)(?i:j)(k)(?i:l)A\1B" + abcdefghijklAkB + +"(?P<n0>a)(?P<n1>b)(?P<n2>c)(?P<n3>d)(?P<n4>e)(?P<n5>f)(?P<n6>g)(?P<n7>h)(?P<n8>i)(?P<n9>j)(?P<n10>k)(?P<n11>l)A\11B" + abcdefghijklAkB + +"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)A\11B" + abcdefghijklAkB + +"(?P<name0>a)(?P<name1>a)(?P<name2>a)(?P<name3>a)(?P<name4>a)(?P<name5>a)(?P<name6>a)(?P<name7>a)(?P<name8>a)(?P<name9>a)(?P<name10>a)(?P<name11>a)(?P<name12>a)(?P<name13>a)(?P<name14>a)(?P<name15>a)(?P<name16>a)(?P<name17>a)(?P<name18>a)(?P<name19>a)(?P<name20>a)(?P<name21>a)(?P<name22>a)(?P<name23>a)(?P<name24>a)(?P<name25>a)(?P<name26>a)(?P<name27>a)(?P<name28>a)(?P<name29>a)(?P<name30>a)(?P<name31>a)(?P<name32>a)(?P<name33>a)(?P<name34>a)(?P<name35>a)(?P<name36>a)(?P<name37>a)(?P<name38>a)(?P<name39>a)(?P<name40>a)(?P<name41>a)(?P<name42>a)(?P<name43>a)(?P<name44>a)(?P<name45>a)(?P<name46>a)(?P<name47>a)(?P<name48>a)(?P<name49>a)(?P<name50>a)(?P<name51>a)(?P<name52>a)(?P<name53>a)(?P<name54>a)(?P<name55>a)(?P<name56>a)(?P<name57>a)(?P<name58>a)(?P<name59>a)(?P<name60>a)(?P<name61>a)(?P<name62>a)(?P<name63>a)(?P<name64>a)(?P<name65>a)(?P<name66>a)(?P<name67>a)(?P<name68>a)(?P<name69>a)(?P<name70>a)(?P<name71>a)(?P<name72>a)(?P<name73>a)(?P<name74>a)(?P<name75>a)(?P<name76>a)(?P<name77>a)(?P<name78>a)(?P<name79>a)(?P<name80>a)(?P<name81>a)(?P<name82>a)(?P<name83>a)(?P<name84>a)(?P<name85>a)(?P<name86>a)(?P<name87>a)(?P<name88>a)(?P<name89>a)(?P<name90>a)(?P<name91>a)(?P<name92>a)(?P<name93>a)(?P<name94>a)(?P<name95>a)(?P<name96>a)(?P<name97>a)(?P<name98>a)(?P<name99>a)(?P<name100>a)" + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + 
+"(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)" + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa / End of testinput2 / diff --git a/testdata/testoutput1 b/testdata/testoutput1 index 0eef50e..c823162 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -1,4 +1,4 @@ -PCRE version 6.1 21-Jun-2005 +PCRE version 6.2 01-Aug-2005 /the quick brown fox/ the quick brown fox diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 823c67a..8f078b8 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -1,4 +1,4 @@ -PCRE version 6.1 21-Jun-2005 +PCRE version 6.2 01-Aug-2005 /(a)b|/ Capturing subpattern count = 1 @@ -5715,6 +5715,229 @@ Need char = 'd' No match abcd No match + +/a{11111111111111111111}/ +Failed: number too big in {} quantifier at offset 22 + +/(){64294967295}/ +Failed: number too big in {} quantifier at offset 14 + +/(){2,4294967295}/ +Failed: number too big in {} quantifier at offset 15 + +"(?i:a)(?i:b)(?i:c)(?i:d)(?i:e)(?i:f)(?i:g)(?i:h)(?i:i)(?i:j)(k)(?i:l)A\1B" +Capturing subpattern count = 1 +Max back reference = 1 +No options +Case state changes +First char = 'a' (caseless) +Need char = 'B' + abcdefghijklAkB + 0: abcdefghijklAkB + 1: k + +"(?P<n0>a)(?P<n1>b)(?P<n2>c)(?P<n3>d)(?P<n4>e)(?P<n5>f)(?P<n6>g)(?P<n7>h)(?P<n8>i)(?P<n9>j)(?P<n10>k)(?P<n11>l)A\11B" +Capturing subpattern count = 12 +Max back reference = 11 +Named capturing subpatterns: + n0 1 + n1 2 + n10 11 + n11 12 + n2 3 + n3 4 + n4 5 + n5 6 + n6 7 + n7 8 + n8 9 + n9 10 +No options +First char = 'a' +Need char = 'B' + abcdefghijklAkB + 0: abcdefghijklAkB + 1: a + 2: b + 3: c + 4: d + 5: e + 6: f + 7: g + 8: h + 9: i +10: j +11: k +12: l + 
+"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)A\11B" +Capturing subpattern count = 12 +Max back reference = 11 +No options +First char = 'a' +Need char = 'B' + abcdefghijklAkB + 0: abcdefghijklAkB + 1: a + 2: b + 3: c + 4: d + 5: e + 6: f + 7: g + 8: h + 9: i +10: j +11: k +12: l + +"(?P<name0>a)(?P<name1>a)(?P<name2>a)(?P<name3>a)(?P<name4>a)(?P<name5>a)(?P<name6>a)(?P<name7>a)(?P<name8>a)(?P<name9>a)(?P<name10>a)(?P<name11>a)(?P<name12>a)(?P<name13>a)(?P<name14>a)(?P<name15>a)(?P<name16>a)(?P<name17>a)(?P<name18>a)(?P<name19>a)(?P<name20>a)(?P<name21>a)(?P<name22>a)(?P<name23>a)(?P<name24>a)(?P<name25>a)(?P<name26>a)(?P<name27>a)(?P<name28>a)(?P<name29>a)(?P<name30>a)(?P<name31>a)(?P<name32>a)(?P<name33>a)(?P<name34>a)(?P<name35>a)(?P<name36>a)(?P<name37>a)(?P<name38>a)(?P<name39>a)(?P<name40>a)(?P<name41>a)(?P<name42>a)(?P<name43>a)(?P<name44>a)(?P<name45>a)(?P<name46>a)(?P<name47>a)(?P<name48>a)(?P<name49>a)(?P<name50>a)(?P<name51>a)(?P<name52>a)(?P<name53>a)(?P<name54>a)(?P<name55>a)(?P<name56>a)(?P<name57>a)(?P<name58>a)(?P<name59>a)(?P<name60>a)(?P<name61>a)(?P<name62>a)(?P<name63>a)(?P<name64>a)(?P<name65>a)(?P<name66>a)(?P<name67>a)(?P<name68>a)(?P<name69>a)(?P<name70>a)(?P<name71>a)(?P<name72>a)(?P<name73>a)(?P<name74>a)(?P<name75>a)(?P<name76>a)(?P<name77>a)(?P<name78>a)(?P<name79>a)(?P<name80>a)(?P<name81>a)(?P<name82>a)(?P<name83>a)(?P<name84>a)(?P<name85>a)(?P<name86>a)(?P<name87>a)(?P<name88>a)(?P<name89>a)(?P<name90>a)(?P<name91>a)(?P<name92>a)(?P<name93>a)(?P<name94>a)(?P<name95>a)(?P<name96>a)(?P<name97>a)(?P<name98>a)(?P<name99>a)(?P<name100>a)" +Capturing subpattern count = 101 +Named capturing subpatterns: + name0 1 + name1 2 + name10 11 + name100 101 + name11 12 + name12 13 + name13 14 + name14 15 + name15 16 + name16 17 + name17 18 + name18 19 + name19 20 + name2 3 + name20 21 + name21 22 + name22 23 + name23 24 + name24 25 + name25 26 + name26 27 + name27 28 + name28 29 + name29 30 + name3 4 + name30 31 + name31 32 + name32 33 + name33 34 + name34 
35 + name35 36 + name36 37 + name37 38 + name38 39 + name39 40 + name4 5 + name40 41 + name41 42 + name42 43 + name43 44 + name44 45 + name45 46 + name46 47 + name47 48 + name48 49 + name49 50 + name5 6 + name50 51 + name51 52 + name52 53 + name53 54 + name54 55 + name55 56 + name56 57 + name57 58 + name58 59 + name59 60 + name6 7 + name60 61 + name61 62 + name62 63 + name63 64 + name64 65 + name65 66 + name66 67 + name67 68 + name68 69 + name69 70 + name7 8 + name70 71 + name71 72 + name72 73 + name73 74 + name74 75 + name75 76 + name76 77 + name77 78 + name78 79 + name79 80 + name8 9 + name80 81 + name81 82 + name82 83 + name83 84 + name84 85 + name85 86 + name86 87 + name87 88 + name88 89 + name89 90 + name9 10 + name90 91 + name91 92 + name92 93 + name93 94 + name94 95 + name95 96 + name96 97 + name97 98 + name98 99 + name99 100 +No options +First char = 'a' +Need char = 'a' + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +Matched, but too many substrings + 0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + 1: a + 2: a + 3: a + 4: a + 5: a + 6: a + 7: a + 8: a + 9: a +10: a +11: a +12: a +13: a +14: a + +"(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)(a)" +Capturing subpattern count = 101 +No options +First char = 'a' +Need char = 'a' + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +Matched, but too many substrings + 0: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + 1: a + 2: a + 3: a + 4: a + 5: a + 6: a + 7: a + 8: a + 9: a +10: a +11: a +12: a +13: a +14: a / End of testinput2 / Capturing 
subpattern count = 0 diff --git a/testdata/testoutput3 b/testdata/testoutput3 index e58c9c7..16ebc71 100644 --- a/testdata/testoutput3 +++ b/testdata/testoutput3 @@ -1,4 +1,4 @@ -PCRE version 6.1 21-Jun-2005 +PCRE version 6.2 01-Aug-2005 /^[\w]+/ *** Failers diff --git a/testdata/testoutput4 b/testdata/testoutput4 index af4a821..234fd1a 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -1,4 +1,4 @@ -PCRE version 6.1 21-Jun-2005 +PCRE version 6.2 01-Aug-2005 /-- Do not use the \x{} construct except with patterns that have the --/ /-- /8 option set, because PCRE doesn't recognize them as UTF-8 unless --/ diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 6b694a7..6e0d418 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -1,4 +1,4 @@ -PCRE version 6.1 21-Jun-2005 +PCRE version 6.2 01-Aug-2005 /\x{100}/8DM Memory allocation (code space): 10 diff --git a/testdata/testoutput6 b/testdata/testoutput6 index 8889b05..9f9421e 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -1,4 +1,4 @@ -PCRE version 6.1 21-Jun-2005 +PCRE version 6.2 01-Aug-2005 /^\pC\pL\pM\pN\pP\pS\pZ</8 \x7f\x{c0}\x{30f}\x{660}\x{66c}\x{f01}\x{1680}< diff --git a/testdata/testoutput7 b/testdata/testoutput7 index 421efb2..8e55069 100644 --- a/testdata/testoutput7 +++ b/testdata/testoutput7 @@ -1,4 +1,4 @@ -PCRE version 6.1 21-Jun-2005 +PCRE version 6.2 01-Aug-2005 /abc/ abc diff --git a/testdata/testoutput8 b/testdata/testoutput8 index 6425f00..eb585ad 100644 --- a/testdata/testoutput8 +++ b/testdata/testoutput8 @@ -1,4 +1,4 @@ -PCRE version 6.1 21-Jun-2005 +PCRE version 6.2 01-Aug-2005 /-- Do not use the \x{} construct except with patterns that have the --/ /-- /8 option set, because PCRE doesn't recognize them as UTF-8 unless --/ diff --git a/testdata/testoutput9 b/testdata/testoutput9 index 4d2a41f..f7b4661 100644 --- a/testdata/testoutput9 +++ b/testdata/testoutput9 @@ -1,4 +1,4 @@ -PCRE version 6.1 21-Jun-2005 +PCRE version 6.2 01-Aug-2005 
/\pL\P{Nd}/8 AB |