diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2016-10-18 15:10:09 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2016-10-18 15:10:09 +0000 |
commit | d6d45d15235fc6f010cfb193db8fb672a152e41c (patch) | |
tree | 160fa6cbfeb5360310a9818fde013ecdf00ad40e | |
parent | bad0d0347aa4ab37d2b3e906193725d68a5d98bf (diff) | |
download | pcre-d6d45d15235fc6f010cfb193db8fb672a152e41c.tar.gz |
Fix optimization bugs for patterns starting with lookaheads.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1669 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 19 | ||||
-rw-r--r-- | pcre_compile.c | 49 | ||||
-rw-r--r-- | testdata/testinput1 | 6 | ||||
-rw-r--r-- | testdata/testinput2 | 2 | ||||
-rw-r--r-- | testdata/testinput6 | 6 | ||||
-rw-r--r-- | testdata/testoutput1 | 8 | ||||
-rw-r--r-- | testdata/testoutput2 | 9 | ||||
-rw-r--r-- | testdata/testoutput6 | 8 |
8 files changed, 80 insertions, 27 deletions
@@ -25,18 +25,31 @@ Version 8.40 xx-xxxx-2016 is in the class. There was a bug that caused this not to happen if a Unicode property item was added to such a class, for example [\D\P{Nd}] or [\W\pL]. - + 7. When pcretest was outputting information from a callout, the caret indicator for the current position in the subject line was incorrect if it was after an escape sequence for a character whose code point was greater than \x{ff}. - + 8. A pattern such as (?<RA>abc)(?(R)xyz) was incorrectly compiled such that the conditional was interpreted as a reference to capturing group 1 instead of a test for recursion. Any group whose name began with R was - misinterpreted in this way. (The reference interpretation should only + misinterpreted in this way. (The reference interpretation should only happen if the group's name is precisely "R".) +9. A number of bugs have been mended relating to match start-up optimizations + when the first thing in a pattern is a positive lookahead. These all + applied only when PCRE_NO_START_OPTIMIZE was *not* set: + + (a) A pattern such as (?=.*X)X$ was incorrectly optimized as if it needed + both an initial 'X' and a following 'X'. + (b) Some patterns starting with an assertion that started with .* were + incorrectly optimized as having to match at the start of the subject or + after a newline. There are cases where this is not true, for example, + (?=.*[A-Z])(?=.{8,16})(?!.*[\s]) matches after the start in lines that + start with spaces. Starting .* in an assertion is no longer taken as an + indication of matching at the start (or after a newline). + Version 8.39 14-June-2016 ------------------------- diff --git a/pcre_compile.c b/pcre_compile.c index 67c74e8..de92313 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -7918,15 +7918,17 @@ for (;; ptr++) } } - /* For a forward assertion, we take the reqchar, if set. This can be - helpful if the pattern that follows the assertion doesn't set a different - char. 
For example, it's useful for /(?=abcde).+/. We can't set firstchar - for an assertion, however because it leads to incorrect effect for patterns - such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead - of a firstchar. This is overcome by a scan at the end if there's no - firstchar, looking for an asserted first char. */ - - else if (bravalue == OP_ASSERT && subreqcharflags >= 0) + /* For a forward assertion, we take the reqchar, if set, provided that the + group has also set a first char. This can be helpful if the pattern that + follows the assertion doesn't set a different char. For example, it's + useful for /(?=abcde).+/. We can't set firstchar for an assertion, however + because it leads to incorrect effect for patterns such as /(?=a)a.+/ when + the "real" "a" would then become a reqchar instead of a firstchar. This is + overcome by a scan at the end if there's no firstchar, looking for an + asserted first char. */ + + else if (bravalue == OP_ASSERT && subreqcharflags >= 0 && + subfirstcharflags >= 0) { reqchar = subreqchar; reqcharflags = subreqcharflags; @@ -8715,8 +8717,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at the beginning or after \n). As in the case of is_anchored() (see above), we have to take account of back references to capturing brackets that contain .* because in that case we can't make the assumption. Also, the appearance of .* -inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not -count, because once again the assumption no longer holds. +inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE +or *SKIP does not count, because once again the assumption no longer holds. 
Arguments: code points to start of expression (the bracket) @@ -8725,13 +8727,14 @@ Arguments: the less precise approach cd points to the compile data atomcount atomic group level + inassert TRUE if in an assertion Returns: TRUE or FALSE */ static BOOL is_startline(const pcre_uchar *code, unsigned int bracket_map, - compile_data *cd, int atomcount) + compile_data *cd, int atomcount, BOOL inassert) { do { const pcre_uchar *scode = first_significant_code( @@ -8758,7 +8761,7 @@ do { return FALSE; default: /* Assertion */ - if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE; + if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE; do scode += GET(scode, 1); while (*scode == OP_ALT); scode += 1 + LINK_SIZE; break; @@ -8772,7 +8775,7 @@ do { if (op == OP_BRA || op == OP_BRAPOS || op == OP_SBRA || op == OP_SBRAPOS) { - if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE; + if (!is_startline(scode, bracket_map, cd, atomcount, inassert)) return FALSE; } /* Capturing brackets */ @@ -8782,33 +8785,33 @@ do { { int n = GET2(scode, 1+LINK_SIZE); int new_map = bracket_map | ((n < 32)? (1 << n) : 1); - if (!is_startline(scode, new_map, cd, atomcount)) return FALSE; + if (!is_startline(scode, new_map, cd, atomcount, inassert)) return FALSE; } /* Positive forward assertions */ else if (op == OP_ASSERT) { - if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE; + if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE; } /* Atomic brackets */ else if (op == OP_ONCE || op == OP_ONCE_NC) { - if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE; + if (!is_startline(scode, bracket_map, cd, atomcount + 1, inassert)) return FALSE; } /* .* means "start at start or after \n" if it isn't in atomic brackets or - brackets that may be referenced, as long as the pattern does not contain - *PRUNE or *SKIP, because these break the feature. 
Consider, for example, - /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the - start of a line. */ + brackets that may be referenced or an assertion, as long as the pattern does + not contain *PRUNE or *SKIP, because these break the feature. Consider, for + example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. + not at the start of a line. */ else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) { if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 || - atomcount > 0 || cd->had_pruneorskip) + atomcount > 0 || cd->had_pruneorskip || inassert) return FALSE; } @@ -9663,7 +9666,7 @@ if ((re->options & PCRE_ANCHORED) == 0) re->flags |= PCRE_FIRSTSET; } - else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE; + else if (is_startline(codestart, 0, cd, 0, FALSE)) re->flags |= PCRE_STARTLINE; } } diff --git a/testdata/testinput1 b/testdata/testinput1 index 8379ce0..93abab3 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -5733,4 +5733,10 @@ AbcdCBefgBhiBqz "(?|(\k'Pm')|(?'Pm'))" abcd +/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/ + \ Fred:099 + +/(?=.*X)X$/ + \ X + /-- End of testinput1 --/ diff --git a/testdata/testinput2 b/testdata/testinput2 index 38346ef..08c6f39 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4247,4 +4247,6 @@ backtracking verbs. 
--/ /(?<R>abc)(?(R)xyz)/BZ +/(?=.*[A-Z])/I + /-- End of testinput2 --/ diff --git a/testdata/testinput6 b/testdata/testinput6 index a178d3d..22ed1e6 100644 --- a/testdata/testinput6 +++ b/testdata/testinput6 @@ -1562,4 +1562,10 @@ \x{389} \x{20ac} +/(?=.*b)\pL/ + 11bb + +/(?(?=.*b)(?=.*b)\pL|.*c)/ + 11bb + /-- End of testinput6 --/ diff --git a/testdata/testoutput1 b/testdata/testoutput1 index e852ab9..a2b3cff 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -9434,4 +9434,12 @@ No match 0: 1: +/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/ + \ Fred:099 + 0: + +/(?=.*X)X$/ + \ X + 0: X + /-- End of testinput1 --/ diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 216bff8..811bbef 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -9380,7 +9380,7 @@ No need char /(?(?=.*b).*b|^d)/I Capturing subpattern count = 0 No options -First char at start or follows newline +No first char No need char /xyz/C @@ -14698,4 +14698,11 @@ Failed: assertion expected after (?( or (?(?C) at offset 4 End ------------------------------------------------------------------ +/(?=.*[A-Z])/I +Capturing subpattern count = 0 +May match empty string +No options +No first char +No need char + /-- End of testinput2 --/ diff --git a/testdata/testoutput6 b/testdata/testoutput6 index b64dc0d..422d383 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -2573,4 +2573,12 @@ No match \x{20ac} No match +/(?=.*b)\pL/ + 11bb + 0: b + +/(?(?=.*b)(?=.*b)\pL|.*c)/ + 11bb + 0: b + /-- End of testinput6 --/ |