Fix optimization bugs for patterns starting with lookaheads.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1669 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2016-10-18 15:10:09 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2016-10-18 15:10:09 +0000
commit: d6d45d15235fc6f010cfb193db8fb672a152e41c (patch)
tree: 160fa6cbfeb5360310a9818fde013ecdf00ad40e
parent: bad0d0347aa4ab37d2b3e906193725d68a5d98bf (diff)
download: pcre-d6d45d15235fc6f010cfb193db8fb672a152e41c.tar.gz
8 files changed, 80 insertions, 27 deletions
diff --git a/ChangeLog b/ChangeLog
index d95d122..5ff5196 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -25,18 +25,31 @@ Version 8.40 xx-xxxx-2016
     is in the class. There was a bug that caused this not to happen if a
     Unicode property item was added to such a class, for example [\D\P{Nd}] or
     [\W\pL].
-    
+
 7.  When pcretest was outputing information from a callout, the caret indicator
     for the current position in the subject line was incorrect if it was after
     an escape sequence for a character whose code point was greater than
     \x{ff}.
-    
+
 8.  A pattern such as (?<RA>abc)(?(R)xyz) was incorrectly compiled such that
     the conditional was interpreted as a reference to capturing group 1 instead
     of a test for recursion. Any group whose name began with R was
-    misinterpreted in this way. (The reference interpretation should only 
+    misinterpreted in this way. (The reference interpretation should only
     happen if the group's name is precisely "R".)
 
+9.  A number of bugs have been mended relating to match start-up optimizations
+    when the first thing in a pattern is a positive lookahead. These all
+    applied only when PCRE_NO_START_OPTIMIZE was *not* set:
+
+    (a) A pattern such as (?=.*X)X$ was incorrectly optimized as if it needed
+        both an initial 'X' and a following 'X'.
+    (b) Some patterns starting with an assertion that started with .* were
+        incorrectly optimized as having to match at the start of the subject or
+        after a newline. There are cases where this is not true, for example,
+        (?=.*[A-Z])(?=.{8,16})(?!.*[\s]) matches after the start in lines that
+        start with spaces. A leading .* in an assertion is no longer taken as
+        an indication of matching at the start (or after a newline).
+
 
 Version 8.39 14-June-2016
 -------------------------
diff --git a/pcre_compile.c b/pcre_compile.c
index 67c74e8..de92313 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -7918,15 +7918,17 @@ for (;; ptr++)
         }
       }
 
-    /* For a forward assertion, we take the reqchar, if set. This can be
-    helpful if the pattern that follows the assertion doesn't set a different
-    char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
-    for an assertion, however because it leads to incorrect effect for patterns
-    such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
-    of a firstchar. This is overcome by a scan at the end if there's no
-    firstchar, looking for an asserted first char. */
-
-    else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
+    /* For a forward assertion, we take the reqchar, if set, provided that the
+    group has also set a first char. This can be helpful if the pattern that
+    follows the assertion doesn't set a different char. For example, it's
+    useful for /(?=abcde).+/. We can't set firstchar for an assertion, however,
+    because it leads to incorrect results for patterns such as /(?=a)a.+/ when
+
+    the "real" "a" would then become a reqchar instead of a firstchar. This is
+    overcome by a scan at the end if there's no firstchar, looking for an
+    asserted first char. */
+
+    else if (bravalue == OP_ASSERT && subreqcharflags >= 0 &&
+             subfirstcharflags >= 0)
       {
       reqchar = subreqchar;
       reqcharflags = subreqcharflags;
@@ -8715,8 +8717,8 @@ matching and for non-DOTALL patterns that start with .* (which must start at
 the beginning or after \n). As in the case of is_anchored() (see above), we
 have to take account of back references to capturing brackets that contain .*
 because in that case we can't make the assumption. Also, the appearance of .*
-inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
-count, because once again the assumption no longer holds.
+inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
+or *SKIP does not count, because once again the assumption no longer holds.
 
 Arguments:
   code           points to start of expression (the bracket)
@@ -8725,13 +8727,14 @@ Arguments:
                   the less precise approach
   cd             points to the compile data
   atomcount      atomic group level
+  inassert       TRUE if in an assertion
 
 Returns:         TRUE or FALSE
 */
 
 static BOOL
 is_startline(const pcre_uchar *code, unsigned int bracket_map,
-  compile_data *cd, int atomcount)
+  compile_data *cd, int atomcount, BOOL inassert)
 {
 do {
    const pcre_uchar *scode = first_significant_code(
@@ -8758,7 +8761,7 @@ do {
        return FALSE;
 
        default:     /* Assertion */
-       if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
+       if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
        do scode += GET(scode, 1); while (*scode == OP_ALT);
        scode += 1 + LINK_SIZE;
        break;
@@ -8772,7 +8775,7 @@ do {
    if (op == OP_BRA  || op == OP_BRAPOS ||
        op == OP_SBRA || op == OP_SBRAPOS)
      {
-     if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
+     if (!is_startline(scode, bracket_map, cd, atomcount, inassert)) return FALSE;
      }
 
    /* Capturing brackets */
@@ -8782,33 +8785,33 @@ do {
      {
      int n = GET2(scode, 1+LINK_SIZE);
      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
-     if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
+     if (!is_startline(scode, new_map, cd, atomcount, inassert)) return FALSE;
      }
 
    /* Positive forward assertions */
 
    else if (op == OP_ASSERT)
      {
-     if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
+     if (!is_startline(scode, bracket_map, cd, atomcount, TRUE)) return FALSE;
      }
 
    /* Atomic brackets */
 
    else if (op == OP_ONCE || op == OP_ONCE_NC)
      {
-     if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
+     if (!is_startline(scode, bracket_map, cd, atomcount + 1, inassert)) return FALSE;
      }
 
    /* .* means "start at start or after \n" if it isn't in atomic brackets or
-   brackets that may be referenced, as long as the pattern does not contain
-   *PRUNE or *SKIP, because these break the feature. Consider, for example,
-   /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
-   start of a line. */
+   brackets that may be referenced or an assertion, as long as the pattern does
+   not contain *PRUNE or *SKIP, because these break the feature. Consider, for
+   example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e.
+   not at the start of a line. */
 
    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
      {
      if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
-         atomcount > 0 || cd->had_pruneorskip)
+         atomcount > 0 || cd->had_pruneorskip || inassert)
        return FALSE;
      }
 
@@ -9663,7 +9666,7 @@ if ((re->options & PCRE_ANCHORED) == 0)
       re->flags |= PCRE_FIRSTSET;
       }
 
-    else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
+    else if (is_startline(codestart, 0, cd, 0, FALSE)) re->flags |= PCRE_STARTLINE;
     }
   }
 
diff --git a/testdata/testinput1 b/testdata/testinput1
index 8379ce0..93abab3 100644
--- a/testdata/testinput1
+++ b/testdata/testinput1
@@ -5733,4 +5733,10 @@ AbcdCBefgBhiBqz
 "(?|(\k'Pm')|(?'Pm'))"
     abcd
 
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
+    \  Fred:099
+
+/(?=.*X)X$/ 
+    \  X
+     
 /-- End of testinput1 --/
diff --git a/testdata/testinput2 b/testdata/testinput2
index 38346ef..08c6f39 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -4247,4 +4247,6 @@ backtracking verbs. --/
 
 /(?<R>abc)(?(R)xyz)/BZ
 
+/(?=.*[A-Z])/I
+
 /-- End of testinput2 --/
diff --git a/testdata/testinput6 b/testdata/testinput6
index a178d3d..22ed1e6 100644
--- a/testdata/testinput6
+++ b/testdata/testinput6
@@ -1562,4 +1562,10 @@
     \x{389}
     \x{20ac}
 
+/(?=.*b)\pL/
+    11bb
+    
+/(?(?=.*b)(?=.*b)\pL|.*c)/
+    11bb
+
 /-- End of testinput6 --/
diff --git a/testdata/testoutput1 b/testdata/testoutput1
index e852ab9..a2b3cff 100644
--- a/testdata/testoutput1
+++ b/testdata/testoutput1
@@ -9434,4 +9434,12 @@ No match
  0: 
  1: 
 
+/(?=.*[A-Z])(?=.*[a-z])(?=.*[0-9])(?=.*[,;:])(?=.{8,16})(?!.*[\s])/
+    \  Fred:099
+ 0: 
+
+/(?=.*X)X$/ 
+    \  X
+ 0: X
+     
 /-- End of testinput1 --/
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index 216bff8..811bbef 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -9380,7 +9380,7 @@ No need char
 /(?(?=.*b).*b|^d)/I
 Capturing subpattern count = 0
 No options
-First char at start or follows newline
+No first char
 No need char
 
 /xyz/C
@@ -14698,4 +14698,11 @@ Failed: assertion expected after (?( or (?(?C) at offset 4
         End
 ------------------------------------------------------------------
 
+/(?=.*[A-Z])/I
+Capturing subpattern count = 0
+May match empty string
+No options
+No first char
+No need char
+
 /-- End of testinput2 --/
diff --git a/testdata/testoutput6 b/testdata/testoutput6
index b64dc0d..422d383 100644
--- a/testdata/testoutput6
+++ b/testdata/testoutput6
@@ -2573,4 +2573,12 @@ No match
     \x{20ac}
 No match
 
+/(?=.*b)\pL/
+    11bb
+ 0: b
+    
+/(?(?=.*b)(?=.*b)\pL|.*c)/
+    11bb
+ 0: b
+
 /-- End of testinput6 --/
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2016-10-18 15:10:09 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2016-10-18 15:10:09 +0000
commit	d6d45d15235fc6f010cfb193db8fb672a152e41c (patch)
tree	160fa6cbfeb5360310a9818fde013ecdf00ad40e
parent	bad0d0347aa4ab37d2b3e906193725d68a5d98bf (diff)
download	pcre-d6d45d15235fc6f010cfb193db8fb672a152e41c.tar.gz