1 files changed, 142 insertions, 91 deletions
diff --git a/ext/pcre/pcrelib/pcre_exec.c b/ext/pcre/pcrelib/pcre_exec.c
index 91b5047731..073cf2410a 100644
--- a/ext/pcre/pcrelib/pcre_exec.c
+++ b/ext/pcre/pcrelib/pcre_exec.c
@@ -320,9 +320,9 @@ typedef struct heapframe {
 
   /* Function arguments that may change */
 
-  const uschar *Xeptr;
+  USPTR Xeptr;
   const uschar *Xecode;
-  const uschar *Xmstart;
+  USPTR Xmstart;
   int Xoffset_top;
   long int Xims;
   eptrblock *Xeptrb;
@@ -331,13 +331,15 @@ typedef struct heapframe {
 
   /* Function local variables */
 
-  const uschar *Xcallpat;
-  const uschar *Xcharptr;
-  const uschar *Xdata;
-  const uschar *Xnext;
-  const uschar *Xpp;
-  const uschar *Xprev;
-  const uschar *Xsaved_eptr;
+  USPTR Xcallpat;
+#ifdef SUPPORT_UTF8
+  USPTR Xcharptr;
+#endif
+  USPTR Xdata;
+  USPTR Xnext;
+  USPTR Xpp;
+  USPTR Xprev;
+  USPTR Xsaved_eptr;
 
   recursion_info Xnew_recursive;
 
@@ -358,6 +360,7 @@ typedef struct heapframe {
   uschar Xocchars[8];
 #endif
 
+  int Xcodelink;
   int Xctype;
   unsigned int Xfc;
   int Xfi;
@@ -423,7 +426,7 @@ Returns:       MATCH_MATCH if matched            )  these values are >= 0
 */
 
 static int
-match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
+match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
   int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
   int flags, unsigned int rdepth)
 {
@@ -437,6 +440,7 @@ register unsigned int c;   /* Character values not kept over RMATCH() calls */
 register BOOL utf8;        /* Local copy of UTF-8 flag for speed */
 
 BOOL minimize, possessive; /* Quantifier options */
+int condcode;
 
 /* When recursion is not being used, all "local" variables that have to be
 preserved over calls to RMATCH() are part of a "frame" which is obtained from
@@ -479,6 +483,7 @@ HEAP_RECURSE:
 #define charptr            frame->Xcharptr
 #endif
 #define callpat            frame->Xcallpat
+#define codelink           frame->Xcodelink
 #define data               frame->Xdata
 #define next               frame->Xnext
 #define pp                 frame->Xpp
@@ -559,6 +564,7 @@ int oclength;
 uschar occhars[8];
 #endif
 
+int codelink;
 int ctype;
 int length;
 int max;
@@ -785,7 +791,39 @@ for (;;)
 
     case OP_COND:
     case OP_SCOND:
-    if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */
+    codelink= GET(ecode, 1);
+
+    /* Because of the way auto-callout works during compile, a callout item is
+    inserted between OP_COND and an assertion condition. */
+
+    if (ecode[LINK_SIZE+1] == OP_CALLOUT)
+      {
+      if (pcre_callout != NULL)
+        {
+        pcre_callout_block cb;
+        cb.version          = 1;   /* Version 1 of the callout block */
+        cb.callout_number   = ecode[LINK_SIZE+2];
+        cb.offset_vector    = md->offset_vector;
+        cb.subject          = (PCRE_SPTR)md->start_subject;
+        cb.subject_length   = md->end_subject - md->start_subject;
+        cb.start_match      = mstart - md->start_subject;
+        cb.current_position = eptr - md->start_subject;
+        cb.pattern_position = GET(ecode, LINK_SIZE + 3);
+        cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
+        cb.capture_top      = offset_top/2;
+        cb.capture_last     = md->capture_last;
+        cb.callout_data     = md->callout_data;
+        if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
+        if (rrc < 0) RRETURN(rrc);
+        }
+      ecode += _pcre_OP_lengths[OP_CALLOUT];
+      }
+
+    condcode = ecode[LINK_SIZE+1];
+
+    /* Now see what the actual condition is */
+
+    if (condcode == OP_RREF)         /* Recursion test */
       {
       offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/
       condition = md->recursive != NULL &&
@@ -793,14 +831,14 @@ for (;;)
       ecode += condition? 3 : GET(ecode, 1);
       }
 
-    else if (ecode[LINK_SIZE+1] == OP_CREF)    /* Group used test */
+    else if (condcode == OP_CREF)    /* Group used test */
       {
       offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
       condition = offset < offset_top && md->offset_vector[offset] >= 0;
       ecode += condition? 3 : GET(ecode, 1);
       }
 
-    else if (ecode[LINK_SIZE+1] == OP_DEF)     /* DEFINE - always false */
+    else if (condcode == OP_DEF)     /* DEFINE - always false */
       {
       condition = FALSE;
       ecode += GET(ecode, 1);
@@ -827,7 +865,7 @@ for (;;)
       else
         {
         condition = FALSE;
-        ecode += GET(ecode, 1);
+        ecode += codelink;
         }
       }
 
@@ -850,7 +888,7 @@ for (;;)
         goto TAIL_RECURSE;
         }
       }
-    else                         /* Condition false & no 2nd alternative */
+    else                         /* Condition false & no alternative */
       {
       ecode += 1 + LINK_SIZE;
       }
@@ -1073,6 +1111,8 @@ for (;;)
         else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
           {
           DPRINTF(("Recursion gave error %d\n", rrc));
+          if (new_recursive.offset_save != stacksave)
+            (pcre_free)(new_recursive.offset_save);
           RRETURN(rrc);
           }
 
@@ -1419,7 +1459,7 @@ for (;;)
         {
         if (eptr == md->start_subject) prev_is_word = FALSE; else
           {
-          const uschar *lastptr = eptr - 1;
+          USPTR lastptr = eptr - 1;
           while((*lastptr & 0xc0) == 0x80) lastptr--;
           GETCHAR(c, lastptr);
           prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
@@ -1677,7 +1717,7 @@ for (;;)
     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
     GETCHARINCTEST(c, eptr);
       {
-      const ucd_record * prop = GET_UCD(c);
+      const ucd_record *prop = GET_UCD(c);
 
       switch(ecode[1])
         {
@@ -2045,7 +2085,8 @@ for (;;)
 
 
     /* Match an extended character class. This opcode is encountered only
-    in UTF-8 mode, because that's the only time it is compiled. */
+    when UTF-8 mode mode is supported. Nevertheless, we may not be in UTF-8
+    mode, because Unicode properties are supported in non-UTF-8 mode. */
 
 #ifdef SUPPORT_UTF8
     case OP_XCLASS:
@@ -2087,7 +2128,7 @@ for (;;)
       for (i = 1; i <= min; i++)
         {
         if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
-        GETCHARINC(c, eptr);
+        GETCHARINCTEST(c, eptr);
         if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
         }
 
@@ -2106,7 +2147,7 @@ for (;;)
           RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
-          GETCHARINC(c, eptr);
+          GETCHARINCTEST(c, eptr);
           if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
           }
         /* Control never gets here */
@@ -2121,7 +2162,7 @@ for (;;)
           {
           int len = 1;
           if (eptr >= md->end_subject) break;
-          GETCHARLEN(c, eptr, len);
+          GETCHARLENTEST(c, eptr, len);
           if (!_pcre_xclass(c, data)) break;
           eptr += len;
           }
@@ -4531,10 +4572,10 @@ switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
         (pcre_uint32)options) & PCRE_NEWLINE_BITS)
   {
   case 0: newline = NEWLINE; break;   /* Compile-time default */
-  case PCRE_NEWLINE_CR: newline = '\r'; break;
-  case PCRE_NEWLINE_LF: newline = '\n'; break;
+  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
+  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
   case PCRE_NEWLINE_CR+
-       PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
+       PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
   case PCRE_NEWLINE_ANY: newline = -1; break;
   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
   default: return PCRE_ERROR_BADNEWLINE;
@@ -4576,11 +4617,11 @@ back the character offset. */
 #ifdef SUPPORT_UTF8
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
   {
-  if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
+  if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
     return PCRE_ERROR_BADUTF8;
   if (start_offset > 0 && start_offset < length)
     {
-    int tb = ((uschar *)subject)[start_offset];
+    int tb = ((USPTR)subject)[start_offset];
     if (tb > 127)
       {
       tb &= 0xc0;
@@ -4686,11 +4727,11 @@ for(;;)
     while (iptr < iend) *iptr++ = -1;
     }
 
-  /* Advance to a unique first char if possible. If firstline is TRUE, the
-  start of the match is constrained to the first line of a multiline string.
-  That is, the match must be before or at the first newline. Implement this by
-  temporarily adjusting end_subject so that we stop scanning at a newline. If
-  the match fails at the newline, later code breaks this loop. */
+  /* If firstline is TRUE, the start of the match is constrained to the first
+  line of a multiline string. That is, the match must be before or at the first
+  newline. Implement this by temporarily adjusting end_subject so that we stop
+  scanning at a newline. If the match fails at the newline, later code breaks
+  this loop. */
 
   if (firstline)
     {
@@ -4710,62 +4751,70 @@ for(;;)
     end_subject = t;
     }
 
-  /* Now advance to a unique first byte if there is one. */
+  /* There are some optimizations that avoid running the match if a known
+  starting point is not found, or if a known later character is not present.
+  However, there is an option that disables these, for testing and for ensuring
+  that all callouts do actually occur. */
 
-  if (first_byte >= 0)
+  if ((options & PCRE_NO_START_OPTIMIZE) == 0)
     {
-    if (first_byte_caseless)
-      while (start_match < end_subject && md->lcc[*start_match] != first_byte)
-        start_match++;
-    else
-      while (start_match < end_subject && *start_match != first_byte)
-        start_match++;
-    }
+    /* Advance to a unique first byte if there is one. */
 
-  /* Or to just after a linebreak for a multiline match */
+    if (first_byte >= 0)
+      {
+      if (first_byte_caseless)
+        while (start_match < end_subject && md->lcc[*start_match] != first_byte)
+          start_match++;
+      else
+        while (start_match < end_subject && *start_match != first_byte)
+          start_match++;
+      }
 
-  else if (startline)
-    {
-    if (start_match > md->start_subject + start_offset)
+    /* Or to just after a linebreak for a multiline match */
+
+    else if (startline)
       {
-#ifdef SUPPORT_UTF8
-      if (utf8)
+      if (start_match > md->start_subject + start_offset)
         {
-        while (start_match < end_subject && !WAS_NEWLINE(start_match))
+#ifdef SUPPORT_UTF8
+        if (utf8)
           {
-          start_match++;
-          while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+            {
             start_match++;
+            while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+              start_match++;
+            }
           }
-        }
-      else
+        else
 #endif
-      while (start_match < end_subject && !WAS_NEWLINE(start_match))
-        start_match++;
-
-      /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
-      and we are now at a LF, advance the match position by one more character.
-      */
-
-      if (start_match[-1] == '\r' &&
-           (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
-           start_match < end_subject &&
-           *start_match == '\n')
-        start_match++;
+        while (start_match < end_subject && !WAS_NEWLINE(start_match))
+          start_match++;
+
+        /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
+        and we are now at a LF, advance the match position by one more character.
+        */
+
+        if (start_match[-1] == CHAR_CR &&
+             (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
+             start_match < end_subject &&
+             *start_match == CHAR_NL)
+          start_match++;
+        }
       }
-    }
 
-  /* Or to a non-unique first byte after study */
+    /* Or to a non-unique first byte after study */
 
-  else if (start_bits != NULL)
-    {
-    while (start_match < end_subject)
+    else if (start_bits != NULL)
       {
-      register unsigned int c = *start_match;
-      if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
-        else break;
+      while (start_match < end_subject)
+        {
+        register unsigned int c = *start_match;
+        if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
+          else break;
+        }
       }
-    }
+    }   /* Starting optimizations */
 
   /* Restore fudged end_subject */
 
@@ -4777,23 +4826,25 @@ for(;;)
   printf("\n");
 #endif
 
-  /* If req_byte is set, we know that that character must appear in the subject
-  for the match to succeed. If the first character is set, req_byte must be
-  later in the subject; otherwise the test starts at the match point. This
-  optimization can save a huge amount of backtracking in patterns with nested
-  unlimited repeats that aren't going to match. Writing separate code for
-  cased/caseless versions makes it go faster, as does using an autoincrement
-  and backing off on a match.
-
-  HOWEVER: when the subject string is very, very long, searching to its end can
-  take a long time, and give bad performance on quite ordinary patterns. This
-  showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
-  string... so we don't do this when the string is sufficiently long.
-
-  ALSO: this processing is disabled when partial matching is requested.
-  */
-
-  if (req_byte >= 0 &&
+  /* If req_byte is set, we know that that character must appear in the
+  subject for the match to succeed. If the first character is set, req_byte
+  must be later in the subject; otherwise the test starts at the match point.
+  This optimization can save a huge amount of backtracking in patterns with
+  nested unlimited repeats that aren't going to match. Writing separate code
+  for cased/caseless versions makes it go faster, as does using an
+  autoincrement and backing off on a match.
+
+  HOWEVER: when the subject string is very, very long, searching to its end
+  can take a long time, and give bad performance on quite ordinary patterns.
+  This showed up when somebody was matching something like /^\d+C/ on a
+  32-megabyte string... so we don't do this when the string is sufficiently
+  long.
+
+  ALSO: this processing is disabled when partial matching is requested, or if
+  disabling is explicitly requested. */
+
+  if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
+      req_byte >= 0 &&
       end_subject - start_match < REQ_BYTE_MAX &&
       !md->partial)
     {
@@ -4901,9 +4952,9 @@ for(;;)
   not contain any explicit matches for \r or \n, and the newline option is CRLF
   or ANY or ANYCRLF, advance the match position by one more character. */
 
-  if (start_match[-1] == '\r' &&
+  if (start_match[-1] == CHAR_CR &&
       start_match < end_subject &&
-      *start_match == '\n' &&
+      *start_match == CHAR_NL &&
       (re->flags & PCRE_HASCRORLF) == 0 &&
         (md->nltype == NLTYPE_ANY ||
          md->nltype == NLTYPE_ANYCRLF ||