1 files changed, 351 insertions, 28 deletions
diff --git a/ext/pcre/pcrelib/pcre.c b/ext/pcre/pcrelib/pcre.c
index e3fdde9114..5149f8dad7 100644
--- a/ext/pcre/pcrelib/pcre.c
+++ b/ext/pcre/pcrelib/pcre.c
@@ -66,6 +66,16 @@ not be set greater than 200. */
 #define BRASTACK_SIZE 200
 
 
+/* The number of bytes in a literal character string above which we can't add
+any more is different when UTF-8 characters may be encountered. */
+
+#ifdef SUPPORT_UTF8
+#define MAXLIT 250
+#else
+#define MAXLIT 255
+#endif
+
+
 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
 
 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
@@ -176,6 +186,64 @@ void  (*pcre_free)(void *) = free;
 
 
 
+/*************************************************
+*    Macros and tables for character handling    *
+*************************************************/
+
+/* When UTF-8 encoding is being used, a character is no longer just a single
+byte. The macros for character handling generate simple sequences when used in
+byte-mode, and more complicated ones for UTF-8 characters. */
+
+#ifndef SUPPORT_UTF8
+#define GETCHARINC(c, eptr) c = *eptr++;
+#define GETCHARLEN(c, eptr, len) c = *eptr;
+#define BACKCHAR(eptr)
+
+#else   /* SUPPORT_UTF8 */
+
+/* Get the next UTF-8 character, advancing the pointer */
+
+#define GETCHARINC(c, eptr) \
+  c = *eptr++; \
+  if (md->utf8 && (c & 0xc0) == 0xc0) \
+    { \
+    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
+    int s = 6 - a;                  /* Amount to shift next byte */  \
+    c &= utf8_table3[a];            /* Low order bits from first byte */ \
+    while (a-- > 0) \
+      { \
+      c |= (*eptr++ & 0x3f) << s; \
+      s += 6; \
+      } \
+    }
+
+/* Get the next UTF-8 character, not advancing the pointer, setting length */
+
+#define GETCHARLEN(c, eptr, len) \
+  c = *eptr; \
+  len = 1; \
+  if (md->utf8 && (c & 0xc0) == 0xc0) \
+    { \
+    int i; \
+    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
+    int s = 6 - a;                  /* Amount to shift next byte */  \
+    c &= utf8_table3[a];            /* Low order bits from first byte */ \
+    for (i = 1; i <= a; i++) \
+      { \
+      c |= (eptr[i] & 0x3f) << s; \
+      s += 6; \
+      } \
+    len += a; \
+    }
+
+/* If the pointer is not at the start of a character, move it back until
+it is. */
+
+#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
+
+#endif
+
+
 
 /*************************************************
 *             Default character tables           *
@@ -191,6 +259,66 @@ tables. */
 
 
 
+#ifdef SUPPORT_UTF8
+/*************************************************
+*           Tables for UTF-8 support             *
+*************************************************/
+
+/* These are the breakpoints for different numbers of bytes in a UTF-8
+character. */
+
+static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
+
+/* These are the indicator bits and the mask for the data bits to set in the
+first byte of a character, indexed by the number of additional bytes. */
+
+static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
+
+/* Table of the number of extra characters, indexed by the first character
+masked with 0x3f. The highest number for a valid UTF-8 character is in fact
+0x3d. */
+
+static uschar utf8_table4[] = {
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
+
+
+/*************************************************
+*       Convert character value to UTF-8         *
+*************************************************/
+
+/* This function takes an integer value in the range 0 - 0x7fffffff
+and encodes it as a UTF-8 character in 0 to 6 bytes.
+
+Arguments:
+  cvalue     the character value
+  buffer     pointer to buffer for result - at least 6 bytes long
+
+Returns:     number of characters placed in the buffer
+*/
+
+static int
+ord2utf8(int cvalue, uschar *buffer)
+{
+register int i, j;
+for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
+  if (cvalue <= utf8_table1[i]) break;
+*buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
+cvalue >>= 6 - i;
+for (j = 0; j < i; j++)
+  {
+  *buffer++ = 0x80 | (cvalue & 0x3f);
+  cvalue >>= 6;
+  }
+return i + 1;
+}
+#endif
+
+
+
 /*************************************************
 *          Return version string                 *
 *************************************************/
@@ -349,9 +477,9 @@ while (length-- > 0)
 
 /* This function is called when a \ has been encountered. It either returns a
 positive value for a simple escape such as \n, or a negative value which
-encodes one of the more complicated things such as \d. On entry, ptr is
-pointing at the \. On exit, it is on the final character of the escape
-sequence.
+encodes one of the more complicated things such as \d. When UTF-8 is enabled,
+a positive value greater than 255 may be returned. On entry, ptr is pointing at
+the \. On exit, it is on the final character of the escape sequence.
 
 Arguments:
   ptrptr     points to the pattern position pointer
@@ -373,7 +501,9 @@ check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
 const uschar *ptr = *ptrptr;
 int c, i;
 
-c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
+/* If backslash is at the end of the pattern, it's an error. */
+
+c = *(++ptr);
 if (c == 0) *errorptr = ERR1;
 
 /* Digits or letters may have special meaning; all others are literals. */
@@ -433,18 +563,46 @@ else
       }
 
     /* \0 always starts an octal number, but we may drop through to here with a
-    larger first octal digit */
+    larger first octal digit. */
 
     case '0':
     c -= '0';
     while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
       ptr[1] != '8' && ptr[1] != '9')
         c = c * 8 + *(++ptr) - '0';
+    c &= 255;     /* Take least significant 8 bits */
     break;
 
-    /* Special escapes not starting with a digit are straightforward */
+    /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
+    which can be greater than 0xff, but only if the ddd are hex digits. */
 
     case 'x':
+#ifdef SUPPORT_UTF8
+    if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
+      {
+      const uschar *pt = ptr + 2;
+      register int count = 0;
+      c = 0;
+      while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
+        {
+        count++;
+        c = c * 16 + cd->lcc[*pt] -
+          (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
+        pt++;
+        }
+      if (*pt == '}')
+        {
+        if (c < 0 || count > 8) *errorptr = ERR34;
+        ptr = pt;
+        break;
+        }
+      /* If the sequence of hex digits does not end with '}', then we don't
+      recognize this construct; fall through to the normal \x handling. */
+      }
+#endif
+
+    /* Read just a single hex char */
+
     c = 0;
     while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
       {
@@ -454,6 +612,8 @@ else
       }
     break;
 
+    /* Other special escapes not starting with a digit are straightforward */
+
     case 'c':
     c = *(++ptr);
     if (c == 0)
@@ -591,12 +751,13 @@ if the length is fixed. This is needed for dealing with backward assertions.
 
 Arguments:
   code     points to the start of the pattern (the bracket)
+  options  the compiling options
 
 Returns:   the fixed length, or -1 if there is no fixed length
 */
 
 static int
-find_fixedlength(uschar *code)
+find_fixedlength(uschar *code, int options)
 {
 int length = -1;
 
@@ -617,7 +778,7 @@ for (;;)
     case OP_BRA:
     case OP_ONCE:
     case OP_COND:
-    d = find_fixedlength(cc);
+    d = find_fixedlength(cc, options);
     if (d < 0) return -1;
     branchlength += d;
     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
@@ -671,10 +832,17 @@ for (;;)
     cc++;
     break;
 
-    /* Handle char strings */
+    /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
+    This requires a scan of the string, unfortunately. We assume valid UTF-8
+    strings, so all we do is reduce the length by one for byte whose bits are
+    10xxxxxx. */
 
     case OP_CHARS:
     branchlength += *(++cc);
+#ifdef SUPPORT_UTF8
+    for (d = 1; d <= *cc; d++)
+      if ((cc[d] & 0xc0) == 0x80) branchlength--;
+#endif
     cc += *cc + 1;
     break;
 
@@ -1054,7 +1222,17 @@ for (;; ptr++)
             goto FAILED;
             }
           }
-        /* Fall through if single character */
+
+        /* Fall through if single character, but don't at present allow
+        chars > 255 in UTF-8 mode. */
+
+#ifdef SUPPORT_UTF8
+        if (c > 255)
+          {
+          *errorptr = ERR33;
+          goto FAILED;
+          }
+#endif
         }
 
       /* A single character may be followed by '-' to form a range. However,
@@ -1074,17 +1252,29 @@ for (;; ptr++)
           }
 
         /* The second part of a range can be a single-character escape, but
-        not any of the other escapes. */
+        not any of the other escapes. Perl 5.6 treats a hyphen as a literal
+        in such circumstances. */
 
         if (d == '\\')
           {
+          const uschar *oldptr = ptr;
           d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
+
+#ifdef SUPPORT_UTF8
+          if (d > 255)
+            {
+            *errorptr = ERR33;
+            goto FAILED;
+            }
+#endif
+          /* \b is backslash; any other special means the '-' was literal */
+
           if (d < 0)
             {
             if (d == -ESC_b) d = '\b'; else
               {
-              *errorptr = ERR7;
-              goto FAILED;
+              ptr = oldptr - 2;
+              goto SINGLE_CHARACTER;  /* A few lines below */
               }
             }
           }
@@ -1112,6 +1302,8 @@ for (;; ptr++)
       /* Handle a lone single character - we can get here for a normal
       non-escape char, or after \ that introduces a single character. */
 
+      SINGLE_CHARACTER:
+
       class [c/8] |= (1 << (c&7));
       if ((options & PCRE_CASELESS) != 0)
         {
@@ -1562,6 +1754,11 @@ for (;; ptr++)
           {
           condref = *ptr - '0';
           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
+          if (condref == 0)
+            {
+            *errorptr = ERR35;
+            goto FAILED;
+            }
           ptr++;
           }
         else ptr--;
@@ -1829,6 +2026,20 @@ for (;; ptr++)
         tempptr = ptr;
         c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
         if (c < 0) { ptr = tempptr; break; }
+
+        /* If a character is > 127 in UTF-8 mode, we have to turn it into
+        two or more characters in the UTF-8 encoding. */
+
+#ifdef SUPPORT_UTF8
+        if (c > 127 && (options & PCRE_UTF8) != 0)
+          {
+          uschar buffer[8];
+          int len = ord2utf8(c, buffer);
+          for (c = 0; c < len; c++) *code++ = buffer[c];
+          length += len;
+          continue;
+          }
+#endif
         }
 
       /* Ordinary character or single-char escape */
@@ -1839,7 +2050,7 @@ for (;; ptr++)
 
     /* This "while" is the end of the "do" above. */
 
-    while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
+    while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
 
     /* Update the last character and the count of literals */
 
@@ -1851,7 +2062,7 @@ for (;; ptr++)
     the next state. */
 
     previous[1] = length;
-    if (length < 255) ptr--;
+    if (length < MAXLIT) ptr--;
     break;
     }
   }                   /* end of big loop */
@@ -1889,7 +2100,7 @@ Argument:
   ptrptr      -> the address of the current pattern pointer
   errorptr    -> pointer to error message
   lookbehind  TRUE if this is a lookbehind assertion
-  condref     > 0 for OPT_CREF setting at start of conditional group
+  condref     >= 0 for OPT_CREF setting at start of conditional group
   reqchar     -> place to put the last required character, or a negative number
   countlits   -> place to put the shortest literal count of any branch
   cd          points to the data block with tables pointers
@@ -1917,7 +2128,7 @@ code += 3;
 /* At the start of a reference-based conditional group, insert the reference
 number as an OP_CREF item. */
 
-if (condref > 0)
+if (condref >= 0)
   {
   *code++ = OP_CREF;
   *code++ = condref;
@@ -1989,7 +2200,7 @@ for (;;)
   if (lookbehind)
     {
     *code = OP_END;
-    length = find_fixedlength(last_branch);
+    length = find_fixedlength(last_branch, options);
     DPRINTF(("fixed length = %d\n", length));
     if (length < 0)
       {
@@ -2280,6 +2491,16 @@ uschar bralenstack[BRASTACK_SIZE];
 uschar *code_base, *code_end;
 #endif
 
+/* Can't support UTF8 unless PCRE has been compiled to include the code. */
+
+#ifndef SUPPORT_UTF8
+if ((options & PCRE_UTF8) != 0)
+  {
+  *errorptr = ERR32;
+  return NULL;
+  }
+#endif
+
 /* We can't pass back an error message if errorptr is NULL; I guess the best we
 can do is just return NULL. */
 
@@ -2775,6 +2996,16 @@ while ((c = *(++ptr)) != 0)
           &compile_block);
         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
         if (c < 0) { ptr = saveptr; break; }
+
+#ifdef SUPPORT_UTF8
+        if (c > 127 && (options & PCRE_UTF8) != 0)
+          {
+          int i;
+          for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
+            if (c <= utf8_table1[i]) break;
+          runlength += i;
+          }
+#endif
         }
 
       /* Ordinary character or single-char escape */
@@ -2784,7 +3015,7 @@ while ((c = *(++ptr)) != 0)
 
     /* This "while" is the end of the "do" above. */
 
-    while (runlength < 255 &&
+    while (runlength < MAXLIT &&
       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
 
     ptr--;
@@ -3429,10 +3660,21 @@ for (;;)
 
     /* Move the subject pointer back. This occurs only at the start of
     each branch of a lookbehind assertion. If we are too close to the start to
-    move back, this match function fails. */
+    move back, this match function fails. When working with UTF-8 we move
+    back a number of characters, not bytes. */
 
     case OP_REVERSE:
+#ifdef SUPPORT_UTF8
+    c = (ecode[1] << 8) + ecode[2];
+    for (i = 0; i < c; i++)
+      {
+      eptr--;
+      BACKCHAR(eptr)
+      }
+#else
     eptr -= (ecode[1] << 8) + ecode[2];
+#endif
+
     if (eptr < md->start_subject) return FALSE;
     ecode += 3;
     break;
@@ -3752,6 +3994,10 @@ for (;;)
     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
       return FALSE;
     if (eptr++ >= md->end_subject) return FALSE;
+#ifdef SUPPORT_UTF8
+    if (md->utf8)
+      while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+#endif
     ecode++;
     break;
 
@@ -3953,7 +4199,13 @@ for (;;)
       for (i = 1; i <= min; i++)
         {
         if (eptr >= md->end_subject) return FALSE;
-        c = *eptr++;
+        GETCHARINC(c, eptr)         /* Get character; increment eptr */
+
+#ifdef SUPPORT_UTF8
+        /* We do not yet support class members > 255 */
+        if (c > 255) return FALSE;
+#endif
+
         if ((data[c/8] & (1 << (c&7))) != 0) continue;
         return FALSE;
         }
@@ -3973,7 +4225,12 @@ for (;;)
           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
             return TRUE;
           if (i >= max || eptr >= md->end_subject) return FALSE;
-          c = *eptr++;
+          GETCHARINC(c, eptr)       /* Get character; increment eptr */
+
+#ifdef SUPPORT_UTF8
+          /* We do not yet support class members > 255 */
+          if (c > 255) return FALSE;
+#endif
           if ((data[c/8] & (1 << (c&7))) != 0) continue;
           return FALSE;
           }
@@ -3985,17 +4242,29 @@ for (;;)
       else
         {
         const uschar *pp = eptr;
-        for (i = min; i < max; eptr++, i++)
+        int len = 1;
+        for (i = min; i < max; i++)
           {
           if (eptr >= md->end_subject) break;
-          c = *eptr;
-          if ((data[c/8] & (1 << (c&7))) != 0) continue;
-          break;
+          GETCHARLEN(c, eptr, len)  /* Get character, set length if UTF-8 */
+
+#ifdef SUPPORT_UTF8
+          /* We do not yet support class members > 255 */
+          if (c > 255) break;
+#endif
+          if ((data[c/8] & (1 << (c&7))) == 0) break;
+          eptr += len;
           }
 
         while (eptr >= pp)
+          {
           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
             return TRUE;
+
+#ifdef SUPPORT_UTF8
+          BACKCHAR(eptr)
+#endif
+          }
         return FALSE;
         }
       }
@@ -4315,13 +4584,29 @@ for (;;)
 
     /* First, ensure the minimum number of matches are present. Use inline
     code for maximizing the speed, and do the type test once at the start
-    (i.e. keep it out of the loop). Also test that there are at least the
-    minimum number of characters before we start. */
+    (i.e. keep it out of the loop). Also we can test that there are at least
+    the minimum number of bytes before we start, except when doing '.' in
+    UTF8 mode. Leave the test in in all cases; in the special case we have
+    to test after each character. */
 
     if (min > md->end_subject - eptr) return FALSE;
     if (min > 0) switch(ctype)
       {
       case OP_ANY:
+#ifdef SUPPORT_UTF8
+      if (md->utf8)
+        {
+        for (i = 1; i <= min; i++)
+          {
+          if (eptr >= md->end_subject ||
+             (*eptr++ == '\n' && (ims & PCRE_DOTALL) == 0))
+            return FALSE;
+          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+          }
+        break;
+        }
+#endif
+      /* Non-UTF8 can be faster */
       if ((ims & PCRE_DOTALL) == 0)
         { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
       else eptr += min;
@@ -4379,6 +4664,10 @@ for (;;)
           {
           case OP_ANY:
           if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
+#ifdef SUPPORT_UTF8
+          if (md->utf8)
+            while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+#endif
           break;
 
           case OP_NOT_DIGIT:
@@ -4418,6 +4707,33 @@ for (;;)
       switch(ctype)
         {
         case OP_ANY:
+
+        /* Special code is required for UTF8, but when the maximum is unlimited
+        we don't need it. */
+
+#ifdef SUPPORT_UTF8
+        if (md->utf8 && max < INT_MAX)
+          {
+          if ((ims & PCRE_DOTALL) == 0)
+            {
+            for (i = min; i < max; i++)
+              {
+              if (eptr >= md->end_subject || *eptr++ == '\n') break;
+              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+              }
+            }
+          else
+            {
+            for (i = min; i < max; i++)
+              {
+              eptr++;
+              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+              }
+            }
+          break;
+          }
+#endif
+        /* Non-UTF8 can be faster */
         if ((ims & PCRE_DOTALL) == 0)
           {
           for (i = min; i < max; i++)
@@ -4490,8 +4806,14 @@ for (;;)
         }
 
       while (eptr >= pp)
+        {
         if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
           return TRUE;
+#ifdef SUPPORT_UTF8
+        if (md->utf8)
+          while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
+#endif
+        }
       return FALSE;
       }
     /* Control never gets here */
@@ -4572,6 +4894,7 @@ match_block.end_subject = match_block.start_subject + length;
 end_subject = match_block.end_subject;
 
 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
+match_block.utf8 = (re->options & PCRE_UTF8) != 0;
 
 match_block.notbol = (options & PCRE_NOTBOL) != 0;
 match_block.noteol = (options & PCRE_NOTEOL) != 0;