upgrade PCRE to version 7.9

author: Nuno Lopes <nlopess@php.net> 2009-04-11 18:57:27 +0000
committer: Nuno Lopes <nlopess@php.net> 2009-04-11 18:57:27 +0000
commit: 90a2d1979486b2e7fa14be9c3210b9d321c668f8 (patch)
tree: 9f67ba72ee6010944487be2790c467dd2ea2cbaf /ext/pcre/pcrelib/pcre_compile.c
parent: ff62b87cd6b94c50fbbac88bf4129f6418efe131 (diff)
download: php-git-90a2d1979486b2e7fa14be9c3210b9d321c668f8.tar.gz
1 files changed, 542 insertions, 319 deletions
diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c
index b079d1962f..1e0672c5cd 100644
--- a/ext/pcre/pcrelib/pcre_compile.c
+++ b/ext/pcre/pcrelib/pcre_compile.c
@@ -95,21 +95,56 @@ are simple data values; negative values are for special things like \d and so
 on. Zero means further processing is needed (for things like \x), or the escape
 is invalid. */
 
-#ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
+#ifndef EBCDIC
+
+/* This is the "normal" table for ASCII systems or for EBCDIC systems running
+in UTF-8 mode. */
+
 static const short int escapes[] = {
-     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
-     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
-   '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
--ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
--ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
--ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
-   '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
--ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
--ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
-     0,      0, -ESC_z                                            /* x - z */
+     0,                       0,
+     0,                       0,
+     0,                       0,
+     0,                       0,
+     0,                       0,
+     CHAR_COLON,              CHAR_SEMICOLON,
+     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
+     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
+     CHAR_COMMERCIAL_AT,      -ESC_A,
+     -ESC_B,                  -ESC_C,
+     -ESC_D,                  -ESC_E,
+     0,                       -ESC_G,
+     -ESC_H,                  0,
+     0,                       -ESC_K,
+     0,                       0,
+     0,                       0,
+     -ESC_P,                  -ESC_Q,
+     -ESC_R,                  -ESC_S,
+     0,                       0,
+     -ESC_V,                  -ESC_W,
+     -ESC_X,                  0,
+     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
+     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
+     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
+     CHAR_GRAVE_ACCENT,       7,
+     -ESC_b,                  0,
+     -ESC_d,                  ESC_e,
+     ESC_f,                   0,
+     -ESC_h,                  0,
+     0,                       -ESC_k,
+     0,                       0,
+     ESC_n,                   0,
+     -ESC_p,                  0,
+     ESC_r,                   -ESC_s,
+     ESC_tee,                 0,
+     -ESC_v,                  -ESC_w,
+     0,                       0,
+     -ESC_z
 };
 
-#else           /* This is the "abnormal" table for EBCDIC systems */
+#else
+
+/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
+
 static const short int escapes[] = {
 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
@@ -140,7 +175,9 @@ static const short int escapes[] = {
 
 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
 searched linearly. Put all the names into a single string, in order to reduce
-the number of relocations when a shared library is dynamically linked. */
+the number of relocations when a shared library is dynamically linked. The
+string is built from string macros so that it works in UTF-8 mode on EBCDIC
+platforms. */
 
 typedef struct verbitem {
   int   len;
@@ -148,13 +185,13 @@ typedef struct verbitem {
 } verbitem;
 
 static const char verbnames[] =
-  "ACCEPT\0"
-  "COMMIT\0"
-  "F\0"
-  "FAIL\0"
-  "PRUNE\0"
-  "SKIP\0"
-  "THEN";
+  STRING_ACCEPT0
+  STRING_COMMIT0
+  STRING_F0
+  STRING_FAIL0
+  STRING_PRUNE0
+  STRING_SKIP0
+  STRING_THEN;
 
 static const verbitem verbs[] = {
   { 6, OP_ACCEPT },
@@ -176,9 +213,10 @@ length entry. The first three must be alpha, lower, upper, as this is assumed
 for handling case independence. */
 
 static const char posix_names[] =
-  "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
-  "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
-  "word\0"   "xdigit";
+  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
+  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
+  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
+  STRING_word0  STRING_xdigit;
 
 static const uschar posix_name_lengths[] = {
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
@@ -320,7 +358,11 @@ For convenience, we use the same bit definitions as in chartables:
 
 Then we can use ctype_digit and ctype_xdigit in the code. */
 
-#ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
+#ifndef EBCDIC
+
+/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
+UTF-8 mode. */
+
 static const unsigned char digitab[] =
   {
   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
@@ -356,7 +398,10 @@ static const unsigned char digitab[] =
   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 
-#else           /* This is the "abnormal" case, for EBCDIC systems */
+#else
+
+/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
+
 static const unsigned char digitab[] =
   {
   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
@@ -501,9 +546,9 @@ if (c == 0) *errorcodeptr = ERR1;
 in a table. A non-zero result is something that can be returned immediately.
 Otherwise further processing may be required. */
 
-#ifndef EBCDIC  /* ASCII coding */
-else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
-else if ((i = escapes[c - '0']) != 0) c = i;
+#ifndef EBCDIC  /* ASCII/UTF-8 coding */
+else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
+else if ((i = escapes[c - CHAR_0]) != 0) c = i;
 
 #else           /* EBCDIC coding */
 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
@@ -522,11 +567,11 @@ else
     /* A number of Perl escapes are not handled by PCRE. We give an explicit
     error. */
 
-    case 'l':
-    case 'L':
-    case 'N':
-    case 'u':
-    case 'U':
+    case CHAR_l:
+    case CHAR_L:
+    case CHAR_N:
+    case CHAR_u:
+    case CHAR_U:
     *errorcodeptr = ERR37;
     break;
 
@@ -546,8 +591,8 @@ else
     (possibly recursive) subroutine calls, _not_ backreferences. Just return
     the -ESC_g code (cf \k). */
 
-    case 'g':
-    if (ptr[1] == '<' || ptr[1] == '\'')
+    case CHAR_g:
+    if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
       {
       c = -ESC_g;
       break;
@@ -555,12 +600,12 @@ else
 
     /* Handle the Perl-compatible cases */
 
-    if (ptr[1] == '{')
+    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
       {
       const uschar *p;
-      for (p = ptr+2; *p != 0 && *p != '}'; p++)
-        if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
-      if (*p != 0 && *p != '}')
+      for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
+        if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
+      if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
         {
         c = -ESC_k;
         break;
@@ -570,7 +615,7 @@ else
       }
     else braced = FALSE;
 
-    if (ptr[1] == '-')
+    if (ptr[1] == CHAR_MINUS)
       {
       negated = TRUE;
       ptr++;
@@ -579,7 +624,7 @@ else
 
     c = 0;
     while ((digitab[ptr[1]] & ctype_digit) != 0)
-      c = c * 10 + *(++ptr) - '0';
+      c = c * 10 + *(++ptr) - CHAR_0;
 
     if (c < 0)   /* Integer overflow */
       {
@@ -587,7 +632,7 @@ else
       break;
       }
 
-    if (braced && *(++ptr) != '}')
+    if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
       {
       *errorcodeptr = ERR57;
       break;
@@ -624,15 +669,15 @@ else
     value is greater than 377, the least significant 8 bits are taken. Inside a
     character class, \ followed by a digit is always an octal number. */
 
-    case '1': case '2': case '3': case '4': case '5':
-    case '6': case '7': case '8': case '9':
+    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
+    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
 
     if (!isclass)
       {
       oldptr = ptr;
-      c -= '0';
+      c -= CHAR_0;
       while ((digitab[ptr[1]] & ctype_digit) != 0)
-        c = c * 10 + *(++ptr) - '0';
+        c = c * 10 + *(++ptr) - CHAR_0;
       if (c < 0)    /* Integer overflow */
         {
         *errorcodeptr = ERR61;
@@ -650,7 +695,7 @@ else
     generates a binary zero byte and treats the digit as a following literal.
     Thus we have to pull back the pointer by one. */
 
-    if ((c = *ptr) >= '8')
+    if ((c = *ptr) >= CHAR_8)
       {
       ptr--;
       c = 0;
@@ -663,10 +708,10 @@ else
     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
     than 3 octal digits. */
 
-    case '0':
-    c -= '0';
-    while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
-        c = c * 8 + *(++ptr) - '0';
+    case CHAR_0:
+    c -= CHAR_0;
+    while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
+        c = c * 8 + *(++ptr) - CHAR_0;
     if (!utf8 && c > 255) *errorcodeptr = ERR51;
     break;
 
@@ -674,8 +719,8 @@ else
     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
     treated as a data character. */
 
-    case 'x':
-    if (ptr[1] == '{')
+    case CHAR_x:
+    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
       {
       const uschar *pt = ptr + 2;
       int count = 0;
@@ -684,19 +729,19 @@ else
       while ((digitab[*pt] & ctype_xdigit) != 0)
         {
         register int cc = *pt++;
-        if (c == 0 && cc == '0') continue;     /* Leading zeroes */
+        if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
         count++;
 
-#ifndef EBCDIC  /* ASCII coding */
-        if (cc >= 'a') cc -= 32;               /* Convert to upper case */
-        c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
+#ifndef EBCDIC  /* ASCII/UTF-8 coding */
+        if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
+        c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 #else           /* EBCDIC coding */
-        if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
-        c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
+        if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
+        c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 #endif
         }
 
-      if (*pt == '}')
+      if (*pt == CHAR_RIGHT_CURLY_BRACKET)
         {
         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
         ptr = pt;
@@ -712,14 +757,14 @@ else
     c = 0;
     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
       {
-      int cc;                               /* Some compilers don't like ++ */
-      cc = *(++ptr);                        /* in initializers */
-#ifndef EBCDIC  /* ASCII coding */
-      if (cc >= 'a') cc -= 32;              /* Convert to upper case */
-      c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
+      int cc;                                  /* Some compilers don't like */
+      cc = *(++ptr);                           /* ++ in initializers */
+#ifndef EBCDIC  /* ASCII/UTF-8 coding */
+      if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
+      c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 #else           /* EBCDIC coding */
-      if (cc <= 'z') cc += 64;              /* Convert to upper case */
-      c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
+      if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
+      c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 #endif
       }
     break;
@@ -728,7 +773,7 @@ else
     This coding is ASCII-specific, but then the whole concept of \cx is
     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
 
-    case 'c':
+    case CHAR_c:
     c = *(++ptr);
     if (c == 0)
       {
@@ -736,11 +781,11 @@ else
       break;
       }
 
-#ifndef EBCDIC  /* ASCII coding */
-    if (c >= 'a' && c <= 'z') c -= 32;
+#ifndef EBCDIC  /* ASCII/UTF-8 coding */
+    if (c >= CHAR_a && c <= CHAR_z) c -= 32;
     c ^= 0x40;
 #else           /* EBCDIC coding */
-    if (c >= 'a' && c <= 'z') c += 64;
+    if (c >= CHAR_a && c <= CHAR_z) c += 64;
     c ^= 0xC0;
 #endif
     break;
@@ -802,9 +847,9 @@ if (c == 0) goto ERROR_RETURN;
 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
 negation. */
 
-if (c == '{')
+if (c == CHAR_LEFT_CURLY_BRACKET)
   {
-  if (ptr[1] == '^')
+  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
     {
     *negptr = TRUE;
     ptr++;
@@ -813,10 +858,10 @@ if (c == '{')
     {
     c = *(++ptr);
     if (c == 0) goto ERROR_RETURN;
-    if (c == '}') break;
+    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
     name[i] = c;
     }
-  if (c !='}') goto ERROR_RETURN;
+  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
   name[i] = 0;
   }
 
@@ -881,15 +926,15 @@ is_counted_repeat(const uschar *p)
 {
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 while ((digitab[*p] & ctype_digit) != 0) p++;
-if (*p == '}') return TRUE;
+if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
 
-if (*p++ != ',') return FALSE;
-if (*p == '}') return TRUE;
+if (*p++ != CHAR_COMMA) return FALSE;
+if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
 
 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 while ((digitab[*p] & ctype_digit) != 0) p++;
 
-return (*p == '}');
+return (*p == CHAR_RIGHT_CURLY_BRACKET);
 }
 
 
@@ -922,7 +967,7 @@ int max = -1;
 /* Read the minimum value and do a paranoid check: a negative value indicates
 an integer overflow. */
 
-while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
+while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
 if (min < 0 || min > 65535)
   {
   *errorcodeptr = ERR5;
@@ -932,12 +977,12 @@ if (min < 0 || min > 65535)
 /* Read the maximum value if there is one, and again do a paranoid on its size.
 Also, max must not be less than min. */
 
-if (*p == '}') max = min; else
+if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
   {
-  if (*(++p) != '}')
+  if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
     {
     max = 0;
-    while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
+    while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
     if (max < 0 || max > 65535)
       {
       *errorcodeptr = ERR5;
@@ -962,47 +1007,116 @@ return p;
 
 
 /*************************************************
-*       Find forward referenced subpattern       *
+*  Subroutine for finding forward reference      *
 *************************************************/
 
-/* This function scans along a pattern's text looking for capturing
+/* This recursive function is called only from find_parens() below. The
+top-level call starts at the beginning of the pattern. All other calls must
+start at a parenthesis. It scans along a pattern's text looking for capturing
 subpatterns, and counting them. If it finds a named pattern that matches the
 name it is given, it returns its number. Alternatively, if the name is NULL, it
-returns when it reaches a given numbered subpattern. This is used for forward
-references to subpatterns. We know that if (?P< is encountered, the name will
-be terminated by '>' because that is checked in the first pass.
+returns when it reaches a given numbered subpattern. We know that if (?P< is
+encountered, the name will be terminated by '>' because that is checked in the
+first pass. Recursion is used to keep track of subpatterns that reset the
+capturing group numbers - the (?| feature.
 
 Arguments:
-  ptr          current position in the pattern
+  ptrptr       address of the current character pointer (updated)
   cd           compile background data
   name         name to seek, or NULL if seeking a numbered subpattern
   lorn         name length, or subpattern number if name is NULL
   xmode        TRUE if we are in /x mode
+  count        pointer to the current capturing subpattern number (updated)
 
 Returns:       the number of the named subpattern, or -1 if not found
 */
 
 static int
-find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
-  BOOL xmode)
+find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
+  BOOL xmode, int *count)
 {
-const uschar *thisname;
-int count = cd->bracount;
+uschar *ptr = *ptrptr;
+int start_count = *count;
+int hwm_count = start_count;
+BOOL dup_parens = FALSE;
 
-for (; *ptr != 0; ptr++)
+/* If the first character is a parenthesis, check on the type of group we are
+dealing with. The very first call may not start with a parenthesis. */
+
+if (ptr[0] == CHAR_LEFT_PARENTHESIS)
   {
-  int term;
+  if (ptr[1] == CHAR_QUESTION_MARK &&
+      ptr[2] == CHAR_VERTICAL_LINE)
+    {
+    ptr += 3;
+    dup_parens = TRUE;
+    }
+
+  /* Handle a normal, unnamed capturing parenthesis */
+
+  else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
+    {
+    *count += 1;
+    if (name == NULL && *count == lorn) return *count;
+    ptr++;
+    }
+
+  /* Handle a condition. If it is an assertion, just carry on so that it
+  is processed as normal. If not, skip to the closing parenthesis of the
+  condition (there can't be any nested parens. */
+
+  else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
+    {
+    ptr += 2;
+    if (ptr[1] != CHAR_QUESTION_MARK)
+      {
+      while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
+      if (*ptr != 0) ptr++;
+      }
+    }
+
+  /* We have either (? or (* and not a condition */
+
+  else
+    {
+    ptr += 2;
+    if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
+
+    /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
+
+    if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
+        ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
+      {
+      int term;
+      const uschar *thisname;
+      *count += 1;
+      if (name == NULL && *count == lorn) return *count;
+      term = *ptr++;
+      if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
+      thisname = ptr;
+      while (*ptr != term) ptr++;
+      if (name != NULL && lorn == ptr - thisname &&
+          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
+        return *count;
+      }
+    }
+  }
 
+/* Past any initial parenthesis handling, scan for parentheses or vertical
+bars. */
+
+for (; *ptr != 0; ptr++)
+  {
   /* Skip over backslashed characters and also entire \Q...\E */
 
-  if (*ptr == '\\')
+  if (*ptr == CHAR_BACKSLASH)
     {
-    if (*(++ptr) == 0) return -1;
-    if (*ptr == 'Q') for (;;)
+    if (*(++ptr) == 0) goto FAIL_EXIT;
+    if (*ptr == CHAR_Q) for (;;)
       {
-      while (*(++ptr) != 0 && *ptr != '\\') {};
-      if (*ptr == 0) return -1;
-      if (*(++ptr) == 'E') break;
+      while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
+      if (*ptr == 0) goto FAIL_EXIT;
+      if (*(++ptr) == CHAR_E) break;
       }
     continue;
     }
@@ -1010,21 +1124,26 @@ for (; *ptr != 0; ptr++)
   /* Skip over character classes; this logic must be similar to the way they
   are handled for real. If the first character is '^', skip it. Also, if the
   first few characters (either before or after ^) are \Q\E or \E we skip them
-  too. This makes for compatibility with Perl. */
+  too. This makes for compatibility with Perl. Note the use of STR macros to
+  encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
 
-  if (*ptr == '[')
+  if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
     {
     BOOL negate_class = FALSE;
     for (;;)
       {
       int c = *(++ptr);
-      if (c == '\\')
+      if (c == CHAR_BACKSLASH)
         {
-        if (ptr[1] == 'E') ptr++;
-          else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
-            else break;
+        if (ptr[1] == CHAR_E)
+          ptr++;
+        else if (strncmp((const char *)ptr+1,
+                 STR_Q STR_BACKSLASH STR_E, 3) == 0)
+          ptr += 3;
+        else
+          break;
         }
-      else if (!negate_class && c == '^')
+      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
         negate_class = TRUE;
       else break;
       }
@@ -1032,20 +1151,21 @@ for (; *ptr != 0; ptr++)
     /* If the next character is ']', it is a data character that must be
     skipped, except in JavaScript compatibility mode. */
 
-    if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
+    if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
+        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
       ptr++;
 
-    while (*(++ptr) != ']')
+    while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
       {
       if (*ptr == 0) return -1;
-      if (*ptr == '\\')
+      if (*ptr == CHAR_BACKSLASH)
         {
-        if (*(++ptr) == 0) return -1;
-        if (*ptr == 'Q') for (;;)
+        if (*(++ptr) == 0) goto FAIL_EXIT;
+        if (*ptr == CHAR_Q) for (;;)
           {
-          while (*(++ptr) != 0 && *ptr != '\\') {};
-          if (*ptr == 0) return -1;
-          if (*(++ptr) == 'E') break;
+          while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
+          if (*ptr == 0) goto FAIL_EXIT;
+          if (*(++ptr) == CHAR_E) break;
           }
         continue;
         }
@@ -1055,49 +1175,92 @@ for (; *ptr != 0; ptr++)
 
   /* Skip comments in /x mode */
 
-  if (xmode && *ptr == '#')
+  if (xmode && *ptr == CHAR_NUMBER_SIGN)
     {
-    while (*(++ptr) != 0 && *ptr != '\n') {};
-    if (*ptr == 0) return -1;
+    while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
+    if (*ptr == 0) goto FAIL_EXIT;
     continue;
     }
 
-  /* An opening parens must now be a real metacharacter */
+  /* Check for the special metacharacters */
 
-  if (*ptr != '(') continue;
-  if (ptr[1] != '?' && ptr[1] != '*')
+  if (*ptr == CHAR_LEFT_PARENTHESIS)
     {
-    count++;
-    if (name == NULL && count == lorn) return count;
-    continue;
+    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
+    if (rc > 0) return rc;
+    if (*ptr == 0) goto FAIL_EXIT;
+    }
+
+  else if (*ptr == CHAR_RIGHT_PARENTHESIS)
+    {
+    if (dup_parens && *count < hwm_count) *count = hwm_count;
+    *ptrptr = ptr;
+    return -1;
     }
 
-  ptr += 2;
-  if (*ptr == 'P') ptr++;                      /* Allow optional P */
+  else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
+    {
+    if (*count > hwm_count) hwm_count = *count;
+    *count = start_count;
+    }
+  }
+
+FAIL_EXIT:
+*ptrptr = ptr;
+return -1;
+}
+
 
-  /* We have to disambiguate (?<! and (?<= from (?<name> */
 
-  if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
-       *ptr != '\'')
-    continue;
 
-  count++;
+/*************************************************
+*       Find forward referenced subpattern       *
+*************************************************/
+
+/* This function scans along a pattern's text looking for capturing
+subpatterns, and counting them. If it finds a named pattern that matches the
+name it is given, it returns its number. Alternatively, if the name is NULL, it
+returns when it reaches a given numbered subpattern. This is used for forward
+references to subpatterns. We used to be able to start this scan from the
+current compiling point, using the current count value from cd->bracount, and
+do it all in a single loop, but the addition of the possibility of duplicate
+subpattern numbers means that we have to scan from the very start, in order to
+take account of such duplicates, and to use a recursive function to keep track
+of the different types of group.
+
+Arguments:
+  cd           compile background data
+  name         name to seek, or NULL if seeking a numbered subpattern
+  lorn         name length, or subpattern number if name is NULL
+  xmode        TRUE if we are in /x mode
+
+Returns:       the number of the found subpattern, or -1 if not found
+*/
+
+static int
+find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
+{
+uschar *ptr = (uschar *)cd->start_pattern;
+int count = 0;
+int rc;
 
-  if (name == NULL && count == lorn) return count;
-  term = *ptr++;
-  if (term == '<') term = '>';
-  thisname = ptr;
-  while (*ptr != term) ptr++;
-  if (name != NULL && lorn == ptr - thisname &&
-      strncmp((const char *)name, (const char *)thisname, lorn) == 0)
-    return count;
+/* If the pattern does not start with an opening parenthesis, the first call
+to find_parens_sub() will scan right to the end (if necessary). However, if it
+does start with a parenthesis, find_parens_sub() will return when it hits the
+matching closing parens. That is why we have to have a loop. */
+
+for (;;)
+  {
+  rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
+  if (rc > 0 || *ptr++ == 0) break;
   }
 
-return -1;
+return rc;
 }
 
 
 
+
 /*************************************************
 *      Find first significant op code            *
 *************************************************/
@@ -1611,17 +1774,25 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
     BOOL empty_branch;
     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
 
-    /* Scan a closed bracket */
+    /* If a conditional group has only one branch, there is a second, implied,
+    empty branch, so just skip over the conditional, because it could be empty.
+    Otherwise, scan the individual branches of the group. */
 
-    empty_branch = FALSE;
-    do
-      {
-      if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
-        empty_branch = TRUE;
+    if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
       code += GET(code, 1);
+    else
+      {
+      empty_branch = FALSE;
+      do
+        {
+        if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
+          empty_branch = TRUE;
+        code += GET(code, 1);
+        }
+      while (*code == OP_ALT);
+      if (!empty_branch) return FALSE;   /* All branches are non-empty */
       }
-    while (*code == OP_ALT);
-    if (!empty_branch) return FALSE;   /* All branches are non-empty */
+
     c = *code;
     continue;
     }
@@ -1823,10 +1994,10 @@ int terminator;          /* Don't combine these lines; the Solaris cc */
 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
 for (++ptr; *ptr != 0; ptr++)
   {
-  if (*ptr == '\\' && ptr[1] == ']') ptr++; else
+  if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
     {
-    if (*ptr == ']') return FALSE;
-    if (*ptr == terminator && ptr[1] == ']')
+    if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
+    if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
       {
       *endptr = ptr;
       return TRUE;
@@ -2072,7 +2243,7 @@ if ((options & PCRE_EXTENDED) != 0)
   for (;;)
     {
     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
-    if (*ptr == '#')
+    if (*ptr == CHAR_NUMBER_SIGN)
       {
       while (*(++ptr) != 0)
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
@@ -2084,7 +2255,7 @@ if ((options & PCRE_EXTENDED) != 0)
 /* If the next item is one that we can handle, get its value. A non-negative
 value is a character, a negative value is an escape value. */
 
-if (*ptr == '\\')
+if (*ptr == CHAR_BACKSLASH)
   {
   int temperrorcode = 0;
   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
@@ -2109,7 +2280,7 @@ if ((options & PCRE_EXTENDED) != 0)
   for (;;)
     {
     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
-    if (*ptr == '#')
+    if (*ptr == CHAR_NUMBER_SIGN)
       {
       while (*(++ptr) != 0)
         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
@@ -2120,8 +2291,9 @@ if ((options & PCRE_EXTENDED) != 0)
 
 /* If the next thing is itself optional, we have to give up. */
 
-if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
-  return FALSE;
+if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
+  strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
+    return FALSE;
 
 /* Now compare the next item with the previous opcode. If the previous is a
 positive single character match, "item" either contains the character or, if
@@ -2559,7 +2731,7 @@ for (;; ptr++)
 
   if (inescq && c != 0)
     {
-    if (c == '\\' && ptr[1] == 'E')
+    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
       {
       inescq = FALSE;
       ptr++;
@@ -2585,8 +2757,9 @@ for (;; ptr++)
   /* Fill in length of a previous callout, except when the next thing is
   a quantifier. */
 
-  is_quantifier = c == '*' || c == '+' || c == '?' ||
-    (c == '{' && is_counted_repeat(ptr+1));
+  is_quantifier =
+    c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
+    (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
 
   if (!is_quantifier && previous_callout != NULL &&
        after_manual_callout-- <= 0)
@@ -2601,7 +2774,7 @@ for (;; ptr++)
   if ((options & PCRE_EXTENDED) != 0)
     {
     if ((cd->ctypes[c] & ctype_space) != 0) continue;
-    if (c == '#')
+    if (c == CHAR_NUMBER_SIGN)
       {
       while (*(++ptr) != 0)
         {
@@ -2626,8 +2799,8 @@ for (;; ptr++)
     {
     /* ===================================================================*/
     case 0:                        /* The branch terminates at string end */
-    case '|':                      /* or | or ) */
-    case ')':
+    case CHAR_VERTICAL_LINE:       /* or | or ) */
+    case CHAR_RIGHT_PARENTHESIS:
     *firstbyteptr = firstbyte;
     *reqbyteptr = reqbyte;
     *codeptr = code;
@@ -2649,7 +2822,7 @@ for (;; ptr++)
     /* Handle single-character metacharacters. In multiline mode, ^ disables
     the setting of any following char as a first character. */
 
-    case '^':
+    case CHAR_CIRCUMFLEX_ACCENT:
     if ((options & PCRE_MULTILINE) != 0)
       {
       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
@@ -2658,7 +2831,7 @@ for (;; ptr++)
     *code++ = OP_CIRC;
     break;
 
-    case '$':
+    case CHAR_DOLLAR_SIGN:
     previous = NULL;
     *code++ = OP_DOLL;
     break;
@@ -2666,7 +2839,7 @@ for (;; ptr++)
     /* There can never be a first char if '.' is first, whatever happens about
     repeats. The value of reqbyte doesn't change either. */
 
-    case '.':
+    case CHAR_DOT:
     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
     zerofirstbyte = firstbyte;
     zeroreqbyte = reqbyte;
@@ -2690,7 +2863,7 @@ for (;; ptr++)
     In JavaScript compatibility mode, an isolated ']' causes an error. In
     default (Perl) mode, it is treated as a data character. */
 
-    case ']':
+    case CHAR_RIGHT_SQUARE_BRACKET:
     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
       {
       *errorcodeptr = ERR64;
@@ -2698,16 +2871,17 @@ for (;; ptr++)
       }
     goto NORMAL_CHAR;
 
-    case '[':
+    case CHAR_LEFT_SQUARE_BRACKET:
     previous = code;
 
     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
     they are encountered at the top level, so we'll do that too. */
 
-    if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
+    if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
+         ptr[1] == CHAR_EQUALS_SIGN) &&
         check_posix_syntax(ptr, &tempptr))
       {
-      *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
+      *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
       goto FAILED;
       }
 
@@ -2719,13 +2893,17 @@ for (;; ptr++)
     for (;;)
       {
       c = *(++ptr);
-      if (c == '\\')
+      if (c == CHAR_BACKSLASH)
         {
-        if (ptr[1] == 'E') ptr++;
-          else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
-            else break;
+        if (ptr[1] == CHAR_E)
+          ptr++;
+        else if (strncmp((const char *)ptr+1,
+                          STR_Q STR_BACKSLASH STR_E, 3) == 0)
+          ptr += 3;
+        else
+          break;
         }
-      else if (!negate_class && c == '^')
+      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
         negate_class = TRUE;
       else break;
       }
@@ -2735,7 +2913,8 @@ for (;; ptr++)
     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
     [^] must match any character, so generate OP_ALLANY. */
 
-    if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
+    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
+        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
       {
       *code++ = negate_class? OP_ALLANY : OP_FAIL;
       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
@@ -2800,7 +2979,7 @@ for (;; ptr++)
 
       if (inescq)
         {
-        if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
+        if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
           {
           inescq = FALSE;                   /* Reset literal state */
           ptr++;                            /* Skip the 'E' */
@@ -2815,23 +2994,23 @@ for (;; ptr++)
       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
       5.6 and 5.8 do. */
 
-      if (c == '[' &&
-          (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
-          check_posix_syntax(ptr, &tempptr))
+      if (c == CHAR_LEFT_SQUARE_BRACKET &&
+          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
+           ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
         {
         BOOL local_negate = FALSE;
         int posix_class, taboffset, tabopt;
         register const uschar *cbits = cd->cbits;
         uschar pbits[32];
 
-        if (ptr[1] != ':')
+        if (ptr[1] != CHAR_COLON)
           {
           *errorcodeptr = ERR31;
           goto FAILED;
           }
 
         ptr += 2;
-        if (*ptr == '^')
+        if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
           {
           local_negate = TRUE;
           should_flip_negation = TRUE;  /* Note negative special */
@@ -2904,17 +3083,17 @@ for (;; ptr++)
       to 'or' into the one we are building. We assume they have more than one
       character in them, so set class_charcount bigger than one. */
 
-      if (c == '\\')
+      if (c == CHAR_BACKSLASH)
         {
         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
         if (*errorcodeptr != 0) goto FAILED;
 
-        if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
-        else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
-        else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
+        if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */
+        else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */
+        else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */
         else if (-c == ESC_Q)            /* Handle start of quoted string */
           {
-          if (ptr[1] == '\\' && ptr[2] == 'E')
+          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
             {
             ptr += 2; /* avoid empty string */
             }
@@ -3140,7 +3319,7 @@ for (;; ptr++)
       entirely. The code for handling \Q and \E is messy. */
 
       CHECK_RANGE:
-      while (ptr[1] == '\\' && ptr[2] == 'E')
+      while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
         {
         inescq = FALSE;
         ptr += 2;
@@ -3150,28 +3329,29 @@ for (;; ptr++)
 
       /* Remember \r or \n */
 
-      if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
+      if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
 
       /* Check for range */
 
-      if (!inescq && ptr[1] == '-')
+      if (!inescq && ptr[1] == CHAR_MINUS)
         {
         int d;
         ptr += 2;
-        while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
+        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
 
         /* If we hit \Q (not followed by \E) at this point, go into escaped
         mode. */
 
-        while (*ptr == '\\' && ptr[1] == 'Q')
+        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
           {
           ptr += 2;
-          if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
+          if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
+            { ptr += 2; continue; }
           inescq = TRUE;
           break;
           }
 
-        if (*ptr == 0 || (!inescq && *ptr == ']'))
+        if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
           {
           ptr = oldptr;
           goto LONE_SINGLE_CHARACTER;
@@ -3190,7 +3370,7 @@ for (;; ptr++)
         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
         in such circumstances. */
 
-        if (!inescq && d == '\\')
+        if (!inescq && d == CHAR_BACKSLASH)
           {
           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
           if (*errorcodeptr != 0) goto FAILED;
@@ -3200,9 +3380,9 @@ for (;; ptr++)
 
           if (d < 0)
             {
-            if (d == -ESC_b) d = '\b';
-            else if (d == -ESC_X) d = 'X';
-            else if (d == -ESC_R) d = 'R'; else
+            if (d == -ESC_b) d = CHAR_BS;
+            else if (d == -ESC_X) d = CHAR_X;
+            else if (d == -ESC_R) d = CHAR_R; else
               {
               ptr = oldptr;
               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
@@ -3223,7 +3403,7 @@ for (;; ptr++)
 
         /* Remember \r or \n */
 
-        if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
+        if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
 
         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
         matching, we have to use an XCLASS with extra data items. Caseless
@@ -3370,7 +3550,7 @@ for (;; ptr++)
 
     /* Loop until ']' reached. This "while" is the end of the "do" above. */
 
-    while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
+    while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
 
     if (c == 0)                          /* Missing terminating ']' */
       {
@@ -3515,23 +3695,23 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
     has been tested above. */
 
-    case '{':
+    case CHAR_LEFT_CURLY_BRACKET:
     if (!is_quantifier) goto NORMAL_CHAR;
     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
     if (*errorcodeptr != 0) goto FAILED;
     goto REPEAT;
 
-    case '*':
+    case CHAR_ASTERISK:
     repeat_min = 0;
     repeat_max = -1;
     goto REPEAT;
 
-    case '+':
+    case CHAR_PLUS:
     repeat_min = 1;
     repeat_max = -1;
     goto REPEAT;
 
-    case '?':
+    case CHAR_QUESTION_MARK:
     repeat_min = 0;
     repeat_max = 1;
 
@@ -3566,13 +3746,13 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
     but if PCRE_UNGREEDY is set, it works the other way round. We change the
     repeat type to the non-default. */
 
-    if (ptr[1] == '+')
+    if (ptr[1] == CHAR_PLUS)
       {
       repeat_type = 0;                  /* Force greedy */
       possessive_quantifier = TRUE;
       ptr++;
       }
-    else if (ptr[1] == '?')
+    else if (ptr[1] == CHAR_QUESTION_MARK)
       {
       repeat_type = greedy_non_default;
       ptr++;
@@ -4205,7 +4385,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
     lookbehind or option setting or condition or all the other extended
     parenthesis forms.  */
 
-    case '(':
+    case CHAR_LEFT_PARENTHESIS:
     newoptions = options;
     skipbytes = 0;
     bravalue = OP_CBRA;
@@ -4214,19 +4394,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
     /* First deal with various "verbs" that can be introduced by '*'. */
 
-    if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
+    if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
       {
       int i, namelen;
       const char *vn = verbnames;
       const uschar *name = ++ptr;
       previous = NULL;
       while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
-      if (*ptr == ':')
+      if (*ptr == CHAR_COLON)
         {
         *errorcodeptr = ERR59;   /* Not supported */
         goto FAILED;
         }
-      if (*ptr != ')')
+      if (*ptr != CHAR_RIGHT_PARENTHESIS)
         {
         *errorcodeptr = ERR60;
         goto FAILED;
@@ -4251,7 +4431,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
     /* Deal with the extended parentheses; all are introduced by '?', and the
     appearance of any of them means that this is not a capturing group. */
 
-    else if (*ptr == '?')
+    else if (*ptr == CHAR_QUESTION_MARK)
       {
       int i, set, unset, namelen;
       int *optset;
@@ -4260,9 +4440,9 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
       switch (*(++ptr))
         {
-        case '#':                 /* Comment; skip to ket */
+        case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
         ptr++;
-        while (*ptr != 0 && *ptr != ')') ptr++;
+        while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
         if (*ptr == 0)
           {
           *errorcodeptr = ERR18;
@@ -4272,19 +4452,19 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
 
         /* ------------------------------------------------------------ */
-        case '|':                 /* Reset capture count for each branch */
+        case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
         reset_bracount = TRUE;
         /* Fall through */
 
         /* ------------------------------------------------------------ */
-        case ':':                 /* Non-capturing bracket */
+        case CHAR_COLON:          /* Non-capturing bracket */
         bravalue = OP_BRA;
         ptr++;
         break;
 
 
         /* ------------------------------------------------------------ */
-        case '(':
+        case CHAR_LEFT_PARENTHESIS:
         bravalue = OP_COND;       /* Conditional group */
 
         /* A condition can be an assertion, a number (referring to a numbered
@@ -4304,7 +4484,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
         the switch. This will take control down to where bracketed groups,
         including assertions, are processed. */
 
-        if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
+        if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
+            ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
           break;
 
         /* Most other conditions use OP_CREF (a couple change to OP_RREF
@@ -4316,7 +4497,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
         /* Check for a test for recursion in a named group. */
 
-        if (ptr[1] == 'R' && ptr[2] == '&')
+        if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
           {
           terminator = -1;
           ptr += 2;
@@ -4326,20 +4507,20 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
         /* Check for a test for a named group's having been set, using the Perl
         syntax (?(<name>) or (?('name') */
 
-        else if (ptr[1] == '<')
+        else if (ptr[1] == CHAR_LESS_THAN_SIGN)
           {
-          terminator = '>';
+          terminator = CHAR_GREATER_THAN_SIGN;
           ptr++;
           }
-        else if (ptr[1] == '\'')
+        else if (ptr[1] == CHAR_APOSTROPHE)
           {
-          terminator = '\'';
+          terminator = CHAR_APOSTROPHE;
           ptr++;
           }
         else
           {
           terminator = 0;
-          if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
+          if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
           }
 
         /* We now expect to read a name; any thing else is an error */
@@ -4359,12 +4540,13 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
           {
           if (recno >= 0)
             recno = ((digitab[*ptr] & ctype_digit) != 0)?
-              recno * 10 + *ptr - '0' : -1;
+              recno * 10 + *ptr - CHAR_0 : -1;
           ptr++;
           }
         namelen = ptr - name;
 
-        if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
+        if ((terminator > 0 && *ptr++ != terminator) ||
+            *ptr++ != CHAR_RIGHT_PARENTHESIS)
           {
           ptr--;      /* Error offset */
           *errorcodeptr = ERR26;
@@ -4386,7 +4568,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
             *errorcodeptr = ERR58;
             goto FAILED;
             }
-          recno = (refsign == '-')?
+          recno = (refsign == CHAR_MINUS)?
             cd->bracount - recno + 1 : recno +cd->bracount;
           if (recno <= 0 || recno > cd->final_bracount)
             {
@@ -4417,7 +4599,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
         /* Search the pattern for a forward reference */
 
-        else if ((i = find_parens(ptr, cd, name, namelen,
+        else if ((i = find_parens(cd, name, namelen,
                         (options & PCRE_EXTENDED) != 0)) > 0)
           {
           PUT2(code, 2+LINK_SIZE, i);
@@ -4438,7 +4620,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
         /* Check for (?(R) for recursion. Allow digits after R to specify a
         specific group number. */
 
-        else if (*name == 'R')
+        else if (*name == CHAR_R)
           {
           recno = 0;
           for (i = 1; i < namelen; i++)
@@ -4448,7 +4630,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
               *errorcodeptr = ERR15;
               goto FAILED;
               }
-            recno = recno * 10 + name[i] - '0';
+            recno = recno * 10 + name[i] - CHAR_0;
             }
           if (recno == 0) recno = RREF_ANY;
           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
@@ -4458,7 +4640,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
         /* Similarly, check for the (?(DEFINE) "condition", which is always
         false. */
 
-        else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
+        else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
           {
           code[1+LINK_SIZE] = OP_DEF;
           skipbytes = 1;
@@ -4483,16 +4665,16 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
 
         /* ------------------------------------------------------------ */
-        case '=':                 /* Positive lookahead */
+        case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
         bravalue = OP_ASSERT;
         ptr++;
         break;
 
 
         /* ------------------------------------------------------------ */
-        case '!':                 /* Negative lookahead */
+        case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
         ptr++;
-        if (*ptr == ')')          /* Optimize (?!) */
+        if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
           {
           *code++ = OP_FAIL;
           previous = NULL;
@@ -4503,15 +4685,15 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
 
         /* ------------------------------------------------------------ */
-        case '<':                 /* Lookbehind or named define */
+        case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
         switch (ptr[1])
           {
-          case '=':               /* Positive lookbehind */
+          case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
           bravalue = OP_ASSERTBACK;
           ptr += 2;
           break;
 
-          case '!':               /* Negative lookbehind */
+          case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
           bravalue = OP_ASSERTBACK_NOT;
           ptr += 2;
           break;
@@ -4526,22 +4708,22 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
 
         /* ------------------------------------------------------------ */
-        case '>':                 /* One-time brackets */
+        case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
         bravalue = OP_ONCE;
         ptr++;
         break;
 
 
         /* ------------------------------------------------------------ */
-        case 'C':                 /* Callout - may be followed by digits; */
+        case CHAR_C:                 /* Callout - may be followed by digits; */
         previous_callout = code;  /* Save for later completion */
         after_manual_callout = 1; /* Skip one item before completing */
         *code++ = OP_CALLOUT;
           {
           int n = 0;
           while ((digitab[*(++ptr)] & ctype_digit) != 0)
-            n = n * 10 + *ptr - '0';
-          if (*ptr != ')')
+            n = n * 10 + *ptr - CHAR_0;
+          if (*ptr != CHAR_RIGHT_PARENTHESIS)
             {
             *errorcodeptr = ERR39;
             goto FAILED;
@@ -4561,14 +4743,15 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
 
         /* ------------------------------------------------------------ */
-        case 'P':                 /* Python-style named subpattern handling */
-        if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
+        case CHAR_P:              /* Python-style named subpattern handling */
+        if (*(++ptr) == CHAR_EQUALS_SIGN ||
+            *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
           {
-          is_recurse = *ptr == '>';
-          terminator = ')';
+          is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
+          terminator = CHAR_RIGHT_PARENTHESIS;
           goto NAMED_REF_OR_RECURSE;
           }
-        else if (*ptr != '<')    /* Test for Python-style definition */
+        else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
           {
           *errorcodeptr = ERR41;
           goto FAILED;
@@ -4578,9 +4761,10 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
         /* ------------------------------------------------------------ */
         DEFINE_NAME:    /* Come here from (?< handling */
-        case '\'':
+        case CHAR_APOSTROPHE:
           {
-          terminator = (*ptr == '<')? '>' : '\'';
+          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
+            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
           name = ++ptr;
 
           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
@@ -4654,8 +4838,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
 
         /* ------------------------------------------------------------ */
-        case '&':                 /* Perl recursion/subroutine syntax */
-        terminator = ')';
+        case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
+        terminator = CHAR_RIGHT_PARENTHESIS;
         is_recurse = TRUE;
         /* Fall through */
 
@@ -4714,7 +4898,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
             recno = GET2(slot, 0);
             }
           else if ((recno =                /* Forward back reference */
-                    find_parens(ptr, cd, name, namelen,
+                    find_parens(cd, name, namelen,
                       (options & PCRE_EXTENDED) != 0)) <= 0)
             {
             *errorcodeptr = ERR15;
@@ -4730,18 +4914,18 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
 
         /* ------------------------------------------------------------ */
-        case 'R':                 /* Recursion */
+        case CHAR_R:              /* Recursion */
         ptr++;                    /* Same as (?0)      */
         /* Fall through */
 
 
         /* ------------------------------------------------------------ */
-        case '-': case '+':
-        case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
-        case '5': case '6': case '7': case '8': case '9':   /* subroutine */
+        case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
+        case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
+        case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
           {
           const uschar *called;
-          terminator = ')';
+          terminator = CHAR_RIGHT_PARENTHESIS;
 
           /* Come here from the \g<...> and \g'...' code (Oniguruma
           compatibility). However, the syntax has been checked to ensure that
@@ -4751,7 +4935,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
           HANDLE_NUMERICAL_RECURSION:
 
-          if ((refsign = *ptr) == '+')
+          if ((refsign = *ptr) == CHAR_PLUS)
             {
             ptr++;
             if ((digitab[*ptr] & ctype_digit) == 0)
@@ -4760,7 +4944,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
               goto FAILED;
               }
             }
-          else if (refsign == '-')
+          else if (refsign == CHAR_MINUS)
             {
             if ((digitab[ptr[1]] & ctype_digit) == 0)
               goto OTHER_CHAR_AFTER_QUERY;
@@ -4769,7 +4953,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
           recno = 0;
           while((digitab[*ptr] & ctype_digit) != 0)
-            recno = recno * 10 + *ptr++ - '0';
+            recno = recno * 10 + *ptr++ - CHAR_0;
 
           if (*ptr != terminator)
             {
@@ -4777,7 +4961,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
             goto FAILED;
             }
 
-          if (refsign == '-')
+          if (refsign == CHAR_MINUS)
             {
             if (recno == 0)
               {
@@ -4791,7 +4975,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
               goto FAILED;
               }
             }
-          else if (refsign == '+')
+          else if (refsign == CHAR_PLUS)
             {
             if (recno == 0)
               {
@@ -4824,7 +5008,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
             if (called == NULL)
               {
-              if (find_parens(ptr, cd, NULL, recno,
+              if (find_parens(cd, NULL, recno,
                     (options & PCRE_EXTENDED) != 0) < 0)
                 {
                 *errorcodeptr = ERR15;
@@ -4877,23 +5061,23 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
         set = unset = 0;
         optset = &set;
 
-        while (*ptr != ')' && *ptr != ':')
+        while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
           {
           switch (*ptr++)
             {
-            case '-': optset = &unset; break;
+            case CHAR_MINUS: optset = &unset; break;
 
-            case 'J':    /* Record that it changed in the external options */
+            case CHAR_J:    /* Record that it changed in the external options */
             *optset |= PCRE_DUPNAMES;
             cd->external_flags |= PCRE_JCHANGED;
             break;
 
-            case 'i': *optset |= PCRE_CASELESS; break;
-            case 'm': *optset |= PCRE_MULTILINE; break;
-            case 's': *optset |= PCRE_DOTALL; break;
-            case 'x': *optset |= PCRE_EXTENDED; break;
-            case 'U': *optset |= PCRE_UNGREEDY; break;
-            case 'X': *optset |= PCRE_EXTRA; break;
+            case CHAR_i: *optset |= PCRE_CASELESS; break;
+            case CHAR_m: *optset |= PCRE_MULTILINE; break;
+            case CHAR_s: *optset |= PCRE_DOTALL; break;
+            case CHAR_x: *optset |= PCRE_EXTENDED; break;
+            case CHAR_U: *optset |= PCRE_UNGREEDY; break;
+            case CHAR_X: *optset |= PCRE_EXTRA; break;
 
             default:  *errorcodeptr = ERR12;
                       ptr--;    /* Correct the offset */
@@ -4927,7 +5111,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
         options if this setting actually changes any of them, and reset the
         greedy defaults and the case value for firstbyte and reqbyte. */
 
-        if (*ptr == ')')
+        if (*ptr == CHAR_RIGHT_PARENTHESIS)
           {
           if (code == cd->start_code + 1 + LINK_SIZE &&
                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
@@ -5067,7 +5251,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
     /* Error if hit end of pattern */
 
-    if (*ptr != ')')
+    if (*ptr != CHAR_RIGHT_PARENTHESIS)
       {
       *errorcodeptr = ERR14;
       goto FAILED;
@@ -5165,7 +5349,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
     We can test for values between ESC_b and ESC_Z for the latter; this may
     have to change if any new ones are ever created. */
 
-    case '\\':
+    case CHAR_BACKSLASH:
     tempptr = ptr;
     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
     if (*errorcodeptr != 0) goto FAILED;
@@ -5174,8 +5358,9 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
       {
       if (-c == ESC_Q)            /* Handle start of quoted string */
         {
-        if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
-          else inescq = TRUE;
+        if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
+          ptr += 2;               /* avoid empty string */
+            else inescq = TRUE;
         continue;
         }
 
@@ -5203,7 +5388,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
         {
         const uschar *p;
         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
-        terminator = (*(++ptr) == '<')? '>' : '\'';
+        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
+          CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
 
         /* These two statements stop the compiler for warning about possibly
         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
@@ -5215,7 +5401,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
         /* Test for a name */
 
-        if (ptr[1] != '+' && ptr[1] != '-')
+        if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
           {
           BOOL isnumber = TRUE;
           for (p = ptr + 1; *p != 0 && *p != terminator; p++)
@@ -5253,10 +5439,13 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
       We also support \k{name} (.NET syntax) */
 
-      if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
+      if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
+          ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
         {
         is_recurse = FALSE;
-        terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
+        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
+          CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
+          CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
         goto NAMED_REF_OR_RECURSE;
         }
 
@@ -5359,7 +5548,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
     /* Remember if \r or \n were seen */
 
-    if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
+    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
       cd->external_flags |= PCRE_HASCRORLF;
 
     /* Set the first and required bytes appropriately. If no previous first
@@ -5604,7 +5793,7 @@ for (;;)
   compile a resetting op-code following, except at the very end of the pattern.
   Return leaving the pointer at the terminating char. */
 
-  if (*ptr != '|')
+  if (*ptr != CHAR_VERTICAL_LINE)
     {
     if (lengthptr == NULL)
       {
@@ -5627,7 +5816,7 @@ for (;;)
 
     /* Resetting option if needed */
 
-    if ((options & PCRE_IMS) != oldims && *ptr == ')')
+    if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
       {
       *code++ = OP_OPT;
       *code++ = oldims;
@@ -5809,6 +5998,32 @@ do {
      NULL, 0, FALSE);
    register int op = *scode;
 
+   /* If we are at the start of a conditional assertion group, *both* the
+   conditional assertion *and* what follows the condition must satisfy the test
+   for start of line. Other kinds of condition fail. Note that there may be an
+   auto-callout at the start of a condition. */
+
+   if (op == OP_COND)
+     {
+     scode += 1 + LINK_SIZE;
+     if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
+     switch (*scode)
+       {
+       case OP_CREF:
+       case OP_RREF:
+       case OP_DEF:
+       return FALSE;
+
+       default:     /* Assertion */
+       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
+       do scode += GET(scode, 1); while (*scode == OP_ALT);
+       scode += 1 + LINK_SIZE;
+       break;
+       }
+     scode = first_significant_code(scode, NULL, 0, FALSE);
+     op = *scode;
+     }
+
    /* Non-capturing brackets */
 
    if (op == OP_BRA)
@@ -5827,8 +6042,10 @@ do {
 
    /* Other brackets */
 
-   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
-     { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
+   else if (op == OP_ASSERT || op == OP_ONCE)
+     {
+     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
+     }
 
    /* .* means "start at start or after \n" if it isn't in brackets that
    may be referenced. */
@@ -6007,30 +6224,6 @@ if (erroroffset == NULL)
 
 *erroroffset = 0;
 
-/* Can't support UTF8 unless PCRE has been compiled to include the code. */
-
-#ifdef SUPPORT_UTF8
-utf8 = (options & PCRE_UTF8) != 0;
-if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
-     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
-  {
-  errorcode = ERR44;
-  goto PCRE_EARLY_ERROR_RETURN2;
-  }
-#else
-if ((options & PCRE_UTF8) != 0)
-  {
-  errorcode = ERR32;
-  goto PCRE_EARLY_ERROR_RETURN;
-  }
-#endif
-
-if ((options & ~PUBLIC_OPTIONS) != 0)
-  {
-  errorcode = ERR17;
-  goto PCRE_EARLY_ERROR_RETURN;
-  }
-
 /* Set up pointers to the individual character tables */
 
 if (tables == NULL) tables = _pcre_default_tables;
@@ -6039,28 +6232,40 @@ cd->fcc = tables + fcc_offset;
 cd->cbits = tables + cbits_offset;
 cd->ctypes = tables + ctypes_offset;
 
+/* Check that all undefined public option bits are zero */
+
+if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
+  {
+  errorcode = ERR17;
+  goto PCRE_EARLY_ERROR_RETURN;
+  }
+
 /* Check for global one-time settings at the start of the pattern, and remember
 the offset for later. */
 
-while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
+while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
+       ptr[skipatstart+1] == CHAR_ASTERISK)
   {
   int newnl = 0;
   int newbsr = 0;
 
-  if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
+  if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
+    { skipatstart += 7; options |= PCRE_UTF8; continue; }
+
+  if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
-  else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3)  == 0)
+  else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
-  else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5)  == 0)
+  else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5)  == 0)
     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
-  else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
+  else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
-  else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8)  == 0)
+  else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
 
-  else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
+  else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
-  else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
+  else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
 
   if (newnl != 0)
@@ -6070,6 +6275,24 @@ while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
   else break;
   }
 
+/* Can't support UTF8 unless PCRE has been compiled to include the code. */
+
+#ifdef SUPPORT_UTF8
+utf8 = (options & PCRE_UTF8) != 0;
+if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
+     (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
+  {
+  errorcode = ERR44;
+  goto PCRE_EARLY_ERROR_RETURN2;
+  }
+#else
+if ((options & PCRE_UTF8) != 0)
+  {
+  errorcode = ERR32;
+  goto PCRE_EARLY_ERROR_RETURN;
+  }
+#endif
+
 /* Check validity of \R options. */
 
 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
@@ -6088,10 +6311,10 @@ current code allows for fixed one- or two-byte sequences, plus "any" and
 switch (options & PCRE_NEWLINE_BITS)
   {
   case 0: newline = NEWLINE; break;   /* Build-time default */
-  case PCRE_NEWLINE_CR: newline = '\r'; break;
-  case PCRE_NEWLINE_LF: newline = '\n'; break;
+  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
+  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
   case PCRE_NEWLINE_CR+
-       PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
+       PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
   case PCRE_NEWLINE_ANY: newline = -1; break;
   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
author	Nuno Lopes <nlopess@php.net>	2009-04-11 18:57:27 +0000
committer	Nuno Lopes <nlopess@php.net>	2009-04-11 18:57:27 +0000
commit	90a2d1979486b2e7fa14be9c3210b9d321c668f8 (patch)
tree	9f67ba72ee6010944487be2790c467dd2ea2cbaf /ext/pcre/pcrelib/pcre_compile.c
parent	ff62b87cd6b94c50fbbac88bf4129f6418efe131 (diff)
download	php-git-90a2d1979486b2e7fa14be9c3210b9d321c668f8.tar.gz