diff options
Diffstat (limited to 'ext/pcre/pcrelib/pcre_compile.c')
| -rw-r--r-- | ext/pcre/pcrelib/pcre_compile.c | 748 | 
1 files changed, 526 insertions, 222 deletions
| diff --git a/ext/pcre/pcrelib/pcre_compile.c b/ext/pcre/pcrelib/pcre_compile.c index 54e23ea39e..53027e603d 100644 --- a/ext/pcre/pcrelib/pcre_compile.c +++ b/ext/pcre/pcrelib/pcre_compile.c @@ -122,7 +122,7 @@ static const short int escapes[] = {       -ESC_H,                  0,       0,                       -ESC_K,       0,                       0, -     0,                       0, +     -ESC_N,                  0,       -ESC_P,                  -ESC_Q,       -ESC_R,                  -ESC_S,       0,                       0, @@ -169,7 +169,7 @@ static const short int escapes[] = {  /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',  /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,  /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0, -/*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P, +/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,  /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,  /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,  /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0, @@ -186,11 +186,14 @@ string is built from string macros so that it works in UTF-8 mode on EBCDIC  platforms. */  typedef struct verbitem { -  int   len; -  int   op; +  int   len;                 /* Length of verb name */ +  int   op;                  /* Op when no arg, or -1 if arg mandatory */ +  int   op_arg;              /* Op when arg present, or -1 if not allowed */  } verbitem;  static const char verbnames[] = +  "\0"                       /* Empty name is a shorthand for MARK */ +  STRING_MARK0    STRING_ACCEPT0    STRING_COMMIT0    STRING_F0 @@ -200,13 +203,15 @@ static const char verbnames[] =    STRING_THEN;  static const verbitem verbs[] = { -  { 6, OP_ACCEPT }, -  { 6, OP_COMMIT }, -  { 1, OP_FAIL }, -  { 4, OP_FAIL }, -  { 5, OP_PRUNE }, -  { 4, OP_SKIP  }, -  { 4, OP_THEN  } +  { 0, -1,        OP_MARK }, +  { 4, -1,        OP_MARK }, +  { 6, OP_ACCEPT, -1 }, +  { 6, OP_COMMIT, -1 }, +  { 1, OP_FAIL,   -1 }, +  { 4, OP_FAIL,   -1 }, +  { 5, OP_PRUNE,  OP_PRUNE_ARG }, +  { 4, OP_SKIP,   OP_SKIP_ARG  }, +  { 4, OP_THEN,   OP_THEN_ARG  }  };  static const int verbcount = sizeof(verbs)/sizeof(verbitem); @@ -254,6 +259,53 @@ static const int posix_class_maps[] = {    cbit_xdigit,-1,          0              /* xdigit */  }; +/* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class +substitutes must be in the order of the names, defined above, and there are +both positive and negative cases. NULL means no substitute. */ + +#ifdef SUPPORT_UCP +static const uschar *substitutes[] = { +  (uschar *)"\\P{Nd}",    /* \D */ +  (uschar *)"\\p{Nd}",    /* \d */ +  (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */ +  (uschar *)"\\p{Xsp}",   /* \s */ +  (uschar *)"\\P{Xwd}",   /* \W */ +  (uschar *)"\\p{Xwd}"    /* \w */ +}; + +static const uschar *posix_substitutes[] = { +  (uschar *)"\\p{L}",     /* alpha */ +  (uschar *)"\\p{Ll}",    /* lower */ +  (uschar *)"\\p{Lu}",    /* upper */ +  (uschar *)"\\p{Xan}",   /* alnum */ +  NULL,                   /* ascii */ +  (uschar *)"\\h",        /* blank */ +  NULL,                   /* cntrl */ +  (uschar *)"\\p{Nd}",    /* digit */ +  NULL,                   /* graph */ +  NULL,                   /* print */ +  NULL,                   /* punct */ +  (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */ +  (uschar *)"\\p{Xwd}",   /* word */ +  NULL,                   /* xdigit */ +  /* Negated cases */ +  (uschar *)"\\P{L}",     /* ^alpha */ +  (uschar *)"\\P{Ll}",    /* ^lower */ +  (uschar *)"\\P{Lu}",    /* ^upper */ +  (uschar *)"\\P{Xan}",   /* ^alnum */ +  NULL,                   /* ^ascii */ +  (uschar *)"\\H",        /* ^blank */ +  NULL,                   /* ^cntrl */ +  (uschar *)"\\P{Nd}",    /* ^digit */ +  NULL,                   /* ^graph */ +  NULL,                   /* ^print */ +  NULL,                   /* ^punct */ +  (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */ +  (uschar *)"\\P{Xwd}",   /* ^word */ +  NULL                    /* ^xdigit */ +}; +#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *)) +#endif  #define STRING(a)  # a  #define XSTRING(s) STRING(s) @@ -317,7 +369,7 @@ static const char error_texts[] =    /* 35 */    "invalid condition (?(0)\0"    "\\C not allowed in lookbehind assertion\0" -  "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0" +  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"    "number after (?C is > 255\0"    "closing ) for (?C expected\0"    /* 40 */ @@ -343,7 +395,7 @@ static const char error_texts[] =    "inconsistent NEWLINE options\0"    "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"    "a numbered reference must not be zero\0" -  "(*VERB) with an argument is not supported\0" +  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"    /* 60 */    "(*VERB) not recognized\0"    "number is too big\0" @@ -351,7 +403,10 @@ static const char error_texts[] =    "digit expected after (?+\0"    "] is an invalid data character in JavaScript compatibility mode\0"    /* 65 */ -  "different names for subpatterns of the same number are not allowed\0"; +  "different names for subpatterns of the same number are not allowed\0" +  "(*MARK) must have an argument\0" +  "this version of PCRE is not compiled with PCRE_UCP support\0" +  ;  /* Table to identify digits and hex digits. This is used when compiling  patterns. Note that the tables in chartables are dependent on the locale, and @@ -584,7 +639,6 @@ else      case CHAR_l:      case CHAR_L: -    case CHAR_N:      case CHAR_u:      case CHAR_U:      *errorcodeptr = ERR37; @@ -822,6 +876,19 @@ else      }    } +/* Perl supports \N{name} for character names, as well as plain \N for "not +newline". PCRE does not support \N{name}. */ + +if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET) +  *errorcodeptr = ERR37; + +/* If PCRE_UCP is set, we change the values for \d etc. */ + +if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w) +  c -= (ESC_DU - ESC_D); + +/* Set the pointer to the final character before returning. */ +  *ptrptr = ptr;  return c;  } @@ -1060,25 +1127,39 @@ dealing with. The very first call may not start with a parenthesis. */  if (ptr[0] == CHAR_LEFT_PARENTHESIS)    { -  if (ptr[1] == CHAR_QUESTION_MARK && -      ptr[2] == CHAR_VERTICAL_LINE) +  /* Handle specials such as (*SKIP) or (*UTF8) etc. */ + +  if (ptr[1] == CHAR_ASTERISK) ptr += 2; + +  /* Handle a normal, unnamed capturing parenthesis. */ + +  else if (ptr[1] != CHAR_QUESTION_MARK) +    { +    *count += 1; +    if (name == NULL && *count == lorn) return *count; +    ptr++; +    } + +  /* All cases now have (? at the start. Remember when we are in a group +  where the parenthesis numbers are duplicated. */ + +  else if (ptr[2] == CHAR_VERTICAL_LINE)      {      ptr += 3;      dup_parens = TRUE;      } -  /* Handle a normal, unnamed capturing parenthesis */ +  /* Handle comments; all characters are allowed until a ket is reached. */ -  else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK) +  else if (ptr[2] == CHAR_NUMBER_SIGN)      { -    *count += 1; -    if (name == NULL && *count == lorn) return *count; -    ptr++; +    for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break; +    goto FAIL_EXIT;      }    /* Handle a condition. If it is an assertion, just carry on so that it    is processed as normal. If not, skip to the closing parenthesis of the -  condition (there can't be any nested parens. */ +  condition (there can't be any nested parens). */    else if (ptr[2] == CHAR_LEFT_PARENTHESIS)      { @@ -1090,7 +1171,7 @@ if (ptr[0] == CHAR_LEFT_PARENTHESIS)        }      } -  /* We have either (? or (* and not a condition */ +  /* Start with (? but not a condition. */    else      { @@ -1212,8 +1293,7 @@ for (; *ptr != 0; ptr++)    else if (*ptr == CHAR_RIGHT_PARENTHESIS)      {      if (dup_parens && *count < hwm_count) *count = hwm_count; -    *ptrptr = ptr; -    return -1; +    goto FAIL_EXIT;      }    else if (*ptr == CHAR_VERTICAL_LINE && dup_parens) @@ -1613,7 +1693,8 @@ for (;;)    /* Otherwise, we can get the item's length from the table, except that for    repeated character types, we have to test for \p and \P, which have an extra -  two bytes of parameters. */ +  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we +  must add in its length. */    else      { @@ -1637,6 +1718,13 @@ for (;;)        case OP_TYPEPOSUPTO:        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        break; + +      case OP_MARK: +      case OP_PRUNE_ARG: +      case OP_SKIP_ARG: +      case OP_THEN_ARG: +      code += code[1]; +      break;        }      /* Add in the fixed length from the table */ @@ -1708,7 +1796,8 @@ for (;;)    /* Otherwise, we can get the item's length from the table, except that for    repeated character types, we have to test for \p and \P, which have an extra -  two bytes of parameters. */ +  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we +  must add in its length. */    else      { @@ -1732,6 +1821,13 @@ for (;;)        case OP_TYPEEXACT:        if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;        break; + +      case OP_MARK: +      case OP_PRUNE_ARG: +      case OP_SKIP_ARG: +      case OP_THEN_ARG: +      code += code[1]; +      break;        }      /* Add in the fixed length from the table */ @@ -2001,6 +2097,16 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE      break;  #endif +    /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument +    string. */ + +    case OP_MARK: +    case OP_PRUNE_ARG: +    case OP_SKIP_ARG: +    case OP_THEN_ARG: +    code += code[1]; +    break; +      /* None of the remaining opcodes are required to match a character. */      default: @@ -2221,8 +2327,8 @@ auto_callout(uschar *code, const uschar *ptr, compile_data *cd)  {  *code++ = OP_CALLOUT;  *code++ = 255; -PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */ -PUT(code, LINK_SIZE, 0);                /* Default length */ +PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */ +PUT(code, LINK_SIZE, 0);                       /* Default length */  return code + 2*LINK_SIZE;  } @@ -2247,7 +2353,7 @@ Returns:             nothing  static void  complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)  { -int length = ptr - cd->start_pattern - GET(previous_callout, 2); +int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));  PUT(previous_callout, 2 + LINK_SIZE, length);  } @@ -2297,6 +2403,69 @@ for (++c; c <= d; c++)  return TRUE;  } + + + +/************************************************* +*        Check a character and a property        * +*************************************************/ + +/* This function is called by check_auto_possessive() when a property item +is adjacent to a fixed character. + +Arguments: +  c            the character +  ptype        the property type +  pdata        the data for the type +  negated      TRUE if it's a negated property (\P or \p{^) + +Returns:       TRUE if auto-possessifying is OK +*/ + +static BOOL +check_char_prop(int c, int ptype, int pdata, BOOL negated) +{ +const ucd_record *prop = GET_UCD(c); +switch(ptype) +  { +  case PT_LAMP: +  return (prop->chartype == ucp_Lu || +          prop->chartype == ucp_Ll || +          prop->chartype == ucp_Lt) == negated; + +  case PT_GC: +  return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated; + +  case PT_PC: +  return (pdata == prop->chartype) == negated; + +  case PT_SC: +  return (pdata == prop->script) == negated; + +  /* These are specials */ + +  case PT_ALNUM: +  return (_pcre_ucp_gentype[prop->chartype] == ucp_L || +          _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated; + +  case PT_SPACE:    /* Perl space */ +  return (_pcre_ucp_gentype[prop->chartype] == ucp_Z || +          c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR) +          == negated; + +  case PT_PXSPACE:  /* POSIX space */ +  return (_pcre_ucp_gentype[prop->chartype] == ucp_Z || +          c == CHAR_HT || c == CHAR_NL || c == CHAR_VT || +          c == CHAR_FF || c == CHAR_CR) +          == negated; + +  case PT_WORD: +  return (_pcre_ucp_gentype[prop->chartype] == ucp_L || +          _pcre_ucp_gentype[prop->chartype] == ucp_N || +          c == CHAR_UNDERSCORE) == negated; +  } +return FALSE; +}  #endif  /* SUPPORT_UCP */ @@ -2310,10 +2479,8 @@ whether the next thing could possibly match the repeated item. If not, it makes  sense to automatically possessify the repeated item.  Arguments: -  op_code       the repeated op code -  this          data for this item, depends on the opcode +  previous      pointer to the repeated opcode    utf8          TRUE in UTF-8 mode -  utf8_char     used for utf8 character bytes, NULL if not relevant    ptr           next character in pattern    options       options bits    cd            contains pointers to tables etc. @@ -2322,10 +2489,11 @@ Returns:        TRUE if possessifying is wanted  */  static BOOL -check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, -  const uschar *ptr, int options, compile_data *cd) +check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr, +  int options, compile_data *cd)  { -int next; +int c, next; +int op_code = *previous++;  /* Skip whitespace and comments in extended mode */ @@ -2386,23 +2554,18 @@ if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||    strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)      return FALSE; -/* Now compare the next item with the previous opcode. If the previous is a -positive single character match, "item" either contains the character or, if -"item" is greater than 127 in utf8 mode, the character's bytes are in -utf8_char. */ - - -/* Handle cases when the next item is a character. */ +/* Now compare the next item with the previous opcode. First, handle cases when +the next item is a character. */  if (next >= 0) switch(op_code)    {    case OP_CHAR:  #ifdef SUPPORT_UTF8 -  if (utf8 && item > 127) { GETCHAR(item, utf8_char); } +  GETCHARTEST(c, previous);  #else -  (void)(utf8_char);  /* Keep compiler happy by referencing function argument */ +  c = *previous;  #endif -  return item != next; +  return c != next;    /* For CHARNC (caseless character) we must check the other case. If we have    Unicode property support, we can use it to test the other case of @@ -2410,9 +2573,11 @@ if (next >= 0) switch(op_code)    case OP_CHARNC:  #ifdef SUPPORT_UTF8 -  if (utf8 && item > 127) { GETCHAR(item, utf8_char); } +  GETCHARTEST(c, previous); +#else +  c = *previous;  #endif -  if (item == next) return FALSE; +  if (c == next) return FALSE;  #ifdef SUPPORT_UTF8    if (utf8)      { @@ -2423,16 +2588,16 @@ if (next >= 0) switch(op_code)  #else      othercase = NOTACHAR;  #endif -    return (unsigned int)item != othercase; +    return (unsigned int)c != othercase;      }    else  #endif  /* SUPPORT_UTF8 */ -  return (item != cd->fcc[next]);  /* Non-UTF-8 mode */ +  return (c != cd->fcc[next]);  /* Non-UTF-8 mode */ -  /* For OP_NOT, "item" must be a single-byte character. */ +  /* For OP_NOT, its data is always a single-byte character. */    case OP_NOT: -  if (item == next) return TRUE; +  if ((c = *previous) == next) return TRUE;    if ((options & PCRE_CASELESS) == 0) return FALSE;  #ifdef SUPPORT_UTF8    if (utf8) @@ -2444,11 +2609,14 @@ if (next >= 0) switch(op_code)  #else      othercase = NOTACHAR;  #endif -    return (unsigned int)item == othercase; +    return (unsigned int)c == othercase;      }    else  #endif  /* SUPPORT_UTF8 */ -  return (item == cd->fcc[next]);  /* Non-UTF-8 mode */ +  return (c == cd->fcc[next]);  /* Non-UTF-8 mode */ + +  /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. +  When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */    case OP_DIGIT:    return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; @@ -2491,11 +2659,12 @@ if (next >= 0) switch(op_code)      case 0x202f:      case 0x205f:      case 0x3000: -    return op_code != OP_HSPACE; +    return op_code == OP_NOT_HSPACE;      default: -    return op_code == OP_HSPACE; +    return op_code != OP_NOT_HSPACE;      } +  case OP_ANYNL:    case OP_VSPACE:    case OP_NOT_VSPACE:    switch(next) @@ -2507,48 +2676,62 @@ if (next >= 0) switch(op_code)      case 0x85:      case 0x2028:      case 0x2029: -    return op_code != OP_VSPACE; +    return op_code == OP_NOT_VSPACE;      default: -    return op_code == OP_VSPACE; +    return op_code != OP_NOT_VSPACE;      } +#ifdef SUPPORT_UCP +  case OP_PROP: +  return check_char_prop(next, previous[0], previous[1], FALSE); + +  case OP_NOTPROP: +  return check_char_prop(next, previous[0], previous[1], TRUE); +#endif +    default:    return FALSE;    } -/* Handle the case when the next item is \d, \s, etc. */ +/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP +is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are +generated only when PCRE_UCP is *not* set, that is, when only ASCII +characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are +replaced by OP_PROP codes when PCRE_UCP is set. */  switch(op_code)    {    case OP_CHAR:    case OP_CHARNC:  #ifdef SUPPORT_UTF8 -  if (utf8 && item > 127) { GETCHAR(item, utf8_char); } +  GETCHARTEST(c, previous); +#else +  c = *previous;  #endif    switch(-next)      {      case ESC_d: -    return item > 127 || (cd->ctypes[item] & ctype_digit) == 0; +    return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;      case ESC_D: -    return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; +    return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;      case ESC_s: -    return item > 127 || (cd->ctypes[item] & ctype_space) == 0; +    return c > 127 || (cd->ctypes[c] & ctype_space) == 0;      case ESC_S: -    return item <= 127 && (cd->ctypes[item] & ctype_space) != 0; +    return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;      case ESC_w: -    return item > 127 || (cd->ctypes[item] & ctype_word) == 0; +    return c > 127 || (cd->ctypes[c] & ctype_word) == 0;      case ESC_W: -    return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; +    return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;      case ESC_h:      case ESC_H: -    switch(item) +    switch(c)        {        case 0x09:        case 0x20: @@ -2576,7 +2759,7 @@ switch(op_code)      case ESC_v:      case ESC_V: -    switch(item) +    switch(c)        {        case 0x0a:        case 0x0b: @@ -2590,38 +2773,92 @@ switch(op_code)        return -next == ESC_v;        } +    /* When PCRE_UCP is set, these values get generated for \d etc. Find +    their substitutions and process them. The result will always be either +    -ESC_p or -ESC_P. Then fall through to process those values. */ + +#ifdef SUPPORT_UCP +    case ESC_du: +    case ESC_DU: +    case ESC_wu: +    case ESC_WU: +    case ESC_su: +    case ESC_SU: +      { +      int temperrorcode = 0; +      ptr = substitutes[-next - ESC_DU]; +      next = check_escape(&ptr, &temperrorcode, 0, options, FALSE); +      if (temperrorcode != 0) return FALSE; +      ptr++;    /* For compatibility */ +      } +    /* Fall through */ + +    case ESC_p: +    case ESC_P: +      { +      int ptype, pdata, errorcodeptr; +      BOOL negated; + +      ptr--;      /* Make ptr point at the p or P */ +      ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr); +      if (ptype < 0) return FALSE; +      ptr++;      /* Point past the final curly ket */ + +      /* If the property item is optional, we have to give up. (When generated +      from \d etc by PCRE_UCP, this test will have been applied much earlier, +      to the original \d etc. At this point, ptr will point to a zero byte. */ + +      if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK || +        strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) +          return FALSE; + +      /* Do the property check. */ + +      return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated); +      } +#endif +      default:      return FALSE;      } +  /* In principle, support for Unicode properties should be integrated here as +  well. It means re-organizing the above code so as to get hold of the property +  values before switching on the op-code. However, I wonder how many patterns +  combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set, +  these op-codes are never generated.) */ +    case OP_DIGIT:    return next == -ESC_D || next == -ESC_s || next == -ESC_W || -         next == -ESC_h || next == -ESC_v; +         next == -ESC_h || next == -ESC_v || next == -ESC_R;    case OP_NOT_DIGIT:    return next == -ESC_d;    case OP_WHITESPACE: -  return next == -ESC_S || next == -ESC_d || next == -ESC_w; +  return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;    case OP_NOT_WHITESPACE:    return next == -ESC_s || next == -ESC_h || next == -ESC_v;    case OP_HSPACE: -  return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w; +  return next == -ESC_S || next == -ESC_H || next == -ESC_d || +         next == -ESC_w || next == -ESC_v || next == -ESC_R;    case OP_NOT_HSPACE:    return next == -ESC_h;    /* Can't have \S in here because VT matches \S (Perl anomaly) */ +  case OP_ANYNL:    case OP_VSPACE:    return next == -ESC_V || next == -ESC_d || next == -ESC_w;    case OP_NOT_VSPACE: -  return next == -ESC_v; +  return next == -ESC_v || next == -ESC_R;    case OP_WORDCHAR: -  return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v; +  return next == -ESC_W || next == -ESC_s || next == -ESC_h || +         next == -ESC_v || next == -ESC_R;    case OP_NOT_WORDCHAR:    return next == -ESC_w || next == -ESC_d; @@ -2685,6 +2922,7 @@ BOOL inescq = FALSE;  BOOL groupsetfirstbyte = FALSE;  const uschar *ptr = *ptrptr;  const uschar *tempptr; +const uschar *nestptr = NULL;  uschar *previous = NULL;  uschar *previous_callout = NULL;  uschar *save_hwm = NULL; @@ -2755,6 +2993,16 @@ for (;; ptr++)    c = *ptr; +  /* If we are at the end of a nested substitution, revert to the outer level +  string. Nesting only happens one level deep. */ + +  if (c == 0 && nestptr != NULL) +    { +    ptr = nestptr; +    nestptr = NULL; +    c = *ptr; +    } +    /* If we are in the pre-compile phase, accumulate the length used for the    previous cycle of this loop. */ @@ -2785,7 +3033,7 @@ for (;; ptr++)        goto FAILED;        } -    *lengthptr += code - last_code; +    *lengthptr += (int)(code - last_code);      DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));      /* If "previous" is set and it is not at the start of the work space, move @@ -2903,7 +3151,7 @@ for (;; ptr++)          *errorcodeptr = ERR20;          goto FAILED;          } -      *lengthptr += code - last_code;   /* To include callout length */ +      *lengthptr += (int)(code - last_code);   /* To include callout length */        DPRINTF((">> end branch\n"));        }      return TRUE; @@ -3108,7 +3356,7 @@ for (;; ptr++)            ptr++;            } -        posix_class = check_posix_name(ptr, tempptr - ptr); +        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));          if (posix_class < 0)            {            *errorcodeptr = ERR30; @@ -3122,10 +3370,25 @@ for (;; ptr++)          if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)            posix_class = 0; -        /* We build the bit map for the POSIX class in a chunk of local store -        because we may be adding and subtracting from it, and we don't want to -        subtract bits that may be in the main map already. At the end we or the -        result into the bit map that is being built. */ +        /* When PCRE_UCP is set, some of the POSIX classes are converted to +        different escape sequences that use Unicode properties. */ + +#ifdef SUPPORT_UCP +        if ((options & PCRE_UCP) != 0) +          { +          int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0); +          if (posix_substitutes[pc] != NULL) +            { +            nestptr = tempptr + 1; +            ptr = posix_substitutes[pc] - 1; +            continue; +            } +          } +#endif +        /* In the non-UCP case, we build the bit map for the POSIX class in a +        chunk of local store because we may be adding and subtracting from it, +        and we don't want to subtract bits that may be in the main map already. +        At the end we or the result into the bit map that is being built. */          posix_class *= 3; @@ -3169,19 +3432,18 @@ for (;; ptr++)        /* Backslash may introduce a single character, or it may introduce one        of the specials, which just set a flag. The sequence \b is a special -      case. Inside a class (and only there) it is treated as backspace. -      Elsewhere it marks a word boundary. Other escapes have preset maps ready -      to 'or' into the one we are building. We assume they have more than one -      character in them, so set class_charcount bigger than one. */ +      case. Inside a class (and only there) it is treated as backspace. We +      assume that other escapes have more than one character in them, so set +      class_charcount bigger than one. Unrecognized escapes fall through and +      are either treated as literal characters (by default), or are faulted if +      PCRE_EXTRA is set. */        if (c == CHAR_BACKSLASH)          {          c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);          if (*errorcodeptr != 0) goto FAILED; -        if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */ -        else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */ -        else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */ +        if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */          else if (-c == ESC_Q)            /* Handle start of quoted string */            {            if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) @@ -3198,10 +3460,20 @@ for (;; ptr++)            register const uschar *cbits = cd->cbits;            class_charcount += 2;     /* Greater than 1 is what matters */ -          /* Save time by not doing this in the pre-compile phase. */ - -          if (lengthptr == NULL) switch (-c) +          switch (-c)              { +#ifdef SUPPORT_UCP +            case ESC_du:     /* These are the values given for \d etc */ +            case ESC_DU:     /* when PCRE_UCP is set. We replace the */ +            case ESC_wu:     /* escape sequence with an appropriate \p */ +            case ESC_WU:     /* or \P to test Unicode properties instead */ +            case ESC_su:     /* of the default ASCII testing. */ +            case ESC_SU: +            nestptr = ptr; +            ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */ +            class_charcount -= 2;                /* Undo! */ +            continue; +#endif              case ESC_d:              for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];              continue; @@ -3231,20 +3503,7 @@ for (;; ptr++)              classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */              continue; -            default:    /* Not recognized; fall through */ -            break;      /* Need "default" setting to stop compiler warning. */ -            } - -          /* In the pre-compile phase, just do the recognition. */ - -          else if (c == -ESC_d || c == -ESC_D || c == -ESC_w || -                   c == -ESC_W || c == -ESC_s || c == -ESC_S) continue; - -          /* We need to deal with \H, \h, \V, and \v in both phases because -          they use extra memory. */ - -          if (-c == ESC_h) -            { +            case ESC_h:              SETBIT(classbits, 0x09); /* VT */              SETBIT(classbits, 0x20); /* SPACE */              SETBIT(classbits, 0xa0); /* NSBP */ @@ -3268,10 +3527,8 @@ for (;; ptr++)                }  #endif              continue; -            } -          if (-c == ESC_H) -            { +            case ESC_H:              for (c = 0; c < 32; c++)                {                int x = 0xff; @@ -3313,10 +3570,8 @@ for (;; ptr++)                }  #endif              continue; -            } -          if (-c == ESC_v) -            { +            case ESC_v:              SETBIT(classbits, 0x0a); /* LF */              SETBIT(classbits, 0x0b); /* VT */              SETBIT(classbits, 0x0c); /* FF */ @@ -3332,10 +3587,8 @@ for (;; ptr++)                }  #endif              continue; -            } -          if (-c == ESC_V) -            { +            case ESC_V:              for (c = 0; c < 32; c++)                {                int x = 0xff; @@ -3365,38 +3618,38 @@ for (;; ptr++)                }  #endif              continue; -            } - -          /* We need to deal with \P and \p in both phases. */  #ifdef SUPPORT_UCP -          if (-c == ESC_p || -c == ESC_P) -            { -            BOOL negated; -            int pdata; -            int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); -            if (ptype < 0) goto FAILED; -            class_utf8 = TRUE; -            *class_utf8data++ = ((-c == ESC_p) != negated)? -              XCL_PROP : XCL_NOTPROP; -            *class_utf8data++ = ptype; -            *class_utf8data++ = pdata; -            class_charcount -= 2;   /* Not a < 256 character */ -            continue; -            } +            case ESC_p: +            case ESC_P: +              { +              BOOL negated; +              int pdata; +              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); +              if (ptype < 0) goto FAILED; +              class_utf8 = TRUE; +              *class_utf8data++ = ((-c == ESC_p) != negated)? +                XCL_PROP : XCL_NOTPROP; +              *class_utf8data++ = ptype; +              *class_utf8data++ = pdata; +              class_charcount -= 2;   /* Not a < 256 character */ +              continue; +              }  #endif -          /* Unrecognized escapes are faulted if PCRE is running in its -          strict mode. By default, for compatibility with Perl, they are -          treated as literals. */ +            /* Unrecognized escapes are faulted if PCRE is running in its +            strict mode. By default, for compatibility with Perl, they are +            treated as literals. */ -          if ((options & PCRE_EXTRA) != 0) -            { -            *errorcodeptr = ERR7; -            goto FAILED; +            default: +            if ((options & PCRE_EXTRA) != 0) +              { +              *errorcodeptr = ERR7; +              goto FAILED; +              } +            class_charcount -= 2;  /* Undo the default count from above */ +            c = *ptr;              /* Get the final character and fall through */ +            break;              } - -          class_charcount -= 2;  /* Undo the default count from above */ -          c = *ptr;              /* Get the final character and fall through */            }          /* Fall through if we have a single character (c >= 0). This may be @@ -3466,14 +3719,11 @@ for (;; ptr++)            d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);            if (*errorcodeptr != 0) goto FAILED; -          /* \b is backspace; \X is literal X; \R is literal R; any other -          special means the '-' was literal */ +          /* \b is backspace; any other special means the '-' was literal */            if (d < 0)              { -            if (d == -ESC_b) d = CHAR_BS; -            else if (d == -ESC_X) d = CHAR_X; -            else if (d == -ESC_R) d = CHAR_R; else +            if (d == -ESC_b) d = CHAR_BS; else                {                ptr = oldptr;                goto LONE_SINGLE_CHARACTER;  /* A few lines below */ @@ -3639,35 +3889,23 @@ for (;; ptr++)          }        } -    /* Loop until ']' reached. This "while" is the end of the "do" above. */ +    /* Loop until ']' reached. This "while" is the end of the "do" far above. +    If we are at the end of an internal nested string, revert to the outer +    string. */ + +    while (((c = *(++ptr)) != 0 || +           (nestptr != NULL && +             (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) && +           (c != CHAR_RIGHT_SQUARE_BRACKET || inescq)); -    while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq)); +    /* Check for missing terminating ']' */ -    if (c == 0)                          /* Missing terminating ']' */ +    if (c == 0)        {        *errorcodeptr = ERR6;        goto FAILED;        } - -/* This code has been disabled because it would mean that \s counts as -an explicit \r or \n reference, and that's not really what is wanted. Now -we set the flag only if there is a literal "\r" or "\n" in the class. */ - -#if 0 -    /* Remember whether \r or \n are in this class */ - -    if (negate_class) -      { -      if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF; -      } -    else -      { -      if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF; -      } -#endif - -      /* If class_charcount is 1, we saw precisely one character whose value is      less than 256. As long as there were no characters >= 128 and there was no      use of \p or \P, in other words, no use of any XCLASS features, we can @@ -3731,13 +3969,14 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */      /* If there are characters with values > 255, we have to compile an      extended class, with its own opcode, unless there was a negated special -    such as \S in the class, because in that case all characters > 255 are in -    the class, so any that were explicitly given as well can be ignored. If -    (when there are explicit characters > 255 that must be listed) there are no -    characters < 256, we can omit the bitmap in the actual compiled code. */ +    such as \S in the class, and PCRE_UCP is not set, because in that case all +    characters > 255 are in the class, so any that were explicitly given as +    well can be ignored. If (when there are explicit characters > 255 that must +    be listed) there are no characters < 256, we can omit the bitmap in the +    actual compiled code. */  #ifdef SUPPORT_UTF8 -    if (class_utf8 && !should_flip_negation) +    if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))        {        *class_utf8data++ = XCL_END;    /* Marks the end of extra data */        *code++ = OP_XCLASS; @@ -3763,10 +4002,11 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */        }  #endif -    /* If there are no characters > 255, set the opcode to OP_CLASS or -    OP_NCLASS, depending on whether the whole class was negated and whether -    there were negative specials such as \S in the class. Then copy the 32-byte -    map into the code vector, negating it if necessary. */ +    /* If there are no characters > 255, or they are all to be included or +    excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the +    whole class was negated and whether there were negative specials such as \S +    (non-UCP) in the class. Then copy the 32-byte map into the code vector, +    negating it if necessary. */      *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;      if (negate_class) @@ -3890,8 +4130,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */        if (!possessive_quantifier &&            repeat_max < 0 && -          check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1, -            options, cd)) +          check_auto_possessive(previous, utf8, ptr + 1, options, cd))          {          repeat_type = 0;    /* Force greedy */          possessive_quantifier = TRUE; @@ -3912,7 +4151,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */        c = previous[1];        if (!possessive_quantifier &&            repeat_max < 0 && -          check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd)) +          check_auto_possessive(previous, utf8, ptr + 1, options, cd))          {          repeat_type = 0;    /* Force greedy */          possessive_quantifier = TRUE; @@ -3936,7 +4175,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */        if (!possessive_quantifier &&            repeat_max < 0 && -          check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd)) +          check_auto_possessive(previous, utf8, ptr + 1, options, cd))          {          repeat_type = 0;    /* Force greedy */          possessive_quantifier = TRUE; @@ -4146,7 +4385,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */        {        register int i;        int ketoffset = 0; -      int len = code - previous; +      int len = (int)(code - previous);        uschar *bralink = NULL;        /* Repeating a DEFINE group is pointless */ @@ -4167,7 +4406,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */          {          register uschar *ket = previous;          do ket += GET(ket, 1); while (*ket != OP_KET); -        ketoffset = code - ket; +        ketoffset = (int)(code - ket);          }        /* The case of a zero minimum is special because of the need to stick @@ -4235,7 +4474,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */            /* We chain together the bracket offset fields that have to be            filled in later when the ends of the brackets are reached. */ -          offset = (bralink == NULL)? 0 : previous - bralink; +          offset = (bralink == NULL)? 0 : (int)(previous - bralink);            bralink = previous;            PUTINC(previous, 0, offset);            } @@ -4344,7 +4583,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */              {              int offset;              *code++ = OP_BRA; -            offset = (bralink == NULL)? 0 : code - bralink; +            offset = (bralink == NULL)? 0 : (int)(code - bralink);              bralink = code;              PUTINC(code, 0, offset);              } @@ -4365,7 +4604,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */          while (bralink != NULL)            {            int oldlinkoffset; -          int offset = code - bralink + 1; +          int offset = (int)(code - bralink + 1);            uschar *bra = code - offset;            oldlinkoffset = GET(bra, 1);            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; @@ -4453,7 +4692,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */  #endif          } -      len = code - tempcode; +      len = (int)(code - tempcode);        if (len > 0) switch (*tempcode)          {          case OP_STAR:  *tempcode = OP_POSSTAR; break; @@ -4512,24 +4751,34 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */      /* First deal with various "verbs" that can be introduced by '*'. */ -    if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0) +    if (*(++ptr) == CHAR_ASTERISK && +         ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))        {        int i, namelen; +      int arglen = 0;        const char *vn = verbnames; -      const uschar *name = ++ptr; +      const uschar *name = ptr + 1; +      const uschar *arg = NULL;        previous = NULL;        while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {}; +      namelen = (int)(ptr - name); +        if (*ptr == CHAR_COLON)          { -        *errorcodeptr = ERR59;   /* Not supported */ -        goto FAILED; +        arg = ++ptr; +        while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0 +          || *ptr == '_') ptr++; +        arglen = (int)(ptr - arg);          } +        if (*ptr != CHAR_RIGHT_PARENTHESIS)          {          *errorcodeptr = ERR60;          goto FAILED;          } -      namelen = ptr - name; + +      /* Scan the table of verb names */ +        for (i = 0; i < verbcount; i++)          {          if (namelen == verbs[i].len && @@ -4547,13 +4796,41 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */                PUT2INC(code, 0, oc->number);                }              } -          *code++ = verbs[i].op; -          break; + +          /* Handle the cases with/without an argument */ + +          if (arglen == 0) +            { +            if (verbs[i].op < 0)   /* Argument is mandatory */ +              { +              *errorcodeptr = ERR66; +              goto FAILED; +              } +            *code++ = verbs[i].op; +            } + +          else +            { +            if (verbs[i].op_arg < 0)   /* Argument is forbidden */ +              { +              *errorcodeptr = ERR59; +              goto FAILED; +              } +            *code++ = verbs[i].op_arg; +            *code++ = arglen; +            memcpy(code, arg, arglen); +            code += arglen; +            *code++ = 0; +            } + +          break;  /* Found verb, exit loop */            } +          vn += verbs[i].len + 1;          } -      if (i < verbcount) continue; -      *errorcodeptr = ERR60; + +      if (i < verbcount) continue;    /* Successfully handled a verb */ +      *errorcodeptr = ERR60;          /* Verb not recognized */        goto FAILED;        } @@ -4672,7 +4949,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */                recno * 10 + *ptr - CHAR_0 : -1;            ptr++;            } -        namelen = ptr - name; +        namelen = (int)(ptr - name);          if ((terminator > 0 && *ptr++ != terminator) ||              *ptr++ != CHAR_RIGHT_PARENTHESIS) @@ -4868,8 +5145,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */              goto FAILED;              }            *code++ = n; -          PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */ -          PUT(code, LINK_SIZE, 0);                    /* Default length */ +          PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */ +          PUT(code, LINK_SIZE, 0);                          /* Default length */            code += 2 * LINK_SIZE;            }          previous = NULL; @@ -4902,7 +5179,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */            name = ++ptr;            while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; -          namelen = ptr - name; +          namelen = (int)(ptr - name);            /* In the pre-compile phase, just do a syntax check. */ @@ -5032,7 +5309,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */          NAMED_REF_OR_RECURSE:          name = ++ptr;          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; -        namelen = ptr - name; +        namelen = (int)(ptr - name);          /* In the pre-compile phase, do a syntax check and set a dummy          reference number. */ @@ -5201,7 +5478,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */                of the group. */                called = cd->start_code + recno; -              PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code); +              PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));                }              /* If not a forward reference, and the subpattern is still open, @@ -5225,7 +5502,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */            code += 1 + LINK_SIZE;            *code = OP_RECURSE; -          PUT(code, 1, called - cd->start_code); +          PUT(code, 1, (int)(called - cd->start_code));            code += 1 + LINK_SIZE;            *code = OP_KET; @@ -5336,8 +5613,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */          }     /* End of switch for character following (? */        }       /* End of (? handling */ -    /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set, -    all unadorned brackets become non-capturing and behave like (?:...) +    /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE +    is set, all unadorned brackets become non-capturing and behave like (?:...)      brackets. */      else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) @@ -5529,11 +5806,12 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */      /* ===================================================================*/      /* Handle metasequences introduced by \. For ones like \d, the ESC_ values -    are arranged to be the negation of the corresponding OP_values. For the -    back references, the values are ESC_REF plus the reference number. Only -    back references and those types that consume a character may be repeated. -    We can test for values between ESC_b and ESC_Z for the latter; this may -    have to change if any new ones are ever created. */ +    are arranged to be the negation of the corresponding OP_values in the +    default case when PCRE_UCP is not set. For the back references, the values +    are ESC_REF plus the reference number. Only back references and those types +    that consume a character may be repeated. We can test for values between +    ESC_b and ESC_Z for the latter; this may have to change if any new ones are +    ever created. */      case CHAR_BACKSLASH:      tempptr = ptr; @@ -5693,12 +5971,24 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */  #endif        /* For the rest (including \X when Unicode properties are supported), we -      can obtain the OP value by negating the escape value. */ +      can obtain the OP value by negating the escape value in the default +      situation when PCRE_UCP is not set. When it *is* set, we substitute +      Unicode property tests. */        else          { -        previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; -        *code++ = -c; +#ifdef SUPPORT_UCP +        if (-c >= ESC_DU && -c <= ESC_wu) +          { +          nestptr = ptr + 1;                   /* Where to resume */ +          ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */ +          } +        else +#endif +          { +          previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; +          *code++ = -c; +          }          }        continue;        } @@ -6030,7 +6320,7 @@ for (;;)      {      if (lengthptr == NULL)        { -      int branch_length = code - last_branch; +      int branch_length = (int)(code - last_branch);        do          {          int prev_length = GET(last_branch, 1); @@ -6044,7 +6334,7 @@ for (;;)      /* Fill in the ket */      *code = OP_KET; -    PUT(code, 1, code - start_bracket); +    PUT(code, 1, (int)(code - start_bracket));      code += 1 + LINK_SIZE;      /* If it was a capturing subpattern, check to see if it contained any @@ -6059,9 +6349,9 @@ for (;;)            code - start_bracket);          *start_bracket = OP_ONCE;          code += 1 + LINK_SIZE; -        PUT(start_bracket, 1, code - start_bracket); +        PUT(start_bracket, 1, (int)(code - start_bracket));          *code = OP_KET; -        PUT(code, 1, code - start_bracket); +        PUT(code, 1, (int)(code - start_bracket));          code += 1 + LINK_SIZE;          length += 2 + 2*LINK_SIZE;          } @@ -6116,7 +6406,7 @@ for (;;)    else      {      *code = OP_ALT; -    PUT(code, 1, code - last_branch); +    PUT(code, 1, (int)(code - last_branch));      bc.current_branch = last_branch = code;      code += 1 + LINK_SIZE;      } @@ -6435,7 +6725,7 @@ int length = 1;  /* For final END opcode */  int firstbyte, reqbyte, newline;  int errorcode = 0;  int skipatstart = 0; -BOOL utf8 = (options & PCRE_UTF8) != 0; +BOOL utf8;  size_t size;  uschar *code;  const uschar *codestart; @@ -6505,6 +6795,8 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&    if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)      { skipatstart += 7; options |= PCRE_UTF8; continue; } +  else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0) +    { skipatstart += 6; options |= PCRE_UCP; continue; }    if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)      { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } @@ -6529,6 +6821,8 @@ while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&    else break;    } +utf8 = (options & PCRE_UTF8) != 0; +  /* Can't support UTF8 unless PCRE has been compiled to include the code. */  #ifdef SUPPORT_UTF8 @@ -6546,6 +6840,16 @@ if (utf8)    }  #endif +/* Can't support UCP unless PCRE has been compiled to include the code. */ + +#ifndef SUPPORT_UCP +if ((options & PCRE_UCP) != 0) +  { +  errorcode = ERR67; +  goto PCRE_EARLY_ERROR_RETURN; +  } +#endif +  /* Check validity of \R options. */  switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) @@ -6674,7 +6978,7 @@ regex compiled on a system with 4-byte pointers is run on another with 8-byte  pointers. */  re->magic_number = MAGIC_NUMBER; -re->size = size; +re->size = (int)size;  re->options = cd->external_options;  re->flags = cd->external_flags;  re->dummy1 = 0; @@ -6745,7 +7049,7 @@ while (errorcode == 0 && cd->hwm > cworkspace)    recno = GET(codestart, offset);    groupptr = _pcre_find_bracket(codestart, utf8, recno);    if (groupptr == NULL) errorcode = ERR53; -    else PUT(((uschar *)codestart), offset, groupptr - codestart); +    else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));    }  /* Give an error if there's back reference to a non-existent capturing @@ -6800,7 +7104,7 @@ if (errorcode != 0)    {    (pcre_free)(re);    PCRE_EARLY_ERROR_RETURN: -  *erroroffset = ptr - (const uschar *)pattern; +  *erroroffset = (int)(ptr - (const uschar *)pattern);    PCRE_EARLY_ERROR_RETURN2:    *errorptr = find_error_text(errorcode);    if (errorcodeptr != NULL) *errorcodeptr = errorcode; | 
