diff options
Diffstat (limited to 'ext/pcre/pcrelib/pcre_study.c')
| -rw-r--r-- | ext/pcre/pcrelib/pcre_study.c | 274 | 
1 files changed, 208 insertions, 66 deletions
| diff --git a/ext/pcre/pcrelib/pcre_study.c b/ext/pcre/pcrelib/pcre_study.c index 2653624e0a..3ac7e81496 100644 --- a/ext/pcre/pcrelib/pcre_study.c +++ b/ext/pcre/pcrelib/pcre_study.c @@ -46,6 +46,7 @@ supporting functions. */  #include "pcre_internal.h" +#define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))  /* Returns from set_start_bits() */ @@ -411,6 +412,15 @@ for (;;)  #endif      break; +    /* Skip these, but we need to add in the name length. */ + +    case OP_MARK: +    case OP_PRUNE_ARG: +    case OP_SKIP_ARG: +    case OP_THEN_ARG: +    cc += _pcre_OP_lengths[op] + cc[1]; +    break; +      /* For the record, these are the opcodes that are matched by "default":      OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,      OP_THEN. */ @@ -429,25 +439,121 @@ for (;;)  *      Set a bit and maybe its alternate case    *  *************************************************/ -/* Given a character, set its bit in the table, and also the bit for the other -version of a letter if we are caseless. +/* Given a character, set its first byte's bit in the table, and also the +corresponding bit for the other version of a letter if we are caseless. In +UTF-8 mode, for characters greater than 127, we can only do the caseless thing +when Unicode property support is available.  
Arguments:    start_bits    points to the bit map -  c             is the character +  p             points to the character    caseless      the caseless flag    cd            the block with char table pointers +  utf8          TRUE for UTF-8 mode + +Returns:        pointer after the character +*/ + +static const uschar * +set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless, +  compile_data *cd, BOOL utf8) +{ +unsigned int c = *p; + +SET_BIT(c); + +#ifdef SUPPORT_UTF8 +if (utf8 && c > 127) +  { +  GETCHARINC(c, p); +#ifdef SUPPORT_UCP +  if (caseless) +    { +    uschar buff[8]; +    c = UCD_OTHERCASE(c); +    (void)_pcre_ord2utf8(c, buff); +    SET_BIT(buff[0]); +    } +#endif +  return p; +  } +#endif + +/* Not UTF-8 mode, or character is less than 128. */ + +if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]); +return p + 1; +} + + -Returns:        nothing +/************************************************* +*     Set bits for a positive character type     * +*************************************************/ + +/* This function sets starting bits for a character type. In UTF-8 mode, we can +only do a direct setting for bytes less than 128, as otherwise there can be +confusion with bytes in the middle of UTF-8 characters. In a "traditional" +environment, the tables will only recognize ASCII characters anyway, but in at +least one Windows environment, some higher bytes bits were set in the tables. +So we deal with that case by considering the UTF-8 encoding. 
+ +Arguments: +  start_bits     the starting bitmap +  cbit_type      offset in cd->cbits of the character type wanted +  table_limit    32 for non-UTF-8; 16 for UTF-8 +  cd             the block with char table pointers + +Returns:         nothing  */  static void -set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless, +set_type_bits(uschar *start_bits, int cbit_type, int table_limit,    compile_data *cd)  { -start_bits[c/8] |= (1 << (c&7)); -if (caseless && (cd->ctypes[c] & ctype_letter) != 0) -  start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); +register int c; +for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type]; +if (table_limit == 32) return; +for (c = 128; c < 256; c++) +  { +  if ((cd->cbits[c/8 + cbit_type] & (1 << (c&7))) != 0)  /* this type's map */ +    { +    uschar buff[8]; +    (void)_pcre_ord2utf8(c, buff); +    SET_BIT(buff[0]); +    } +  } +} + + +/************************************************* +*     Set bits for a negative character type     * +*************************************************/ + +/* This function sets starting bits for a negative character type such as \D. +In UTF-8 mode, we can only do a direct setting for bytes less than 128, as +otherwise there can be confusion with bytes in the middle of UTF-8 characters. +Unlike in the positive case, where we can set appropriate starting bits for +specific high-valued UTF-8 characters, in this case we have to set the bits for +all high-valued characters. The lowest is 0xc2, but we overkill by starting at +0xc0 (192) for simplicity. 
+ +Arguments: +  start_bits     the starting bitmap +  cbit_type      offset in cd->cbits of the character type being negated +  table_limit    32 for non-UTF-8; 16 for UTF-8 +  cd             the block with char table pointers + +Returns:         nothing +*/ + +static void +set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit, +  compile_data *cd) +{ +register int c; +for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type]; +if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;  } @@ -482,6 +588,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,  {  register int c;  int yield = SSB_DONE; +int table_limit = utf8? 16:32;  #if 0  /* ========================================================================= */ @@ -605,12 +712,7 @@ do        case OP_QUERY:        case OP_MINQUERY:        case OP_POSQUERY: -      set_table_bit(start_bits, tcode[1], caseless, cd); -      tcode += 2; -#ifdef SUPPORT_UTF8 -      if (utf8 && tcode[-1] >= 0xc0) -        tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; -#endif +      tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);        break;        /* Single-char upto sets the bit and tries the next */ @@ -618,12 +720,7 @@ do        case OP_UPTO:        case OP_MINUPTO:        case OP_POSUPTO: -      set_table_bit(start_bits, tcode[3], caseless, cd); -      tcode += 4; -#ifdef SUPPORT_UTF8 -      if (utf8 && tcode[-1] >= 0xc0) -        tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; -#endif +      tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);        break;        /* At least one single char sets the bit and stops */ @@ -636,59 +733,86 @@ do        case OP_PLUS:        case OP_MINPLUS:        case OP_POSPLUS: -      set_table_bit(start_bits, tcode[1], caseless, cd); +      (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);        try_next = FALSE;        break; -      /* Single character type sets the bits and stops */ +      /* Special spacing and 
line-terminating items. These recognize specific +      lists of characters. The difference between VSPACE and ANYNL is that the +      latter can match the two-character CRLF sequence, but that is not +      relevant for finding the first character, so their code here is +      identical. */ + +      case OP_HSPACE: +      SET_BIT(0x09); +      SET_BIT(0x20); +      if (utf8) +        { +        SET_BIT(0xC2);  /* For U+00A0 */ +        SET_BIT(0xE1);  /* For U+1680, U+180E */ +        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */ +        SET_BIT(0xE3);  /* For U+3000 */ +        } +      else SET_BIT(0xA0); +      try_next = FALSE; +      break; + +      case OP_ANYNL: +      case OP_VSPACE: +      SET_BIT(0x0A); +      SET_BIT(0x0B); +      SET_BIT(0x0C); +      SET_BIT(0x0D); +      if (utf8) +        { +        SET_BIT(0xC2);  /* For U+0085 */ +        SET_BIT(0xE2);  /* For U+2028, U+2029 */ +        } +      else SET_BIT(0x85); +      try_next = FALSE; +      break; + +      /* Single character types set the bits and stop. Note that if PCRE_UCP +      is set, we do not see these op codes because \d etc are converted to +      properties. Therefore, these apply in the case when only characters less +      than 256 are recognized to match the types. */        case OP_NOT_DIGIT: -      for (c = 0; c < 32; c++) -        start_bits[c] |= ~cd->cbits[c+cbit_digit]; +      set_nottype_bits(start_bits, cbit_digit, table_limit, cd);        try_next = FALSE;        break;        case OP_DIGIT: -      for (c = 0; c < 32; c++) -        start_bits[c] |= cd->cbits[c+cbit_digit]; +      set_type_bits(start_bits, cbit_digit, table_limit, cd);        try_next = FALSE;        break;        /* The cbit_space table has vertical tab as whitespace; we have to -      discard it. */ +      ensure it is set as not whitespace. 
*/        case OP_NOT_WHITESPACE: -      for (c = 0; c < 32; c++) -        { -        int d = cd->cbits[c+cbit_space]; -        if (c == 1) d &= ~0x08; -        start_bits[c] |= ~d; -        } +      set_nottype_bits(start_bits, cbit_space, table_limit, cd); +      start_bits[1] |= 0x08;        try_next = FALSE;        break;        /* The cbit_space table has vertical tab as whitespace; we have to -      discard it. */ +      not set it from the table. */        case OP_WHITESPACE: -      for (c = 0; c < 32; c++) -        { -        int d = cd->cbits[c+cbit_space]; -        if (c == 1) d &= ~0x08; -        start_bits[c] |= d; -        } +      c = start_bits[1];    /* Save in case it was already set */ +      set_type_bits(start_bits, cbit_space, table_limit, cd); +      start_bits[1] = (start_bits[1] & ~0x08) | c;        try_next = FALSE;        break;        case OP_NOT_WORDCHAR: -      for (c = 0; c < 32; c++) -        start_bits[c] |= ~cd->cbits[c+cbit_word]; +      set_nottype_bits(start_bits, cbit_word, table_limit, cd);        try_next = FALSE;        break;        case OP_WORDCHAR: -      for (c = 0; c < 32; c++) -        start_bits[c] |= cd->cbits[c+cbit_word]; +      set_type_bits(start_bits, cbit_word, table_limit, cd);        try_next = FALSE;        break; @@ -697,6 +821,7 @@ do        case OP_TYPEPLUS:        case OP_TYPEMINPLUS: +      case OP_TYPEPOSPLUS:        tcode++;        break; @@ -720,52 +845,69 @@ do        case OP_TYPEPOSQUERY:        switch(tcode[1])          { +        default:          case OP_ANY:          case OP_ALLANY:          return SSB_FAIL; +        case OP_HSPACE: +        SET_BIT(0x09); +        SET_BIT(0x20); +        if (utf8) +          { +          SET_BIT(0xC2);  /* For U+00A0 */ +          SET_BIT(0xE1);  /* For U+1680, U+180E */ +          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */ +          SET_BIT(0xE3);  /* For U+3000 */ +          } +        else SET_BIT(0xA0); +        break; + +        case 
OP_ANYNL: +        case OP_VSPACE: +        SET_BIT(0x0A); +        SET_BIT(0x0B); +        SET_BIT(0x0C); +        SET_BIT(0x0D); +        if (utf8) +          { +          SET_BIT(0xC2);  /* For U+0085 */ +          SET_BIT(0xE2);  /* For U+2028, U+2029 */ +          } +        else SET_BIT(0x85); +        break; +          case OP_NOT_DIGIT: -        for (c = 0; c < 32; c++) -          start_bits[c] |= ~cd->cbits[c+cbit_digit]; +        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);          break;          case OP_DIGIT: -        for (c = 0; c < 32; c++) -          start_bits[c] |= cd->cbits[c+cbit_digit]; +        set_type_bits(start_bits, cbit_digit, table_limit, cd);          break;          /* The cbit_space table has vertical tab as whitespace; we have to -        discard it. */ +        ensure it gets set as not whitespace. */          case OP_NOT_WHITESPACE: -        for (c = 0; c < 32; c++) -          { -          int d = cd->cbits[c+cbit_space]; -          if (c == 1) d &= ~0x08; -          start_bits[c] |= ~d; -          } +        set_nottype_bits(start_bits, cbit_space, table_limit, cd); +        start_bits[1] |= 0x08;          break;          /* The cbit_space table has vertical tab as whitespace; we have to -        discard it. */ +        avoid setting it. 
*/          case OP_WHITESPACE: -        for (c = 0; c < 32; c++) -          { -          int d = cd->cbits[c+cbit_space]; -          if (c == 1) d &= ~0x08; -          start_bits[c] |= d; -          } +        c = start_bits[1];    /* Save in case it was already set */ +        set_type_bits(start_bits, cbit_space, table_limit, cd); +        start_bits[1] = (start_bits[1] & ~0x08) | c;          break;          case OP_NOT_WORDCHAR: -        for (c = 0; c < 32; c++) -          start_bits[c] |= ~cd->cbits[c+cbit_word]; +        set_nottype_bits(start_bits, cbit_word, table_limit, cd);          break;          case OP_WORDCHAR: -        for (c = 0; c < 32; c++) -          start_bits[c] |= cd->cbits[c+cbit_word]; +        set_type_bits(start_bits, cbit_word, table_limit, cd);          break;          } | 
