diff options
author | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-02-22 10:23:56 +0000 |
---|---|---|
committer | zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-02-22 10:23:56 +0000 |
commit | 5f83aeee3273dc59fa8878eb3e830e64213fcbd2 (patch) | |
tree | 324e6f3bfd1df70b681706ddf23bda00d3338111 /pcre_compile.c | |
parent | e65ea5ac9477071cc408ce00a2d3e80f791c8570 (diff) | |
download | pcre-5f83aeee3273dc59fa8878eb3e830e64213fcbd2.tar.gz |
OP_NOT now supports any UTF character
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@924 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'pcre_compile.c')
-rw-r--r-- | pcre_compile.c | 122 |
1 files changed, 69 insertions, 53 deletions
diff --git a/pcre_compile.c b/pcre_compile.c index 8070f51..2f76cea 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -2225,32 +2225,60 @@ for (;;) { case OP_CHAR: case OP_CHARI: + case OP_NOT: + case OP_NOTI: case OP_EXACT: case OP_EXACTI: + case OP_NOTEXACT: + case OP_NOTEXACTI: case OP_UPTO: case OP_UPTOI: + case OP_NOTUPTO: + case OP_NOTUPTOI: case OP_MINUPTO: case OP_MINUPTOI: + case OP_NOTMINUPTO: + case OP_NOTMINUPTOI: case OP_POSUPTO: case OP_POSUPTOI: + case OP_NOTPOSUPTO: + case OP_NOTPOSUPTOI: case OP_STAR: case OP_STARI: + case OP_NOTSTAR: + case OP_NOTSTARI: case OP_MINSTAR: case OP_MINSTARI: + case OP_NOTMINSTAR: + case OP_NOTMINSTARI: case OP_POSSTAR: case OP_POSSTARI: + case OP_NOTPOSSTAR: + case OP_NOTPOSSTARI: case OP_PLUS: case OP_PLUSI: + case OP_NOTPLUS: + case OP_NOTPLUSI: case OP_MINPLUS: case OP_MINPLUSI: + case OP_NOTMINPLUS: + case OP_NOTMINPLUSI: case OP_POSPLUS: case OP_POSPLUSI: + case OP_NOTPOSPLUS: + case OP_NOTPOSPLUSI: case OP_QUERY: case OP_QUERYI: + case OP_NOTQUERY: + case OP_NOTQUERYI: case OP_MINQUERY: case OP_MINQUERYI: + case OP_NOTMINQUERY: + case OP_NOTMINQUERYI: case OP_POSQUERY: case OP_POSQUERYI: + case OP_NOTPOSQUERY: + case OP_NOTPOSQUERYI: if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]); break; } @@ -3069,22 +3097,28 @@ if (next >= 0) switch(op_code) #endif /* SUPPORT_UTF */ return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */ - /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These - opcodes are not used for multi-byte characters, because they are coded using - an XCLASS instead. */ - case OP_NOT: - return (c = *previous) == next; +#ifdef SUPPORT_UTF + GETCHARTEST(c, previous); +#else + c = *previous; +#endif + return c == next; case OP_NOTI: - if ((c = *previous) == next) return TRUE; +#ifdef SUPPORT_UTF + GETCHARTEST(c, previous); +#else + c = *previous; +#endif + if (c == next) return TRUE; #ifdef SUPPORT_UTF if (utf) { unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else #ifdef SUPPORT_UCP - othercase = UCD_OTHERCASE(next); + othercase = UCD_OTHERCASE((unsigned int)next); #else othercase = NOTACHAR; #endif @@ -3092,7 +3126,7 @@ if (next >= 0) switch(op_code) } else #endif /* SUPPORT_UTF */ - return (c == (int)(TABLE_GET((unsigned int)next, cd->fcc, next))); /* Non-UTF-8 mode */ + return (c == TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */ /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ @@ -4485,27 +4519,16 @@ for (;; ptr++) if (class_single_char < 2) class_single_char++; /* If class_charcount is 1, we saw precisely one character. As long as - there were no negated characters >= 128 and there was no use of \p or \P, - in other words, no use of any XCLASS features, we can optimize. - - In UTF-8 mode, we can optimize the negative case only if there were no - characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR - operate on single-bytes characters only. This is an historical hangover. - Maybe one day we can tidy these opcodes to handle multi-byte characters. + there was no use of \p or \P, in other words, no use of any XCLASS features, + we can optimize. The optimization throws away the bit map. We turn the item into a 1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative. - Note that OP_NOT[I] does not support multibyte characters. In the positive - case, it can cause firstchar to be set. Otherwise, there can be no first - char if this item is first, whatever repeat count may follow. In the case - of reqchar, save the previous value for reinstating. */ + In the positive case, it can cause firstchar to be set. Otherwise, there + can be no first char if this item is first, whatever repeat count may + follow. In the case of reqchar, save the previous value for reinstating. */ -#ifdef SUPPORT_UTF - if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET - && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1))) -#else if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) -#endif { ptr++; zeroreqchar = reqchar; @@ -4517,7 +4540,12 @@ for (;; ptr++) if (firstchar == REQ_UNSET) firstchar = REQ_NONE; zerofirstchar = firstchar; *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT; - *code++ = c; +#ifdef SUPPORT_UTF + if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) + code += PRIV(ord2utf)(c, code); + else +#endif + *code++ = c; goto NOT_CHAR; } @@ -4775,15 +4803,22 @@ for (;; ptr++) /* Now handle repetition for the different types of item. */ - /* If previous was a character match, abolish the item and generate a - repeat item instead. If a char item has a minumum of more than one, ensure - that it is set in reqchar - it might not be if a sequence such as x{3} is - the first thing in a branch because the x will have gone into firstchar - instead. */ + /* If previous was a character or negated character match, abolish the item + and generate a repeat item instead. If a char item has a minumum of more + than one, ensure that it is set in reqchar - it might not be if a sequence + such as x{3} is the first thing in a branch because the x will have gone + into firstchar instead. */ - if (*previous == OP_CHAR || *previous == OP_CHARI) + if (*previous == OP_CHAR || *previous == OP_CHARI + || *previous == OP_NOT || *previous == OP_NOTI) { - op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR; + switch (*previous) { + default: /* Make compiler happy. */ + case OP_CHAR: op_type = OP_STAR - OP_STAR; break; + case OP_CHARI: op_type = OP_STARI - OP_STAR; break; + case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break; + case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break; + } /* Deal with UTF characters that take up more than one character. It's easier to write this out separately than try to macrify it. Use c to @@ -4806,7 +4841,8 @@ for (;; ptr++) with UTF disabled, or for a single character UTF character. */ { c = code[-1]; - if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt; + if (*previous <= OP_CHARI && repeat_min > 1) + reqchar = c | req_caseopt | cd->req_varyopt; } /* If the repetition is unlimited, it pays to see if the next thing on @@ -4825,26 +4861,6 @@ for (;; ptr++) goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ } - /* If previous was a single negated character ([^a] or similar), we use - one of the special opcodes, replacing it. The code is shared with single- - character repeats by setting opt_type to add a suitable offset into - repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI - are currently used only for single-byte chars. */ - - else if (*previous == OP_NOT || *previous == OP_NOTI) - { - op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR; - c = previous[1]; - if (!possessive_quantifier && - repeat_max < 0 && - check_auto_possessive(previous, utf, ptr + 1, options, cd)) - { - repeat_type = 0; /* Force greedy */ - possessive_quantifier = TRUE; - } - goto OUTPUT_SINGLE_REPEAT; - } - /* If previous was a character type match (\d or similar), abolish it and create a suitable repeat item. The code is shared with single-character repeats by setting op_type to add a suitable offset into repeat_type. Note |