summary refs log tree commit diff
path: root/pcre_compile.c
diff options
context:
space:
mode:
author zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-02-22 10:23:56 +0000
committer zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-02-22 10:23:56 +0000
commit 5f83aeee3273dc59fa8878eb3e830e64213fcbd2 (patch)
tree 324e6f3bfd1df70b681706ddf23bda00d3338111 /pcre_compile.c
parent e65ea5ac9477071cc408ce00a2d3e80f791c8570 (diff)
download pcre-5f83aeee3273dc59fa8878eb3e830e64213fcbd2.tar.gz
OP_NOT now supports any UTF character
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@924 2f5784b3-3f2a-0410-8824-cb99058d5e15
Diffstat (limited to 'pcre_compile.c')
-rw-r--r-- pcre_compile.c 122
1 files changed, 69 insertions, 53 deletions
diff --git a/pcre_compile.c b/pcre_compile.c
index 8070f51..2f76cea 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -2225,32 +2225,60 @@ for (;;)
{
case OP_CHAR:
case OP_CHARI:
+ case OP_NOT:
+ case OP_NOTI:
case OP_EXACT:
case OP_EXACTI:
+ case OP_NOTEXACT:
+ case OP_NOTEXACTI:
case OP_UPTO:
case OP_UPTOI:
+ case OP_NOTUPTO:
+ case OP_NOTUPTOI:
case OP_MINUPTO:
case OP_MINUPTOI:
+ case OP_NOTMINUPTO:
+ case OP_NOTMINUPTOI:
case OP_POSUPTO:
case OP_POSUPTOI:
+ case OP_NOTPOSUPTO:
+ case OP_NOTPOSUPTOI:
case OP_STAR:
case OP_STARI:
+ case OP_NOTSTAR:
+ case OP_NOTSTARI:
case OP_MINSTAR:
case OP_MINSTARI:
+ case OP_NOTMINSTAR:
+ case OP_NOTMINSTARI:
case OP_POSSTAR:
case OP_POSSTARI:
+ case OP_NOTPOSSTAR:
+ case OP_NOTPOSSTARI:
case OP_PLUS:
case OP_PLUSI:
+ case OP_NOTPLUS:
+ case OP_NOTPLUSI:
case OP_MINPLUS:
case OP_MINPLUSI:
+ case OP_NOTMINPLUS:
+ case OP_NOTMINPLUSI:
case OP_POSPLUS:
case OP_POSPLUSI:
+ case OP_NOTPOSPLUS:
+ case OP_NOTPOSPLUSI:
case OP_QUERY:
case OP_QUERYI:
+ case OP_NOTQUERY:
+ case OP_NOTQUERYI:
case OP_MINQUERY:
case OP_MINQUERYI:
+ case OP_NOTMINQUERY:
+ case OP_NOTMINQUERYI:
case OP_POSQUERY:
case OP_POSQUERYI:
+ case OP_NOTPOSQUERY:
+ case OP_NOTPOSQUERYI:
if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
break;
}
@@ -3069,22 +3097,28 @@ if (next >= 0) switch(op_code)
#endif /* SUPPORT_UTF */
return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */
- /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
- opcodes are not used for multi-byte characters, because they are coded using
- an XCLASS instead. */
-
case OP_NOT:
- return (c = *previous) == next;
+#ifdef SUPPORT_UTF
+ GETCHARTEST(c, previous);
+#else
+ c = *previous;
+#endif
+ return c == next;
case OP_NOTI:
- if ((c = *previous) == next) return TRUE;
+#ifdef SUPPORT_UTF
+ GETCHARTEST(c, previous);
+#else
+ c = *previous;
+#endif
+ if (c == next) return TRUE;
#ifdef SUPPORT_UTF
if (utf)
{
unsigned int othercase;
if (next < 128) othercase = cd->fcc[next]; else
#ifdef SUPPORT_UCP
- othercase = UCD_OTHERCASE(next);
+ othercase = UCD_OTHERCASE((unsigned int)next);
#else
othercase = NOTACHAR;
#endif
@@ -3092,7 +3126,7 @@ if (next >= 0) switch(op_code)
}
else
#endif /* SUPPORT_UTF */
- return (c == (int)(TABLE_GET((unsigned int)next, cd->fcc, next))); /* Non-UTF-8 mode */
+ return (c == TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */
/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
@@ -4485,27 +4519,16 @@ for (;; ptr++)
if (class_single_char < 2) class_single_char++;
/* If class_charcount is 1, we saw precisely one character. As long as
- there were no negated characters >= 128 and there was no use of \p or \P,
- in other words, no use of any XCLASS features, we can optimize.
-
- In UTF-8 mode, we can optimize the negative case only if there were no
- characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
- operate on single-bytes characters only. This is an historical hangover.
- Maybe one day we can tidy these opcodes to handle multi-byte characters.
+ there was no use of \p or \P, in other words, no use of any XCLASS features,
+ we can optimize.
The optimization throws away the bit map. We turn the item into a
1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
- Note that OP_NOT[I] does not support multibyte characters. In the positive
- case, it can cause firstchar to be set. Otherwise, there can be no first
- char if this item is first, whatever repeat count may follow. In the case
- of reqchar, save the previous value for reinstating. */
+ In the positive case, it can cause firstchar to be set. Otherwise, there
+ can be no first char if this item is first, whatever repeat count may
+ follow. In the case of reqchar, save the previous value for reinstating. */
-#ifdef SUPPORT_UTF
- if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
- && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
-#else
if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
-#endif
{
ptr++;
zeroreqchar = reqchar;
@@ -4517,7 +4540,12 @@ for (;; ptr++)
if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
zerofirstchar = firstchar;
*code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
- *code++ = c;
+#ifdef SUPPORT_UTF
+ if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
+ code += PRIV(ord2utf)(c, code);
+ else
+#endif
+ *code++ = c;
goto NOT_CHAR;
}
@@ -4775,15 +4803,22 @@ for (;; ptr++)
/* Now handle repetition for the different types of item. */
- /* If previous was a character match, abolish the item and generate a
- repeat item instead. If a char item has a minumum of more than one, ensure
- that it is set in reqchar - it might not be if a sequence such as x{3} is
- the first thing in a branch because the x will have gone into firstchar
- instead. */
+ /* If previous was a character or negated character match, abolish the item
+ and generate a repeat item instead. If a char item has a minimum of more
+ than one, ensure that it is set in reqchar - it might not be if a sequence
+ such as x{3} is the first thing in a branch because the x will have gone
+ into firstchar instead. */
- if (*previous == OP_CHAR || *previous == OP_CHARI)
+ if (*previous == OP_CHAR || *previous == OP_CHARI
+ || *previous == OP_NOT || *previous == OP_NOTI)
{
- op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
+ switch (*previous) {
+ default: /* Make compiler happy. */
+ case OP_CHAR: op_type = OP_STAR - OP_STAR; break;
+ case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
+ case OP_NOT: op_type = OP_NOTSTAR - OP_STAR; break;
+ case OP_NOTI: op_type = OP_NOTSTARI - OP_STAR; break;
+ }
/* Deal with UTF characters that take up more than one character. It's
easier to write this out separately than try to macrify it. Use c to
@@ -4806,7 +4841,8 @@ for (;; ptr++)
with UTF disabled, or for a single character UTF character. */
{
c = code[-1];
- if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
+ if (*previous <= OP_CHARI && repeat_min > 1)
+ reqchar = c | req_caseopt | cd->req_varyopt;
}
/* If the repetition is unlimited, it pays to see if the next thing on
@@ -4825,26 +4861,6 @@ for (;; ptr++)
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
}
- /* If previous was a single negated character ([^a] or similar), we use
- one of the special opcodes, replacing it. The code is shared with single-
- character repeats by setting opt_type to add a suitable offset into
- repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
- are currently used only for single-byte chars. */
-
- else if (*previous == OP_NOT || *previous == OP_NOTI)
- {
- op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
- c = previous[1];
- if (!possessive_quantifier &&
- repeat_max < 0 &&
- check_auto_possessive(previous, utf, ptr + 1, options, cd))
- {
- repeat_type = 0; /* Force greedy */
- possessive_quantifier = TRUE;
- }
- goto OUTPUT_SINGLE_REPEAT;
- }
-
/* If previous was a character type match (\d or similar), abolish it and
create a suitable repeat item. The code is shared with single-character
repeats by setting op_type to add a suitable offset into repeat_type. Note