OP_NOT now supports any UTF character

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@924 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-02-22 10:23:56 +0000
committer: zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-02-22 10:23:56 +0000
commit: 5f83aeee3273dc59fa8878eb3e830e64213fcbd2 (patch)
tree: 324e6f3bfd1df70b681706ddf23bda00d3338111 /pcre_compile.c
parent: e65ea5ac9477071cc408ce00a2d3e80f791c8570 (diff)
download: pcre-5f83aeee3273dc59fa8878eb3e830e64213fcbd2.tar.gz
1 files changed, 69 insertions, 53 deletions
diff --git a/pcre_compile.c b/pcre_compile.c
index 8070f51..2f76cea 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -2225,32 +2225,60 @@ for (;;)
       {
       case OP_CHAR:
       case OP_CHARI:
+      case OP_NOT:
+      case OP_NOTI:
       case OP_EXACT:
       case OP_EXACTI:
+      case OP_NOTEXACT:
+      case OP_NOTEXACTI:
       case OP_UPTO:
       case OP_UPTOI:
+      case OP_NOTUPTO:
+      case OP_NOTUPTOI:
       case OP_MINUPTO:
       case OP_MINUPTOI:
+      case OP_NOTMINUPTO:
+      case OP_NOTMINUPTOI:
       case OP_POSUPTO:
       case OP_POSUPTOI:
+      case OP_NOTPOSUPTO:
+      case OP_NOTPOSUPTOI:
       case OP_STAR:
       case OP_STARI:
+      case OP_NOTSTAR:
+      case OP_NOTSTARI:
       case OP_MINSTAR:
       case OP_MINSTARI:
+      case OP_NOTMINSTAR:
+      case OP_NOTMINSTARI:
       case OP_POSSTAR:
       case OP_POSSTARI:
+      case OP_NOTPOSSTAR:
+      case OP_NOTPOSSTARI:
       case OP_PLUS:
       case OP_PLUSI:
+      case OP_NOTPLUS:
+      case OP_NOTPLUSI:
       case OP_MINPLUS:
       case OP_MINPLUSI:
+      case OP_NOTMINPLUS:
+      case OP_NOTMINPLUSI:
       case OP_POSPLUS:
       case OP_POSPLUSI:
+      case OP_NOTPOSPLUS:
+      case OP_NOTPOSPLUSI:
       case OP_QUERY:
       case OP_QUERYI:
+      case OP_NOTQUERY:
+      case OP_NOTQUERYI:
       case OP_MINQUERY:
       case OP_MINQUERYI:
+      case OP_NOTMINQUERY:
+      case OP_NOTMINQUERYI:
       case OP_POSQUERY:
       case OP_POSQUERYI:
+      case OP_NOTPOSQUERY:
+      case OP_NOTPOSQUERYI:
       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
       break;
       }
@@ -3069,22 +3097,28 @@ if (next >= 0) switch(op_code)
 #endif  /* SUPPORT_UTF */
   return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
 
-  /* For OP_NOT and OP_NOTI, the data is always a single-byte character. These
-  opcodes are not used for multi-byte characters, because they are coded using
-  an XCLASS instead. */
-
   case OP_NOT:
-  return (c = *previous) == next;
+#ifdef SUPPORT_UTF
+  GETCHARTEST(c, previous);
+#else
+  c = *previous;
+#endif
+  return c == next;
 
   case OP_NOTI:
-  if ((c = *previous) == next) return TRUE;
+#ifdef SUPPORT_UTF
+  GETCHARTEST(c, previous);
+#else
+  c = *previous;
+#endif
+  if (c == next) return TRUE;
 #ifdef SUPPORT_UTF
   if (utf)
     {
     unsigned int othercase;
     if (next < 128) othercase = cd->fcc[next]; else
 #ifdef SUPPORT_UCP
-    othercase = UCD_OTHERCASE(next);
+    othercase = UCD_OTHERCASE((unsigned int)next);
 #else
     othercase = NOTACHAR;
 #endif
@@ -3092,7 +3126,7 @@ if (next >= 0) switch(op_code)
     }
   else
 #endif  /* SUPPORT_UTF */
-  return (c == (int)(TABLE_GET((unsigned int)next, cd->fcc, next)));  /* Non-UTF-8 mode */
+  return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
 
   /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
   When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
@@ -4485,27 +4519,16 @@ for (;; ptr++)
       if (class_single_char < 2) class_single_char++;
 
       /* If class_charcount is 1, we saw precisely one character. As long as
-      there were no negated characters >= 128 and there was no use of \p or \P,
-      in other words, no use of any XCLASS features, we can optimize.
-
-      In UTF-8 mode, we can optimize the negative case only if there were no
-      characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
-      operate on single-bytes characters only. This is an historical hangover.
-      Maybe one day we can tidy these opcodes to handle multi-byte characters.
+      there was no use of \p or \P, in other words, no use of any XCLASS features,
+      we can optimize.
 
       The optimization throws away the bit map. We turn the item into a
       1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
-      Note that OP_NOT[I] does not support multibyte characters. In the positive
-      case, it can cause firstchar to be set. Otherwise, there can be no first
-      char if this item is first, whatever repeat count may follow. In the case
-      of reqchar, save the previous value for reinstating. */
+      In the positive case, it can cause firstchar to be set. Otherwise, there
+      can be no first char if this item is first, whatever repeat count may
+      follow. In the case of reqchar, save the previous value for reinstating. */
 
-#ifdef SUPPORT_UTF
-      if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET
-        && (!utf || !negate_class || c < (MAX_VALUE_FOR_SINGLE_CHAR + 1)))
-#else
       if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
-#endif
         {
         ptr++;
         zeroreqchar = reqchar;
@@ -4517,7 +4540,12 @@ for (;; ptr++)
           if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
           zerofirstchar = firstchar;
           *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
-          *code++ = c;
+#ifdef SUPPORT_UTF
+          if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
+            code += PRIV(ord2utf)(c, code);
+          else
+#endif
+            *code++ = c;
           goto NOT_CHAR;
           }
 
@@ -4775,15 +4803,22 @@ for (;; ptr++)
 
     /* Now handle repetition for the different types of item. */
 
-    /* If previous was a character match, abolish the item and generate a
-    repeat item instead. If a char item has a minumum of more than one, ensure
-    that it is set in reqchar - it might not be if a sequence such as x{3} is
-    the first thing in a branch because the x will have gone into firstchar
-    instead.  */
+    /* If previous was a character or negated character match, abolish the item
+    and generate a repeat item instead. If a char item has a minumum of more
+    than one, ensure  that it is set in reqchar - it might not be if a sequence
+    such as x{3} is  the first thing in a branch because the x will have gone
+    into firstchar instead.  */
 
-    if (*previous == OP_CHAR || *previous == OP_CHARI)
+    if (*previous == OP_CHAR || *previous == OP_CHARI
+        || *previous == OP_NOT || *previous == OP_NOTI)
       {
-      op_type = (*previous == OP_CHAR)? 0 : OP_STARI - OP_STAR;
+      switch (*previous) {
+      default: /* Make compiler happy. */
+      case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
+      case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
+      case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
+      case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
+      }
 
       /* Deal with UTF characters that take up more than one character. It's
       easier to write this out separately than try to macrify it. Use c to
@@ -4806,7 +4841,8 @@ for (;; ptr++)
       with UTF disabled, or for a single character UTF character. */
         {
         c = code[-1];
-        if (repeat_min > 1) reqchar = c | req_caseopt | cd->req_varyopt;
+        if (*previous <= OP_CHARI && repeat_min > 1)
+          reqchar = c | req_caseopt | cd->req_varyopt;
         }
 
       /* If the repetition is unlimited, it pays to see if the next thing on
@@ -4825,26 +4861,6 @@ for (;; ptr++)
       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
       }
 
-    /* If previous was a single negated character ([^a] or similar), we use
-    one of the special opcodes, replacing it. The code is shared with single-
-    character repeats by setting opt_type to add a suitable offset into
-    repeat_type. We can also test for auto-possessification. OP_NOT and OP_NOTI
-    are currently used only for single-byte chars. */
-
-    else if (*previous == OP_NOT || *previous == OP_NOTI)
-      {
-      op_type = ((*previous == OP_NOT)? OP_NOTSTAR : OP_NOTSTARI) - OP_STAR;
-      c = previous[1];
-      if (!possessive_quantifier &&
-          repeat_max < 0 &&
-          check_auto_possessive(previous, utf, ptr + 1, options, cd))
-        {
-        repeat_type = 0;    /* Force greedy */
-        possessive_quantifier = TRUE;
-        }
-      goto OUTPUT_SINGLE_REPEAT;
-      }
-
     /* If previous was a character type match (\d or similar), abolish it and
     create a suitable repeat item. The code is shared with single-character
     repeats by setting op_type to add a suitable offset into repeat_type. Note
author	zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2012-02-22 10:23:56 +0000
committer	zherczeg <zherczeg@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2012-02-22 10:23:56 +0000
commit	5f83aeee3273dc59fa8878eb3e830e64213fcbd2 (patch)
tree	324e6f3bfd1df70b681706ddf23bda00d3338111 /pcre_compile.c
parent	e65ea5ac9477071cc408ce00a2d3e80f791c8570 (diff)
download	pcre-5f83aeee3273dc59fa8878eb3e830e64213fcbd2.tar.gz