Fix DFA (?!) bug; add support for JavaScript empty classes.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@341 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-04-19 16:41:04 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2008-04-19 16:41:04 +0000
commit: c66bac83039352a3a193c9db0b156fe8faa71296 (patch)
tree: a05cc60b6b0413efaebd9591638eaf2430f28ff2
parent: 3bbe280000d68a6360bae77fdf03737822bf020d (diff)
download: pcre-c66bac83039352a3a193c9db0b156fe8faa71296.tar.gz
13 files changed, 418 insertions, 152 deletions
diff --git a/ChangeLog b/ChangeLog
index 5b55099..64d7d23 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -58,11 +58,21 @@ Version 7.7 05-Mar-08
     pattern, with a new opcode that causes them to be skipped at execution 
     time.
     
-13. Added the PCRE_JAVASCRIPT_COMPAT option. This currently does two things:
+13. Added the PCRE_JAVASCRIPT_COMPAT option. This makes the following changes
+    to the way PCRE behaves:
+     
     (a) A lone ] character is dis-allowed (Perl treats it as data).
+     
     (b) A back reference to an unmatched subpattern matches an empty string 
         (Perl fails the current match path).
         
+    (c) A data ] in a character class must be notated as \] because if the
+        first data character in a class is ], it defines an empty class. (In 
+        Perl it is not possible to have an empty class.) The empty class []
+        never matches; it forces failure and is equivalent to (*FAIL) or (?!).
+        The negative empty class [^] matches any one character, independently
+        of the DOTALL setting.
+        
 14. A pattern such as /(?2)[]a()b](abc)/ which had a forward reference to a 
     non-existent subpattern following a character class starting with ']' and
     containing () gave an internal compiling error instead of "reference to
@@ -71,6 +81,11 @@ Version 7.7 05-Mar-08
     existencd of the subpattern, it was treating the data ']' as terminating
     the class, so got the count wrong. When actually compiling, the reference 
     was subsequently set up correctly.)
+    
+15. The "always fail" assertion (?!) is optimzed to (*FAIL) by pcre_compile;
+    it was being rejected as not supported by pcre_dfa_exec(), even though 
+    other assertions are supported. I have made pcre_dfa_exec() support 
+    (*FAIL). 
 
 
 Version 7.6 28-Jan-08
diff --git a/doc/pcrematching.3 b/doc/pcrematching.3
index a91e9a0..560a48c 100644
--- a/doc/pcrematching.3
+++ b/doc/pcrematching.3
@@ -131,7 +131,8 @@ byte, even in UTF-8 mode, is not supported because the alternative algorithm
 moves through the subject string one character at a time, for all active paths
 through the tree.
 .P
-8. None of the backtracking control verbs such as (*PRUNE) are supported.
+8. Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not
+supported. (*FAIL) is supported, and behaves like a failing negative assertion.
 .
 .SH "ADVANTAGES OF THE ALTERNATIVE ALGORITHM"
 .rs
@@ -182,6 +183,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 08 August 2007
-Copyright (c) 1997-2007 University of Cambridge.
+Last updated: 19 April 2008
+Copyright (c) 1997-2008 University of Cambridge.
 .fi
diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3
index 2727b86..1160a4a 100644
--- a/doc/pcrepattern.3
+++ b/doc/pcrepattern.3
@@ -2115,9 +2115,10 @@ or removal in a future version of Perl". It goes on to say: "Their usage in
 production code should be noted to avoid problems during upgrades." The same
 remarks apply to the PCRE features described in this section.
 .P
-Since these verbs are specifically related to backtracking, they can be used
-only when the pattern is to be matched using \fBpcre_exec()\fP, which uses a
-backtracking algorithm. They cause an error if encountered by
+Since these verbs are specifically related to backtracking, most of them can be
+used only when the pattern is to be matched using \fBpcre_exec()\fP, which uses
+a backtracking algorithm. With the exception of (*FAIL), which behaves like a 
+failing negative assertion, they cause an error if encountered by
 \fBpcre_dfa_exec()\fP.
 .P
 The new verbs make use of what was previously invalid syntax: an opening
@@ -2239,6 +2240,6 @@ Cambridge CB2 3QH, England.
 .rs
 .sp
 .nf
-Last updated: 12 April 2008
+Last updated: 19 April 2008
 Copyright (c) 1997-2008 University of Cambridge.
 .fi
diff --git a/pcre_compile.c b/pcre_compile.c
index 81a76b2..e4dd87b 100644
--- a/pcre_compile.c
+++ b/pcre_compile.c
@@ -976,7 +976,7 @@ be terminated by '>' because that is checked in the first pass.
 
 Arguments:
   ptr          current position in the pattern
-  count        current count of capturing parens so far encountered
+  cd           compile background data 
   name         name to seek, or NULL if seeking a numbered subpattern
   lorn         name length, or subpattern number if name is NULL
   xmode        TRUE if we are in /x mode
@@ -985,10 +985,11 @@ Returns:       the number of the named subpattern, or -1 if not found
 */
 
 static int
-find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
+find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
   BOOL xmode)
 {
 const uschar *thisname;
+int count = cd->bracount;
 
 for (; *ptr != 0; ptr++)
   {
@@ -1031,9 +1032,10 @@ for (; *ptr != 0; ptr++)
       }
 
     /* If the next character is ']', it is a data character that must be
-    skipped. */
+    skipped, except in JavaScript compatibility mode. */
     
-    if (ptr[1] == ']') ptr++;  
+    if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) 
+      ptr++;  
  
     while (*(++ptr) != ']')
       {
@@ -2721,6 +2723,19 @@ for (;; ptr++)
         negate_class = TRUE;
       else break;
       }
+      
+    /* Empty classes are allowed in JavaScript compatibility mode. Otherwise, 
+    an initial ']' is taken as a data character -- the code below handles 
+    that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
+    [^] must match any character, so generate OP_ALLANY. */
+    
+    if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
+      {
+      *code++ = negate_class? OP_ALLANY : OP_FAIL;
+      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
+      zerofirstbyte = firstbyte;
+      break;
+      }    
 
     /* If a class contains a negative special such as \S, we need to flip the
     negation flag at the end, so that support for characters > 255 works
@@ -4102,6 +4117,13 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
           }
         }
       }
+      
+    /* If previous is OP_FAIL, it was generated by an empty class [] in
+    JavaScript mode. The other ways in which OP_FAIL can be generated, that is 
+    by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat" 
+    error above. We can just ignore the repeat in JS case. */
+    
+    else if (*previous == OP_FAIL) goto END_REPEAT;  
 
     /* Else there's some kind of shambles */
 
@@ -4389,7 +4411,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
         /* Search the pattern for a forward reference */
 
-        else if ((i = find_parens(ptr, cd->bracount, name, namelen,
+        else if ((i = find_parens(ptr, cd, name, namelen,
                         (options & PCRE_EXTENDED) != 0)) > 0)
           {
           PUT2(code, 2+LINK_SIZE, i);
@@ -4686,7 +4708,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
             recno = GET2(slot, 0);
             }
           else if ((recno =                /* Forward back reference */
-                    find_parens(ptr, cd->bracount, name, namelen,
+                    find_parens(ptr, cd, name, namelen,
                       (options & PCRE_EXTENDED) != 0)) <= 0)
             {
             *errorcodeptr = ERR15;
@@ -4796,8 +4818,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
 
             if (called == NULL)
               {
-              if (find_parens(ptr, cd->bracount, NULL, recno,
-                   (options & PCRE_EXTENDED) != 0) < 0)
+              if (find_parens(ptr, cd, NULL, recno,
+                    (options & PCRE_EXTENDED) != 0) < 0)
                 {
                 *errorcodeptr = ERR15;
                 goto FAILED;
diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c
index 427c46f..0c8f219 100644
--- a/pcre_dfa_exec.c
+++ b/pcre_dfa_exec.c
@@ -88,7 +88,7 @@ static const uschar coptable[] = {
   0,                             /* End                                    */
   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
-  0, 0,                          /* Any, Anybyte                           */
+  0, 0, 0,                       /* Any, AllAny, Anybyte                   */
   0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
@@ -132,7 +132,7 @@ static const uschar coptable[] = {
   0,                             /* DEF                                    */
   0, 0,                          /* BRAZERO, BRAMINZERO                    */
   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
-  0, 0                           /* FAIL, ACCEPT                           */
+  0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
 };
 
 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
@@ -143,7 +143,7 @@ static const uschar toptable1[] = {
   ctype_digit, ctype_digit,
   ctype_space, ctype_space,
   ctype_word,  ctype_word,
-  0                               /* OP_ANY */
+  0, 0                            /* OP_ANY, OP_ALLANY */
 };
 
 static const uschar toptable2[] = {
@@ -151,7 +151,7 @@ static const uschar toptable2[] = {
   ctype_digit, 0,
   ctype_space, 0,
   ctype_word,  0,
-  1                               /* OP_ANY */
+  1, 1                            /* OP_ANY, OP_ALLANY */
 };
 
 
@@ -223,8 +223,8 @@ Arguments:
   rlevel            function call recursion level
   recursing         regex recursive call level
 
-Returns:            > 0 =>
-                    = 0 =>
+Returns:            > 0 => number of match offset pairs placed in offsets 
+                    = 0 => offsets overflowed; longest matches are present
                      -1 => failed to match
                    < -1 => some kind of unexpected problem
 
@@ -744,6 +744,12 @@ for (;;)
       break;
 
       /*-----------------------------------------------------------------*/
+      case OP_ALLANY:
+      if (clen > 0)
+        { ADD_NEW(state_offset + 1, 0); }
+      break;
+
+      /*-----------------------------------------------------------------*/
       case OP_EODN:
       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
         { ADD_ACTIVE(state_offset + 1, 0); }
@@ -859,8 +865,8 @@ for (;;)
 /* ========================================================================== */
       /* These opcodes likewise inspect the subject character, but have an
       argument that is not a data character. It is one of these opcodes:
-      OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
-      OP_NOT_WORDCHAR. The value is loaded into d. */
+      OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
+      OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
 
       case OP_TYPEPLUS:
       case OP_TYPEMINPLUS:
@@ -2169,7 +2175,12 @@ for (;;)
 
 /* ========================================================================== */
       /* These are the opcodes for fancy brackets of various kinds. We have
-      to use recursion in order to handle them. */
+      to use recursion in order to handle them. The "always failing" assersion 
+      (?!) is optimised when compiling to OP_FAIL, so we have to support that,
+      though the other "backtracking verbs" are not supported. */
+      
+      case OP_FAIL:
+      break;  
 
       case OP_ASSERT:
       case OP_ASSERT_NOT:
diff --git a/pcre_exec.c b/pcre_exec.c
index dceb244..526658a 100644
--- a/pcre_exec.c
+++ b/pcre_exec.c
@@ -1433,6 +1433,9 @@ for (;;)
       {
       if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
       }
+    /* Fall through */   
+     
+    case OP_ALLANY:
     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
     if (utf8)
       while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
@@ -2960,6 +2963,15 @@ for (;;)
           }
         break;
 
+        case OP_ALLANY:
+        for (i = 1; i <= min; i++)
+          {
+          if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+          eptr++;
+          while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+          }
+        break;
+
         case OP_ANYBYTE:
         eptr += min;
         break;
@@ -3179,6 +3191,10 @@ for (;;)
         else eptr += min;
         break;
 
+        case OP_ALLANY:
+        eptr += min;
+        break;
+
         case OP_ANYBYTE:
         eptr += min;
         break;
@@ -3441,8 +3457,7 @@ for (;;)
           switch(ctype)
             {
             case OP_ANY:        /* This is the DOTALL case */
-            break;
-
+            case OP_ALLANY: 
             case OP_ANYBYTE:
             break;
 
@@ -3600,9 +3615,8 @@ for (;;)
           c = *eptr++;
           switch(ctype)
             {
-            case OP_ANY:   /* This is the DOTALL case */
-            break;
-
+            case OP_ANY:     /* This is the DOTALL case */
+            case OP_ALLANY: 
             case OP_ANYBYTE:
             break;
 
@@ -3896,6 +3910,19 @@ for (;;)
             }
           break;
 
+          case OP_ALLANY:
+          if (max < INT_MAX)
+            {
+            for (i = min; i < max; i++)
+              {
+              if (eptr >= md->end_subject) break;
+              eptr++;
+              while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
+              }
+            }
+          else eptr = md->end_subject;   /* Unlimited UTF-8 repeat */
+          break;
+
           /* The byte case is the same as non-UTF8 */
 
           case OP_ANYBYTE:
@@ -4090,8 +4117,9 @@ for (;;)
               }
             break;
             }
-          /* For DOTALL case, fall through and treat as \C */
+          /* For DOTALL case, fall through */
 
+          case OP_ALLANY:
           case OP_ANYBYTE:
           c = max - min;
           if (c > (unsigned int)(md->end_subject - eptr))
diff --git a/pcre_internal.h b/pcre_internal.h
index 54d9c01..16c723e 100644
--- a/pcre_internal.h
+++ b/pcre_internal.h
@@ -605,16 +605,20 @@ contain UTF-8 characters with values greater than 255. */
 value such as \n. They must have non-zero values, as check_escape() returns
 their negation. Also, they must appear in the same order as in the opcode
 definitions below, up to ESC_z. There's a dummy for OP_ANY because it
-corresponds to "." rather than an escape sequence. The final one must be
-ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc).
-There are two tests in the code for an escape greater than ESC_b and less than
-ESC_Z to detect the types that may be repeated. These are the types that
-consume characters. If any new escapes are put in between that don't consume a
-character, that code will have to change. */
+corresponds to "." rather than an escape sequence, and another for OP_ALLANY 
+(which is used for [^] in JavaScript compatibility mode).
+
+The final escape must be ESC_REF as subsequent values are used for
+backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
+greater than ESC_b and less than ESC_Z to detect the types that may be
+repeated. These are the types that consume characters. If any new escapes are
+put in between that don't consume a character, that code will have to change.
+*/
 
 enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
-       ESC_W, ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, ESC_h,
-       ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k, ESC_REF };
+       ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, 
+       ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k, 
+       ESC_REF };
 
 
 /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
@@ -640,145 +644,146 @@ enum {
   OP_WHITESPACE,         /*  9 \s */
   OP_NOT_WORDCHAR,       /* 10 \W */
   OP_WORDCHAR,           /* 11 \w */
-  OP_ANY,            /* 12 Match any character */
-  OP_ANYBYTE,        /* 13 Match any byte (\C); different to OP_ANY for UTF-8 */
-  OP_NOTPROP,        /* 14 \P (not Unicode property) */
-  OP_PROP,           /* 15 \p (Unicode property) */
-  OP_ANYNL,          /* 16 \R (any newline sequence) */
-  OP_NOT_HSPACE,     /* 17 \H (not horizontal whitespace) */
-  OP_HSPACE,         /* 18 \h (horizontal whitespace) */
-  OP_NOT_VSPACE,     /* 19 \V (not vertical whitespace) */
-  OP_VSPACE,         /* 20 \v (vertical whitespace) */
-  OP_EXTUNI,         /* 21 \X (extended Unicode sequence */
-  OP_EODN,           /* 22 End of data or \n at end of data: \Z. */
-  OP_EOD,            /* 23 End of data: \z */
-
-  OP_OPT,            /* 24 Set runtime options */
-  OP_CIRC,           /* 25 Start of line - varies with multiline switch */
-  OP_DOLL,           /* 26 End of line - varies with multiline switch */
-  OP_CHAR,           /* 27 Match one character, casefully */
-  OP_CHARNC,         /* 28 Match one character, caselessly */
-  OP_NOT,            /* 29 Match one character, not the following one */
-
-  OP_STAR,           /* 30 The maximizing and minimizing versions of */
-  OP_MINSTAR,        /* 31 these six opcodes must come in pairs, with */
-  OP_PLUS,           /* 32 the minimizing one second. */
-  OP_MINPLUS,        /* 33 This first set applies to single characters.*/
-  OP_QUERY,          /* 34 */
-  OP_MINQUERY,       /* 35 */
-
-  OP_UPTO,           /* 36 From 0 to n matches */
-  OP_MINUPTO,        /* 37 */
-  OP_EXACT,          /* 38 Exactly n matches */
-
-  OP_POSSTAR,        /* 39 Possessified star */
-  OP_POSPLUS,        /* 40 Possessified plus */
-  OP_POSQUERY,       /* 41 Posesssified query */
-  OP_POSUPTO,        /* 42 Possessified upto */
-
-  OP_NOTSTAR,        /* 43 The maximizing and minimizing versions of */
-  OP_NOTMINSTAR,     /* 44 these six opcodes must come in pairs, with */
-  OP_NOTPLUS,        /* 45 the minimizing one second. They must be in */
-  OP_NOTMINPLUS,     /* 46 exactly the same order as those above. */
-  OP_NOTQUERY,       /* 47 This set applies to "not" single characters. */
-  OP_NOTMINQUERY,    /* 48 */
-
-  OP_NOTUPTO,        /* 49 From 0 to n matches */
-  OP_NOTMINUPTO,     /* 50 */
-  OP_NOTEXACT,       /* 51 Exactly n matches */
-
-  OP_NOTPOSSTAR,     /* 52 Possessified versions */
-  OP_NOTPOSPLUS,     /* 53 */
-  OP_NOTPOSQUERY,    /* 54 */
-  OP_NOTPOSUPTO,     /* 55 */
-
-  OP_TYPESTAR,       /* 56 The maximizing and minimizing versions of */
-  OP_TYPEMINSTAR,    /* 57 these six opcodes must come in pairs, with */
-  OP_TYPEPLUS,       /* 58 the minimizing one second. These codes must */
-  OP_TYPEMINPLUS,    /* 59 be in exactly the same order as those above. */
-  OP_TYPEQUERY,      /* 60 This set applies to character types such as \d */
-  OP_TYPEMINQUERY,   /* 61 */
-
-  OP_TYPEUPTO,       /* 62 From 0 to n matches */
-  OP_TYPEMINUPTO,    /* 63 */
-  OP_TYPEEXACT,      /* 64 Exactly n matches */
-
-  OP_TYPEPOSSTAR,    /* 65 Possessified versions */
-  OP_TYPEPOSPLUS,    /* 66 */
-  OP_TYPEPOSQUERY,   /* 67 */
-  OP_TYPEPOSUPTO,    /* 68 */
-
-  OP_CRSTAR,         /* 69 The maximizing and minimizing versions of */
-  OP_CRMINSTAR,      /* 70 all these opcodes must come in pairs, with */
-  OP_CRPLUS,         /* 71 the minimizing one second. These codes must */
-  OP_CRMINPLUS,      /* 72 be in exactly the same order as those above. */
-  OP_CRQUERY,        /* 73 These are for character classes and back refs */
-  OP_CRMINQUERY,     /* 74 */
-  OP_CRRANGE,        /* 75 These are different to the three sets above. */
-  OP_CRMINRANGE,     /* 76 */
-
-  OP_CLASS,          /* 77 Match a character class, chars < 256 only */
-  OP_NCLASS,         /* 78 Same, but the bitmap was created from a negative
+  OP_ANY,            /* 12 Match any character (subject to DOTALL) */
+  OP_ALLANY,         /* 13 Match any character (not subject to DOTALL) */
+  OP_ANYBYTE,        /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
+  OP_NOTPROP,        /* 15 \P (not Unicode property) */
+  OP_PROP,           /* 16 \p (Unicode property) */
+  OP_ANYNL,          /* 17 \R (any newline sequence) */
+  OP_NOT_HSPACE,     /* 18 \H (not horizontal whitespace) */
+  OP_HSPACE,         /* 19 \h (horizontal whitespace) */
+  OP_NOT_VSPACE,     /* 20 \V (not vertical whitespace) */
+  OP_VSPACE,         /* 21 \v (vertical whitespace) */
+  OP_EXTUNI,         /* 22 \X (extended Unicode sequence */
+  OP_EODN,           /* 23 End of data or \n at end of data: \Z. */
+  OP_EOD,            /* 24 End of data: \z */
+
+  OP_OPT,            /* 25 Set runtime options */
+  OP_CIRC,           /* 26 Start of line - varies with multiline switch */
+  OP_DOLL,           /* 27 End of line - varies with multiline switch */
+  OP_CHAR,           /* 28 Match one character, casefully */
+  OP_CHARNC,         /* 29 Match one character, caselessly */
+  OP_NOT,            /* 30 Match one character, not the following one */
+
+  OP_STAR,           /* 31 The maximizing and minimizing versions of */
+  OP_MINSTAR,        /* 32 these six opcodes must come in pairs, with */
+  OP_PLUS,           /* 33 the minimizing one second. */
+  OP_MINPLUS,        /* 34 This first set applies to single characters.*/
+  OP_QUERY,          /* 35 */
+  OP_MINQUERY,       /* 36 */
+
+  OP_UPTO,           /* 37 From 0 to n matches */
+  OP_MINUPTO,        /* 38 */
+  OP_EXACT,          /* 39 Exactly n matches */
+
+  OP_POSSTAR,        /* 40 Possessified star */
+  OP_POSPLUS,        /* 41 Possessified plus */
+  OP_POSQUERY,       /* 42 Posesssified query */
+  OP_POSUPTO,        /* 43 Possessified upto */
+
+  OP_NOTSTAR,        /* 44 The maximizing and minimizing versions of */
+  OP_NOTMINSTAR,     /* 45 these six opcodes must come in pairs, with */
+  OP_NOTPLUS,        /* 46 the minimizing one second. They must be in */
+  OP_NOTMINPLUS,     /* 47 exactly the same order as those above. */
+  OP_NOTQUERY,       /* 48 This set applies to "not" single characters. */
+  OP_NOTMINQUERY,    /* 49 */
+
+  OP_NOTUPTO,        /* 50 From 0 to n matches */
+  OP_NOTMINUPTO,     /* 51 */
+  OP_NOTEXACT,       /* 52 Exactly n matches */
+
+  OP_NOTPOSSTAR,     /* 53 Possessified versions */
+  OP_NOTPOSPLUS,     /* 54 */
+  OP_NOTPOSQUERY,    /* 55 */
+  OP_NOTPOSUPTO,     /* 56 */
+
+  OP_TYPESTAR,       /* 57 The maximizing and minimizing versions of */
+  OP_TYPEMINSTAR,    /* 58 these six opcodes must come in pairs, with */
+  OP_TYPEPLUS,       /* 59 the minimizing one second. These codes must */
+  OP_TYPEMINPLUS,    /* 60 be in exactly the same order as those above. */
+  OP_TYPEQUERY,      /* 61 This set applies to character types such as \d */
+  OP_TYPEMINQUERY,   /* 62 */
+
+  OP_TYPEUPTO,       /* 63 From 0 to n matches */
+  OP_TYPEMINUPTO,    /* 64 */
+  OP_TYPEEXACT,      /* 65 Exactly n matches */
+
+  OP_TYPEPOSSTAR,    /* 66 Possessified versions */
+  OP_TYPEPOSPLUS,    /* 67 */
+  OP_TYPEPOSQUERY,   /* 68 */
+  OP_TYPEPOSUPTO,    /* 69 */
+
+  OP_CRSTAR,         /* 70 The maximizing and minimizing versions of */
+  OP_CRMINSTAR,      /* 71 all these opcodes must come in pairs, with */
+  OP_CRPLUS,         /* 72 the minimizing one second. These codes must */
+  OP_CRMINPLUS,      /* 73 be in exactly the same order as those above. */
+  OP_CRQUERY,        /* 74 These are for character classes and back refs */
+  OP_CRMINQUERY,     /* 75 */
+  OP_CRRANGE,        /* 76 These are different to the three sets above. */
+  OP_CRMINRANGE,     /* 77 */
+
+  OP_CLASS,          /* 78 Match a character class, chars < 256 only */
+  OP_NCLASS,         /* 79 Same, but the bitmap was created from a negative
                            class - the difference is relevant only when a UTF-8
                            character > 255 is encountered. */
 
-  OP_XCLASS,         /* 79 Extended class for handling UTF-8 chars within the
+  OP_XCLASS,         /* 80 Extended class for handling UTF-8 chars within the
                            class. This does both positive and negative. */
 
-  OP_REF,            /* 80 Match a back reference */
-  OP_RECURSE,        /* 81 Match a numbered subpattern (possibly recursive) */
-  OP_CALLOUT,        /* 82 Call out to external function if provided */
+  OP_REF,            /* 81 Match a back reference */
+  OP_RECURSE,        /* 82 Match a numbered subpattern (possibly recursive) */
+  OP_CALLOUT,        /* 83 Call out to external function if provided */
 
-  OP_ALT,            /* 83 Start of alternation */
-  OP_KET,            /* 84 End of group that doesn't have an unbounded repeat */
-  OP_KETRMAX,        /* 85 These two must remain together and in this */
-  OP_KETRMIN,        /* 86 order. They are for groups the repeat for ever. */
+  OP_ALT,            /* 84 Start of alternation */
+  OP_KET,            /* 85 End of group that doesn't have an unbounded repeat */
+  OP_KETRMAX,        /* 86 These two must remain together and in this */
+  OP_KETRMIN,        /* 87 order. They are for groups the repeat for ever. */
 
   /* The assertions must come before BRA, CBRA, ONCE, and COND.*/
 
-  OP_ASSERT,         /* 87 Positive lookahead */
-  OP_ASSERT_NOT,     /* 88 Negative lookahead */
-  OP_ASSERTBACK,     /* 89 Positive lookbehind */
-  OP_ASSERTBACK_NOT, /* 90 Negative lookbehind */
-  OP_REVERSE,        /* 91 Move pointer back - used in lookbehind assertions */
+  OP_ASSERT,         /* 88 Positive lookahead */
+  OP_ASSERT_NOT,     /* 89 Negative lookahead */
+  OP_ASSERTBACK,     /* 90 Positive lookbehind */
+  OP_ASSERTBACK_NOT, /* 91 Negative lookbehind */
+  OP_REVERSE,        /* 92 Move pointer back - used in lookbehind assertions */
 
   /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first,
   as there's a test for >= ONCE for a subpattern that isn't an assertion. */
 
-  OP_ONCE,           /* 92 Atomic group */
-  OP_BRA,            /* 93 Start of non-capturing bracket */
-  OP_CBRA,           /* 94 Start of capturing bracket */
-  OP_COND,           /* 95 Conditional group */
+  OP_ONCE,           /* 93 Atomic group */
+  OP_BRA,            /* 94 Start of non-capturing bracket */
+  OP_CBRA,           /* 95 Start of capturing bracket */
+  OP_COND,           /* 96 Conditional group */
 
   /* These three must follow the previous three, in the same order. There's a
   check for >= SBRA to distinguish the two sets. */
 
-  OP_SBRA,           /* 96 Start of non-capturing bracket, check empty  */
-  OP_SCBRA,          /* 97 Start of capturing bracket, check empty */
-  OP_SCOND,          /* 98 Conditional group, check empty */
+  OP_SBRA,           /* 97 Start of non-capturing bracket, check empty  */
+  OP_SCBRA,          /* 98 Start of capturing bracket, check empty */
+  OP_SCOND,          /* 99 Conditional group, check empty */
 
-  OP_CREF,           /* 99 Used to hold a capture number as condition */
-  OP_RREF,           /* 100 Used to hold a recursion number as condition */
-  OP_DEF,            /* 101 The DEFINE condition */
+  OP_CREF,           /* 100 Used to hold a capture number as condition */
+  OP_RREF,           /* 101 Used to hold a recursion number as condition */
+  OP_DEF,            /* 102 The DEFINE condition */
 
-  OP_BRAZERO,        /* 102 These two must remain together and in this */
-  OP_BRAMINZERO,     /* 103 order. */
+  OP_BRAZERO,        /* 103 These two must remain together and in this */
+  OP_BRAMINZERO,     /* 104 order. */
 
   /* These are backtracking control verbs */
 
-  OP_PRUNE,          /* 104 */
-  OP_SKIP,           /* 105 */
-  OP_THEN,           /* 106 */
-  OP_COMMIT,         /* 107 */
+  OP_PRUNE,          /* 105 */
+  OP_SKIP,           /* 106 */
+  OP_THEN,           /* 107 */
+  OP_COMMIT,         /* 108 */
 
   /* These are forced failure and success verbs */
 
-  OP_FAIL,           /* 108 */
-  OP_ACCEPT,         /* 109 */
+  OP_FAIL,           /* 109 */
+  OP_ACCEPT,         /* 110 */
   
   /* This is used to skip a subpattern with a {0} quantifier */
  
-  OP_SKIPZERO        /* 110 */
+  OP_SKIPZERO        /* 111 */
 };
 
 
@@ -787,7 +792,7 @@ for debugging. The macro is referenced only in pcre_printint.c. */
 
 #define OP_NAME_LIST \
   "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d",         \
-  "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte",                   \
+  "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte",         \
   "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v",           \
   "extuni",  "\\Z", "\\z",                                        \
   "Opt", "^", "$", "char", "charnc", "not",                       \
@@ -820,7 +825,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
   1,                             /* End                                    */ \
   1, 1, 1, 1, 1,                 /* \A, \G, \K, \B, \b                     */ \
   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */ \
-  1, 1,                          /* Any, Anybyte                           */ \
+  1, 1, 1,                       /* Any, AllAny, Anybyte                   */ \
   3, 3, 1,                       /* NOTPROP, PROP, EXTUNI                  */ \
   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */ \
   1, 1, 2, 1, 1,                 /* \Z, \z, Opt, ^, $                      */ \
diff --git a/testdata/testinput2 b/testdata/testinput2
index 6c29b39..93b0f09 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -2692,4 +2692,36 @@ a random value. /Ix
 /(?&N)[]a(?<N>)](abc)/
    abc<abc
 
+/a[]b/
+
+/a[^]b/
+
+/a[]b/<JS>
+    ** Failers
+    ab
+
+/a[]+b/<JS>
+    ** Failers
+    ab 
+
+/a[]*+b/<JS>
+    ** Failers
+    ab 
+
+/a[^]b/<JS>
+    aXb
+    a\nb 
+    ** Failers
+    ab  
+    
+/a[^]+b/<JS> 
+    aXb
+    a\nX\nXb 
+    ** Failers
+    ab  
+
+/a(?!)+b/
+
+/a(*FAIL)+b/
+
 / End of testinput2 /
diff --git a/testdata/testinput5 b/testdata/testinput5
index 75a4857..8a8e499 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -461,4 +461,16 @@ can't tell the difference.) --/
 
 /[[:a\x{100}b:]]/8
 
+/a[^]b/<JS>8
+    a\x{1234}b
+    a\nb 
+    ** Failers
+    ab  
+    
+/a[^]+b/<JS>8
+    aXb
+    a\nX\nX\x{1234}b 
+    ** Failers
+    ab  
+
 / End of testinput5 /
diff --git a/testdata/testinput7 b/testdata/testinput7
index 221bc93..5d593ee 100644
--- a/testdata/testinput7
+++ b/testdata/testinput7
@@ -4364,5 +4364,32 @@
     a\r\r\r\r\rb 
     a\x85\85b\<bsr_anycrlf>
     a\x0b\0bb\<bsr_anycrlf>
+    
+/a(?!)|\wbc/
+    abc 
+
+/a[]b/<JS>
+    ** Failers
+    ab
+
+/a[]+b/<JS>
+    ** Failers
+    ab 
+
+/a[]*+b/<JS>
+    ** Failers
+    ab 
+
+/a[^]b/<JS>
+    aXb
+    a\nb 
+    ** Failers
+    ab  
+    
+/a[^]+b/<JS> 
+    aXb
+    a\nX\nXb 
+    ** Failers
+    ab  
 
 / End of testinput7 /
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index a2b9b8b..fe32975 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -9581,4 +9581,54 @@ Failed: reference to non-existent subpattern at offset 4
 /(?&N)[]a(?<N>)](abc)/
 Failed: reference to non-existent subpattern at offset 4
 
+/a[]b/
+Failed: missing terminating ] for character class at offset 4
+
+/a[^]b/
+Failed: missing terminating ] for character class at offset 5
+
+/a[]b/<JS>
+    ** Failers
+No match
+    ab
+No match
+
+/a[]+b/<JS>
+    ** Failers
+No match
+    ab 
+No match
+
+/a[]*+b/<JS>
+    ** Failers
+No match
+    ab 
+No match
+
+/a[^]b/<JS>
+    aXb
+ 0: aXb
+    a\nb 
+ 0: a\x0ab
+    ** Failers
+No match
+    ab  
+No match
+    
+/a[^]+b/<JS> 
+    aXb
+ 0: aXb
+    a\nX\nXb 
+ 0: a\x0aX\x0aXb
+    ** Failers
+No match
+    ab  
+No match
+
+/a(?!)+b/
+Failed: nothing to repeat at offset 5
+
+/a(*FAIL)+b/
+Failed: nothing to repeat at offset 8
+
 / End of testinput2 /
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index abbe1c8..1c745f4 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -1608,4 +1608,24 @@ No match
 /[[:a\x{100}b:]]/8
 Failed: unknown POSIX class name at offset 3
 
+/a[^]b/<JS>8
+    a\x{1234}b
+ 0: a\x{1234}b
+    a\nb 
+ 0: a\x{0a}b
+    ** Failers
+No match
+    ab  
+No match
+    
+/a[^]+b/<JS>8
+    aXb
+ 0: aXb
+    a\nX\nX\x{1234}b 
+ 0: a\x{0a}X\x{0a}X\x{1234}b
+    ** Failers
+No match
+    ab  
+No match
+
 / End of testinput5 /
diff --git a/testdata/testoutput7 b/testdata/testoutput7
index d8e3833..9ded29d 100644
--- a/testdata/testoutput7
+++ b/testdata/testoutput7
@@ -7211,5 +7211,47 @@ No match
 No match
     a\x0b\0bb\<bsr_anycrlf>
 No match
+    
+/a(?!)|\wbc/
+    abc 
+ 0: abc
+
+/a[]b/<JS>
+    ** Failers
+No match
+    ab
+No match
+
+/a[]+b/<JS>
+    ** Failers
+No match
+    ab 
+No match
+
+/a[]*+b/<JS>
+    ** Failers
+No match
+    ab 
+No match
+
+/a[^]b/<JS>
+    aXb
+ 0: aXb
+    a\nb 
+ 0: a\x0ab
+    ** Failers
+No match
+    ab  
+No match
+    
+/a[^]+b/<JS> 
+    aXb
+ 0: aXb
+    a\nX\nXb 
+ 0: a\x0aX\x0aXb
+    ** Failers
+No match
+    ab  
+No match
 
 / End of testinput7 /
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2008-04-19 16:41:04 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2008-04-19 16:41:04 +0000
commit	c66bac83039352a3a193c9db0b156fe8faa71296 (patch)
tree	a05cc60b6b0413efaebd9591638eaf2430f28ff2
parent	3bbe280000d68a6360bae77fdf03737822bf020d (diff)
download	pcre-c66bac83039352a3a193c9db0b156fe8faa71296.tar.gz