diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2008-04-19 16:41:04 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2008-04-19 16:41:04 +0000 |
commit | c66bac83039352a3a193c9db0b156fe8faa71296 (patch) | |
tree | a05cc60b6b0413efaebd9591638eaf2430f28ff2 | |
parent | 3bbe280000d68a6360bae77fdf03737822bf020d (diff) | |
download | pcre-c66bac83039352a3a193c9db0b156fe8faa71296.tar.gz |
Fix DFA (?!) bug; add support for JavaScript empty classes.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@341 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | ChangeLog | 17 | ||||
-rw-r--r-- | doc/pcrematching.3 | 7 | ||||
-rw-r--r-- | doc/pcrepattern.3 | 9 | ||||
-rw-r--r-- | pcre_compile.c | 38 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 29 | ||||
-rw-r--r-- | pcre_exec.c | 40 | ||||
-rw-r--r-- | pcre_internal.h | 247 | ||||
-rw-r--r-- | testdata/testinput2 | 32 | ||||
-rw-r--r-- | testdata/testinput5 | 12 | ||||
-rw-r--r-- | testdata/testinput7 | 27 | ||||
-rw-r--r-- | testdata/testoutput2 | 50 | ||||
-rw-r--r-- | testdata/testoutput5 | 20 | ||||
-rw-r--r-- | testdata/testoutput7 | 42 |
13 files changed, 418 insertions, 152 deletions
@@ -58,11 +58,21 @@ Version 7.7 05-Mar-08 pattern, with a new opcode that causes them to be skipped at execution time. -13. Added the PCRE_JAVASCRIPT_COMPAT option. This currently does two things: +13. Added the PCRE_JAVASCRIPT_COMPAT option. This makes the following changes + to the way PCRE behaves: + (a) A lone ] character is dis-allowed (Perl treats it as data). + (b) A back reference to an unmatched subpattern matches an empty string (Perl fails the current match path). + (c) A data ] in a character class must be notated as \] because if the + first data character in a class is ], it defines an empty class. (In + Perl it is not possible to have an empty class.) The empty class [] + never matches; it forces failure and is equivalent to (*FAIL) or (?!). + The negative empty class [^] matches any one character, independently + of the DOTALL setting. + 14. A pattern such as /(?2)[]a()b](abc)/ which had a forward reference to a non-existent subpattern following a character class starting with ']' and containing () gave an internal compiling error instead of "reference to @@ -71,6 +81,11 @@ Version 7.7 05-Mar-08 existence of the subpattern, it was treating the data ']' as terminating the class, so got the count wrong. When actually compiling, the reference was subsequently set up correctly.) + +15. The "always fail" assertion (?!) is optimized to (*FAIL) by pcre_compile; + it was being rejected as not supported by pcre_dfa_exec(), even though + other assertions are supported. I have made pcre_dfa_exec() support + (*FAIL). Version 7.6 28-Jan-08 diff --git a/doc/pcrematching.3 b/doc/pcrematching.3 index a91e9a0..560a48c 100644 --- a/doc/pcrematching.3 +++ b/doc/pcrematching.3 @@ -131,7 +131,8 @@ byte, even in UTF-8 mode, is not supported because the alternative algorithm moves through the subject string one character at a time, for all active paths through the tree. .P -8. None of the backtracking control verbs such as (*PRUNE) are supported. +8. 
Except for (*FAIL), the backtracking control verbs such as (*PRUNE) are not +supported. (*FAIL) is supported, and behaves like a failing negative assertion. . .SH "ADVANTAGES OF THE ALTERNATIVE ALGORITHM" .rs @@ -182,6 +183,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 08 August 2007 -Copyright (c) 1997-2007 University of Cambridge. +Last updated: 19 April 2008 +Copyright (c) 1997-2008 University of Cambridge. .fi diff --git a/doc/pcrepattern.3 b/doc/pcrepattern.3 index 2727b86..1160a4a 100644 --- a/doc/pcrepattern.3 +++ b/doc/pcrepattern.3 @@ -2115,9 +2115,10 @@ or removal in a future version of Perl". It goes on to say: "Their usage in production code should be noted to avoid problems during upgrades." The same remarks apply to the PCRE features described in this section. .P -Since these verbs are specifically related to backtracking, they can be used -only when the pattern is to be matched using \fBpcre_exec()\fP, which uses a -backtracking algorithm. They cause an error if encountered by +Since these verbs are specifically related to backtracking, most of them can be +used only when the pattern is to be matched using \fBpcre_exec()\fP, which uses +a backtracking algorithm. With the exception of (*FAIL), which behaves like a +failing negative assertion, they cause an error if encountered by \fBpcre_dfa_exec()\fP. .P The new verbs make use of what was previously invalid syntax: an opening @@ -2239,6 +2240,6 @@ Cambridge CB2 3QH, England. .rs .sp .nf -Last updated: 12 April 2008 +Last updated: 19 April 2008 Copyright (c) 1997-2008 University of Cambridge. .fi diff --git a/pcre_compile.c b/pcre_compile.c index 81a76b2..e4dd87b 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -976,7 +976,7 @@ be terminated by '>' because that is checked in the first pass. 
Arguments: ptr current position in the pattern - count current count of capturing parens so far encountered + cd compile background data name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x mode @@ -985,10 +985,11 @@ Returns: the number of the named subpattern, or -1 if not found */ static int -find_parens(const uschar *ptr, int count, const uschar *name, int lorn, +find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn, BOOL xmode) { const uschar *thisname; +int count = cd->bracount; for (; *ptr != 0; ptr++) { @@ -1031,9 +1032,10 @@ for (; *ptr != 0; ptr++) } /* If the next character is ']', it is a data character that must be - skipped. */ + skipped, except in JavaScript compatibility mode. */ - if (ptr[1] == ']') ptr++; + if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) + ptr++; while (*(++ptr) != ']') { @@ -2721,6 +2723,19 @@ for (;; ptr++) negate_class = TRUE; else break; } + + /* Empty classes are allowed in JavaScript compatibility mode. Otherwise, + an initial ']' is taken as a data character -- the code below handles + that. In JS mode, [] must always fail, so generate OP_FAIL, whereas + [^] must match any character, so generate OP_ALLANY. */ + + if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) + { + *code++ = negate_class? OP_ALLANY : OP_FAIL; + if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + zerofirstbyte = firstbyte; + break; + } /* If a class contains a negative special such as \S, we need to flip the negation flag at the end, so that support for characters > 255 works @@ -4102,6 +4117,13 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ } } } + + /* If previous is OP_FAIL, it was generated by an empty class [] in + JavaScript mode. The other ways in which OP_FAIL can be generated, that is + by (*FAIL) or (?!) 
set previous to NULL, which gives a "nothing to repeat" + error above. We can just ignore the repeat in JS case. */ + + else if (*previous == OP_FAIL) goto END_REPEAT; /* Else there's some kind of shambles */ @@ -4389,7 +4411,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ /* Search the pattern for a forward reference */ - else if ((i = find_parens(ptr, cd->bracount, name, namelen, + else if ((i = find_parens(ptr, cd, name, namelen, (options & PCRE_EXTENDED) != 0)) > 0) { PUT2(code, 2+LINK_SIZE, i); @@ -4686,7 +4708,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ recno = GET2(slot, 0); } else if ((recno = /* Forward back reference */ - find_parens(ptr, cd->bracount, name, namelen, + find_parens(ptr, cd, name, namelen, (options & PCRE_EXTENDED) != 0)) <= 0) { *errorcodeptr = ERR15; @@ -4796,8 +4818,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */ if (called == NULL) { - if (find_parens(ptr, cd->bracount, NULL, recno, - (options & PCRE_EXTENDED) != 0) < 0) + if (find_parens(ptr, cd, NULL, recno, + (options & PCRE_EXTENDED) != 0) < 0) { *errorcodeptr = ERR15; goto FAILED; diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c index 427c46f..0c8f219 100644 --- a/pcre_dfa_exec.c +++ b/pcre_dfa_exec.c @@ -88,7 +88,7 @@ static const uschar coptable[] = { 0, /* End */ 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ - 0, 0, /* Any, Anybyte */ + 0, 0, 0, /* Any, AllAny, Anybyte */ 0, 0, 0, /* NOTPROP, PROP, EXTUNI */ 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */ @@ -132,7 +132,7 @@ static const uschar coptable[] = { 0, /* DEF */ 0, 0, /* BRAZERO, BRAMINZERO */ 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ - 0, 0 /* FAIL, ACCEPT */ + 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */ }; /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, @@ -143,7 +143,7 @@ static const uschar toptable1[] = { ctype_digit, 
ctype_digit, ctype_space, ctype_space, ctype_word, ctype_word, - 0 /* OP_ANY */ + 0, 0 /* OP_ANY, OP_ALLANY */ }; static const uschar toptable2[] = { @@ -151,7 +151,7 @@ static const uschar toptable2[] = { ctype_digit, 0, ctype_space, 0, ctype_word, 0, - 1 /* OP_ANY */ + 1, 1 /* OP_ANY, OP_ALLANY */ }; @@ -223,8 +223,8 @@ Arguments: rlevel function call recursion level recursing regex recursive call level -Returns: > 0 => - = 0 => +Returns: > 0 => number of match offset pairs placed in offsets + = 0 => offsets overflowed; longest matches are present -1 => failed to match < -1 => some kind of unexpected problem @@ -744,6 +744,12 @@ for (;;) break; /*-----------------------------------------------------------------*/ + case OP_ALLANY: + if (clen > 0) + { ADD_NEW(state_offset + 1, 0); } + break; + + /*-----------------------------------------------------------------*/ case OP_EODN: if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen)) { ADD_ACTIVE(state_offset + 1, 0); } @@ -859,8 +865,8 @@ for (;;) /* ========================================================================== */ /* These opcodes likewise inspect the subject character, but have an argument that is not a data character. It is one of these opcodes: - OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR, - OP_NOT_WORDCHAR. The value is loaded into d. */ + OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, + OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */ case OP_TYPEPLUS: case OP_TYPEMINPLUS: @@ -2169,7 +2175,12 @@ for (;;) /* ========================================================================== */ /* These are the opcodes for fancy brackets of various kinds. We have - to use recursion in order to handle them. */ + to use recursion in order to handle them. The "always failing" assertion + (?!) is optimised when compiling to OP_FAIL, so we have to support that, + though the other "backtracking verbs" are not supported. 
*/ + + case OP_FAIL: + break; case OP_ASSERT: case OP_ASSERT_NOT: diff --git a/pcre_exec.c b/pcre_exec.c index dceb244..526658a 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -1433,6 +1433,9 @@ for (;;) { if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); } + /* Fall through */ + + case OP_ALLANY: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; @@ -2960,6 +2963,15 @@ for (;;) } break; + case OP_ALLANY: + for (i = 1; i <= min; i++) + { + if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + } + break; + case OP_ANYBYTE: eptr += min; break; @@ -3179,6 +3191,10 @@ for (;;) else eptr += min; break; + case OP_ALLANY: + eptr += min; + break; + case OP_ANYBYTE: eptr += min; break; @@ -3441,8 +3457,7 @@ for (;;) switch(ctype) { case OP_ANY: /* This is the DOTALL case */ - break; - + case OP_ALLANY: case OP_ANYBYTE: break; @@ -3600,9 +3615,8 @@ for (;;) c = *eptr++; switch(ctype) { - case OP_ANY: /* This is the DOTALL case */ - break; - + case OP_ANY: /* This is the DOTALL case */ + case OP_ALLANY: case OP_ANYBYTE: break; @@ -3896,6 +3910,19 @@ for (;;) } break; + case OP_ALLANY: + if (max < INT_MAX) + { + for (i = min; i < max; i++) + { + if (eptr >= md->end_subject) break; + eptr++; + while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; + } + } + else eptr = md->end_subject; /* Unlimited UTF-8 repeat */ + break; + /* The byte case is the same as non-UTF8 */ case OP_ANYBYTE: @@ -4090,8 +4117,9 @@ for (;;) } break; } - /* For DOTALL case, fall through and treat as \C */ + /* For DOTALL case, fall through */ + case OP_ALLANY: case OP_ANYBYTE: c = max - min; if (c > (unsigned int)(md->end_subject - eptr)) diff --git a/pcre_internal.h b/pcre_internal.h index 54d9c01..16c723e 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -605,16 +605,20 @@ contain UTF-8 characters with values greater than 255. 
*/ value such as \n. They must have non-zero values, as check_escape() returns their negation. Also, they must appear in the same order as in the opcode definitions below, up to ESC_z. There's a dummy for OP_ANY because it -corresponds to "." rather than an escape sequence. The final one must be -ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc). -There are two tests in the code for an escape greater than ESC_b and less than -ESC_Z to detect the types that may be repeated. These are the types that -consume characters. If any new escapes are put in between that don't consume a -character, that code will have to change. */ +corresponds to "." rather than an escape sequence, and another for OP_ALLANY +(which is used for [^] in JavaScript compatibility mode). + +The final escape must be ESC_REF as subsequent values are used for +backreferences (\1, \2, \3, etc). There are two tests in the code for an escape +greater than ESC_b and less than ESC_Z to detect the types that may be +repeated. These are the types that consume characters. If any new escapes are +put in between that don't consume a character, that code will have to change. +*/ enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, - ESC_W, ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, ESC_h, - ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k, ESC_REF }; + ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, + ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k, + ESC_REF }; /* Opcode table: Starting from 1 (i.e. 
after OP_END), the values up to @@ -640,145 +644,146 @@ enum { OP_WHITESPACE, /* 9 \s */ OP_NOT_WORDCHAR, /* 10 \W */ OP_WORDCHAR, /* 11 \w */ - OP_ANY, /* 12 Match any character */ - OP_ANYBYTE, /* 13 Match any byte (\C); different to OP_ANY for UTF-8 */ - OP_NOTPROP, /* 14 \P (not Unicode property) */ - OP_PROP, /* 15 \p (Unicode property) */ - OP_ANYNL, /* 16 \R (any newline sequence) */ - OP_NOT_HSPACE, /* 17 \H (not horizontal whitespace) */ - OP_HSPACE, /* 18 \h (horizontal whitespace) */ - OP_NOT_VSPACE, /* 19 \V (not vertical whitespace) */ - OP_VSPACE, /* 20 \v (vertical whitespace) */ - OP_EXTUNI, /* 21 \X (extended Unicode sequence */ - OP_EODN, /* 22 End of data or \n at end of data: \Z. */ - OP_EOD, /* 23 End of data: \z */ - - OP_OPT, /* 24 Set runtime options */ - OP_CIRC, /* 25 Start of line - varies with multiline switch */ - OP_DOLL, /* 26 End of line - varies with multiline switch */ - OP_CHAR, /* 27 Match one character, casefully */ - OP_CHARNC, /* 28 Match one character, caselessly */ - OP_NOT, /* 29 Match one character, not the following one */ - - OP_STAR, /* 30 The maximizing and minimizing versions of */ - OP_MINSTAR, /* 31 these six opcodes must come in pairs, with */ - OP_PLUS, /* 32 the minimizing one second. */ - OP_MINPLUS, /* 33 This first set applies to single characters.*/ - OP_QUERY, /* 34 */ - OP_MINQUERY, /* 35 */ - - OP_UPTO, /* 36 From 0 to n matches */ - OP_MINUPTO, /* 37 */ - OP_EXACT, /* 38 Exactly n matches */ - - OP_POSSTAR, /* 39 Possessified star */ - OP_POSPLUS, /* 40 Possessified plus */ - OP_POSQUERY, /* 41 Posesssified query */ - OP_POSUPTO, /* 42 Possessified upto */ - - OP_NOTSTAR, /* 43 The maximizing and minimizing versions of */ - OP_NOTMINSTAR, /* 44 these six opcodes must come in pairs, with */ - OP_NOTPLUS, /* 45 the minimizing one second. They must be in */ - OP_NOTMINPLUS, /* 46 exactly the same order as those above. */ - OP_NOTQUERY, /* 47 This set applies to "not" single characters. 
*/ - OP_NOTMINQUERY, /* 48 */ - - OP_NOTUPTO, /* 49 From 0 to n matches */ - OP_NOTMINUPTO, /* 50 */ - OP_NOTEXACT, /* 51 Exactly n matches */ - - OP_NOTPOSSTAR, /* 52 Possessified versions */ - OP_NOTPOSPLUS, /* 53 */ - OP_NOTPOSQUERY, /* 54 */ - OP_NOTPOSUPTO, /* 55 */ - - OP_TYPESTAR, /* 56 The maximizing and minimizing versions of */ - OP_TYPEMINSTAR, /* 57 these six opcodes must come in pairs, with */ - OP_TYPEPLUS, /* 58 the minimizing one second. These codes must */ - OP_TYPEMINPLUS, /* 59 be in exactly the same order as those above. */ - OP_TYPEQUERY, /* 60 This set applies to character types such as \d */ - OP_TYPEMINQUERY, /* 61 */ - - OP_TYPEUPTO, /* 62 From 0 to n matches */ - OP_TYPEMINUPTO, /* 63 */ - OP_TYPEEXACT, /* 64 Exactly n matches */ - - OP_TYPEPOSSTAR, /* 65 Possessified versions */ - OP_TYPEPOSPLUS, /* 66 */ - OP_TYPEPOSQUERY, /* 67 */ - OP_TYPEPOSUPTO, /* 68 */ - - OP_CRSTAR, /* 69 The maximizing and minimizing versions of */ - OP_CRMINSTAR, /* 70 all these opcodes must come in pairs, with */ - OP_CRPLUS, /* 71 the minimizing one second. These codes must */ - OP_CRMINPLUS, /* 72 be in exactly the same order as those above. */ - OP_CRQUERY, /* 73 These are for character classes and back refs */ - OP_CRMINQUERY, /* 74 */ - OP_CRRANGE, /* 75 These are different to the three sets above. 
*/ - OP_CRMINRANGE, /* 76 */ - - OP_CLASS, /* 77 Match a character class, chars < 256 only */ - OP_NCLASS, /* 78 Same, but the bitmap was created from a negative + OP_ANY, /* 12 Match any character (subject to DOTALL) */ + OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */ + OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */ + OP_NOTPROP, /* 15 \P (not Unicode property) */ + OP_PROP, /* 16 \p (Unicode property) */ + OP_ANYNL, /* 17 \R (any newline sequence) */ + OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */ + OP_HSPACE, /* 19 \h (horizontal whitespace) */ + OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */ + OP_VSPACE, /* 21 \v (vertical whitespace) */ + OP_EXTUNI, /* 22 \X (extended Unicode sequence */ + OP_EODN, /* 23 End of data or \n at end of data: \Z. */ + OP_EOD, /* 24 End of data: \z */ + + OP_OPT, /* 25 Set runtime options */ + OP_CIRC, /* 26 Start of line - varies with multiline switch */ + OP_DOLL, /* 27 End of line - varies with multiline switch */ + OP_CHAR, /* 28 Match one character, casefully */ + OP_CHARNC, /* 29 Match one character, caselessly */ + OP_NOT, /* 30 Match one character, not the following one */ + + OP_STAR, /* 31 The maximizing and minimizing versions of */ + OP_MINSTAR, /* 32 these six opcodes must come in pairs, with */ + OP_PLUS, /* 33 the minimizing one second. */ + OP_MINPLUS, /* 34 This first set applies to single characters.*/ + OP_QUERY, /* 35 */ + OP_MINQUERY, /* 36 */ + + OP_UPTO, /* 37 From 0 to n matches */ + OP_MINUPTO, /* 38 */ + OP_EXACT, /* 39 Exactly n matches */ + + OP_POSSTAR, /* 40 Possessified star */ + OP_POSPLUS, /* 41 Possessified plus */ + OP_POSQUERY, /* 42 Posesssified query */ + OP_POSUPTO, /* 43 Possessified upto */ + + OP_NOTSTAR, /* 44 The maximizing and minimizing versions of */ + OP_NOTMINSTAR, /* 45 these six opcodes must come in pairs, with */ + OP_NOTPLUS, /* 46 the minimizing one second. 
They must be in */ + OP_NOTMINPLUS, /* 47 exactly the same order as those above. */ + OP_NOTQUERY, /* 48 This set applies to "not" single characters. */ + OP_NOTMINQUERY, /* 49 */ + + OP_NOTUPTO, /* 50 From 0 to n matches */ + OP_NOTMINUPTO, /* 51 */ + OP_NOTEXACT, /* 52 Exactly n matches */ + + OP_NOTPOSSTAR, /* 53 Possessified versions */ + OP_NOTPOSPLUS, /* 54 */ + OP_NOTPOSQUERY, /* 55 */ + OP_NOTPOSUPTO, /* 56 */ + + OP_TYPESTAR, /* 57 The maximizing and minimizing versions of */ + OP_TYPEMINSTAR, /* 58 these six opcodes must come in pairs, with */ + OP_TYPEPLUS, /* 59 the minimizing one second. These codes must */ + OP_TYPEMINPLUS, /* 60 be in exactly the same order as those above. */ + OP_TYPEQUERY, /* 61 This set applies to character types such as \d */ + OP_TYPEMINQUERY, /* 62 */ + + OP_TYPEUPTO, /* 63 From 0 to n matches */ + OP_TYPEMINUPTO, /* 64 */ + OP_TYPEEXACT, /* 65 Exactly n matches */ + + OP_TYPEPOSSTAR, /* 66 Possessified versions */ + OP_TYPEPOSPLUS, /* 67 */ + OP_TYPEPOSQUERY, /* 68 */ + OP_TYPEPOSUPTO, /* 69 */ + + OP_CRSTAR, /* 70 The maximizing and minimizing versions of */ + OP_CRMINSTAR, /* 71 all these opcodes must come in pairs, with */ + OP_CRPLUS, /* 72 the minimizing one second. These codes must */ + OP_CRMINPLUS, /* 73 be in exactly the same order as those above. */ + OP_CRQUERY, /* 74 These are for character classes and back refs */ + OP_CRMINQUERY, /* 75 */ + OP_CRRANGE, /* 76 These are different to the three sets above. */ + OP_CRMINRANGE, /* 77 */ + + OP_CLASS, /* 78 Match a character class, chars < 256 only */ + OP_NCLASS, /* 79 Same, but the bitmap was created from a negative class - the difference is relevant only when a UTF-8 character > 255 is encountered. */ - OP_XCLASS, /* 79 Extended class for handling UTF-8 chars within the + OP_XCLASS, /* 80 Extended class for handling UTF-8 chars within the class. This does both positive and negative. 
*/ - OP_REF, /* 80 Match a back reference */ - OP_RECURSE, /* 81 Match a numbered subpattern (possibly recursive) */ - OP_CALLOUT, /* 82 Call out to external function if provided */ + OP_REF, /* 81 Match a back reference */ + OP_RECURSE, /* 82 Match a numbered subpattern (possibly recursive) */ + OP_CALLOUT, /* 83 Call out to external function if provided */ - OP_ALT, /* 83 Start of alternation */ - OP_KET, /* 84 End of group that doesn't have an unbounded repeat */ - OP_KETRMAX, /* 85 These two must remain together and in this */ - OP_KETRMIN, /* 86 order. They are for groups the repeat for ever. */ + OP_ALT, /* 84 Start of alternation */ + OP_KET, /* 85 End of group that doesn't have an unbounded repeat */ + OP_KETRMAX, /* 86 These two must remain together and in this */ + OP_KETRMIN, /* 87 order. They are for groups the repeat for ever. */ /* The assertions must come before BRA, CBRA, ONCE, and COND.*/ - OP_ASSERT, /* 87 Positive lookahead */ - OP_ASSERT_NOT, /* 88 Negative lookahead */ - OP_ASSERTBACK, /* 89 Positive lookbehind */ - OP_ASSERTBACK_NOT, /* 90 Negative lookbehind */ - OP_REVERSE, /* 91 Move pointer back - used in lookbehind assertions */ + OP_ASSERT, /* 88 Positive lookahead */ + OP_ASSERT_NOT, /* 89 Negative lookahead */ + OP_ASSERTBACK, /* 90 Positive lookbehind */ + OP_ASSERTBACK_NOT, /* 91 Negative lookbehind */ + OP_REVERSE, /* 92 Move pointer back - used in lookbehind assertions */ /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first, as there's a test for >= ONCE for a subpattern that isn't an assertion. */ - OP_ONCE, /* 92 Atomic group */ - OP_BRA, /* 93 Start of non-capturing bracket */ - OP_CBRA, /* 94 Start of capturing bracket */ - OP_COND, /* 95 Conditional group */ + OP_ONCE, /* 93 Atomic group */ + OP_BRA, /* 94 Start of non-capturing bracket */ + OP_CBRA, /* 95 Start of capturing bracket */ + OP_COND, /* 96 Conditional group */ /* These three must follow the previous three, in the same order. 
There's a check for >= SBRA to distinguish the two sets. */ - OP_SBRA, /* 96 Start of non-capturing bracket, check empty */ - OP_SCBRA, /* 97 Start of capturing bracket, check empty */ - OP_SCOND, /* 98 Conditional group, check empty */ + OP_SBRA, /* 97 Start of non-capturing bracket, check empty */ + OP_SCBRA, /* 98 Start of capturing bracket, check empty */ + OP_SCOND, /* 99 Conditional group, check empty */ - OP_CREF, /* 99 Used to hold a capture number as condition */ - OP_RREF, /* 100 Used to hold a recursion number as condition */ - OP_DEF, /* 101 The DEFINE condition */ + OP_CREF, /* 100 Used to hold a capture number as condition */ + OP_RREF, /* 101 Used to hold a recursion number as condition */ + OP_DEF, /* 102 The DEFINE condition */ - OP_BRAZERO, /* 102 These two must remain together and in this */ - OP_BRAMINZERO, /* 103 order. */ + OP_BRAZERO, /* 103 These two must remain together and in this */ + OP_BRAMINZERO, /* 104 order. */ /* These are backtracking control verbs */ - OP_PRUNE, /* 104 */ - OP_SKIP, /* 105 */ - OP_THEN, /* 106 */ - OP_COMMIT, /* 107 */ + OP_PRUNE, /* 105 */ + OP_SKIP, /* 106 */ + OP_THEN, /* 107 */ + OP_COMMIT, /* 108 */ /* These are forced failure and success verbs */ - OP_FAIL, /* 108 */ - OP_ACCEPT, /* 109 */ + OP_FAIL, /* 109 */ + OP_ACCEPT, /* 110 */ /* This is used to skip a subpattern with a {0} quantifier */ - OP_SKIPZERO /* 110 */ + OP_SKIPZERO /* 111 */ }; @@ -787,7 +792,7 @@ for debugging. The macro is referenced only in pcre_printint.c. */ #define OP_NAME_LIST \ "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \ - "\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \ + "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \ "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \ "extuni", "\\Z", "\\z", \ "Opt", "^", "$", "char", "charnc", "not", \ @@ -820,7 +825,7 @@ in UTF-8 mode. The code that uses this table must know about such things. 
*/ 1, /* End */ \ 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \ 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \ - 1, 1, /* Any, Anybyte */ \ + 1, 1, 1, /* Any, AllAny, Anybyte */ \ 3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \ 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \ 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ diff --git a/testdata/testinput2 b/testdata/testinput2 index 6c29b39..93b0f09 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -2692,4 +2692,36 @@ a random value. /Ix /(?&N)[]a(?<N>)](abc)/ abc<abc +/a[]b/ + +/a[^]b/ + +/a[]b/<JS> + ** Failers + ab + +/a[]+b/<JS> + ** Failers + ab + +/a[]*+b/<JS> + ** Failers + ab + +/a[^]b/<JS> + aXb + a\nb + ** Failers + ab + +/a[^]+b/<JS> + aXb + a\nX\nXb + ** Failers + ab + +/a(?!)+b/ + +/a(*FAIL)+b/ + / End of testinput2 / diff --git a/testdata/testinput5 b/testdata/testinput5 index 75a4857..8a8e499 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -461,4 +461,16 @@ can't tell the difference.) --/ /[[:a\x{100}b:]]/8 +/a[^]b/<JS>8 + a\x{1234}b + a\nb + ** Failers + ab + +/a[^]+b/<JS>8 + aXb + a\nX\nX\x{1234}b + ** Failers + ab + / End of testinput5 / diff --git a/testdata/testinput7 b/testdata/testinput7 index 221bc93..5d593ee 100644 --- a/testdata/testinput7 +++ b/testdata/testinput7 @@ -4364,5 +4364,32 @@ a\r\r\r\r\rb a\x85\85b\<bsr_anycrlf> a\x0b\0bb\<bsr_anycrlf> + +/a(?!)|\wbc/ + abc + +/a[]b/<JS> + ** Failers + ab + +/a[]+b/<JS> + ** Failers + ab + +/a[]*+b/<JS> + ** Failers + ab + +/a[^]b/<JS> + aXb + a\nb + ** Failers + ab + +/a[^]+b/<JS> + aXb + a\nX\nXb + ** Failers + ab / End of testinput7 / diff --git a/testdata/testoutput2 b/testdata/testoutput2 index a2b9b8b..fe32975 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -9581,4 +9581,54 @@ Failed: reference to non-existent subpattern at offset 4 /(?&N)[]a(?<N>)](abc)/ Failed: reference to non-existent subpattern at offset 4 +/a[]b/ +Failed: missing terminating ] for character class at offset 4 + +/a[^]b/ +Failed: missing terminating ] 
for character class at offset 5 + +/a[]b/<JS> + ** Failers +No match + ab +No match + +/a[]+b/<JS> + ** Failers +No match + ab +No match + +/a[]*+b/<JS> + ** Failers +No match + ab +No match + +/a[^]b/<JS> + aXb + 0: aXb + a\nb + 0: a\x0ab + ** Failers +No match + ab +No match + +/a[^]+b/<JS> + aXb + 0: aXb + a\nX\nXb + 0: a\x0aX\x0aXb + ** Failers +No match + ab +No match + +/a(?!)+b/ +Failed: nothing to repeat at offset 5 + +/a(*FAIL)+b/ +Failed: nothing to repeat at offset 8 + / End of testinput2 / diff --git a/testdata/testoutput5 b/testdata/testoutput5 index abbe1c8..1c745f4 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -1608,4 +1608,24 @@ No match /[[:a\x{100}b:]]/8 Failed: unknown POSIX class name at offset 3 +/a[^]b/<JS>8 + a\x{1234}b + 0: a\x{1234}b + a\nb + 0: a\x{0a}b + ** Failers +No match + ab +No match + +/a[^]+b/<JS>8 + aXb + 0: aXb + a\nX\nX\x{1234}b + 0: a\x{0a}X\x{0a}X\x{1234}b + ** Failers +No match + ab +No match + / End of testinput5 / diff --git a/testdata/testoutput7 b/testdata/testoutput7 index d8e3833..9ded29d 100644 --- a/testdata/testoutput7 +++ b/testdata/testoutput7 @@ -7211,5 +7211,47 @@ No match No match a\x0b\0bb\<bsr_anycrlf> No match + +/a(?!)|\wbc/ + abc + 0: abc + +/a[]b/<JS> + ** Failers +No match + ab +No match + +/a[]+b/<JS> + ** Failers +No match + ab +No match + +/a[]*+b/<JS> + ** Failers +No match + ab +No match + +/a[^]b/<JS> + aXb + 0: aXb + a\nb + 0: a\x0ab + ** Failers +No match + ab +No match + +/a[^]+b/<JS> + aXb + 0: aXb + a\nX\nXb + 0: a\x0aX\x0aXb + ** Failers +No match + ab +No match / End of testinput7 / |