diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2013-10-14 13:54:07 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2013-10-14 13:54:07 +0000 |
commit | 019360748cb83ed81a5fb1a68466c9b23e70f867 (patch) | |
tree | a4b945f14e772cf539a4e6aad84822c30df8df49 | |
parent | 5bedf037b4d42e927e89cd4e5e7c789217a4df0d (diff) | |
download | pcre-019360748cb83ed81a5fb1a68466c9b23e70f867.tar.gz |
More auto-possessification additions, using possessive class repeats. These are
not yet used for explicit possessification.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@1379 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | pcre_compile.c | 89 | ||||
-rw-r--r-- | pcre_dfa_exec.c | 76 | ||||
-rw-r--r-- | pcre_exec.c | 29 | ||||
-rw-r--r-- | pcre_internal.h | 113 | ||||
-rw-r--r-- | pcre_jit_compile.c | 103 | ||||
-rw-r--r-- | pcre_printint.c | 5 | ||||
-rw-r--r-- | pcre_study.c | 11 | ||||
-rw-r--r-- | testdata/saved16BE-1 | bin | 410 -> 410 bytes | |||
-rw-r--r-- | testdata/saved16BE-2 | bin | 344 -> 344 bytes | |||
-rw-r--r-- | testdata/saved16LE-1 | bin | 410 -> 410 bytes | |||
-rw-r--r-- | testdata/saved16LE-2 | bin | 344 -> 344 bytes | |||
-rw-r--r-- | testdata/saved32BE-1 | bin | 552 -> 552 bytes | |||
-rw-r--r-- | testdata/saved32BE-2 | bin | 456 -> 456 bytes | |||
-rw-r--r-- | testdata/saved32LE-1 | bin | 552 -> 552 bytes | |||
-rw-r--r-- | testdata/saved32LE-2 | bin | 456 -> 456 bytes | |||
-rw-r--r-- | testdata/testinput2 | 30 | ||||
-rw-r--r-- | testdata/testinput8 | 32 | ||||
-rw-r--r-- | testdata/testoutput14 | 2 | ||||
-rw-r--r-- | testdata/testoutput2 | 116 | ||||
-rw-r--r-- | testdata/testoutput8 | 118 | ||||
-rw-r--r-- | testdata/testoutput9 | 2 |
21 files changed, 524 insertions, 202 deletions
diff --git a/pcre_compile.c b/pcre_compile.c index d56b7f8..44118ea 100644 --- a/pcre_compile.c +++ b/pcre_compile.c @@ -1817,16 +1817,20 @@ for (;;) switch (*cc) { - case OP_CRPLUS: - case OP_CRMINPLUS: case OP_CRSTAR: case OP_CRMINSTAR: + case OP_CRPLUS: + case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: return -1; case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1; branchlength += (int)GET2(cc,1); cc += 1 + 2 * IMM2_SIZE; @@ -2419,15 +2423,19 @@ for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSQUERY: break; default: /* Non-repeat => class must match */ case OP_CRPLUS: /* These repeats aren't empty */ case OP_CRMINPLUS: + case OP_CRPOSPLUS: return FALSE; case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */ break; } @@ -2920,12 +2928,21 @@ switch(c) case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSQUERY: list[1] = TRUE; end++; break; + case OP_CRPLUS: + case OP_CRMINPLUS: + case OP_CRPOSPLUS: + end++; + break; + case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: list[1] = (GET2(end, 1) == 0); end += 1 + 2 * IMM2_SIZE; break; @@ -2956,7 +2973,7 @@ Returns: TRUE if the auto-possessification is possible static BOOL compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd, - const pcre_uint32* base_list) + const pcre_uint32* base_list, const pcre_uchar *base_end) { pcre_uchar c; pcre_uint32 list[8]; @@ -2964,6 +2981,7 @@ const pcre_uint32* chr_ptr; const pcre_uint32* ochr_ptr; const pcre_uint32* list_ptr; const pcre_uchar *next_code; +const pcre_uint8 *class_bits; pcre_uint32 chr; /* Note: the base_list[1] contains whether the current opcode has greedy @@ -3039,7 +3057,7 @@ for(;;) while (*next_code == OP_ALT) { - if (!compare_opcodes(code, utf, cd, base_list)) return FALSE; + if (!compare_opcodes(code, utf, cd, base_list, base_end)) return FALSE; code = next_code + 1 + LINK_SIZE; next_code += GET(next_code, 1); } @@ -3061,7 +3079,7 @@ for(;;) /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */ next_code += 1 + LINK_SIZE; - if (!compare_opcodes(next_code, utf, cd, base_list)) return FALSE; + if (!compare_opcodes(next_code, utf, cd, base_list, base_end)) return FALSE; code += PRIV(OP_lengths)[c]; continue; @@ -3318,21 +3336,14 @@ for(;;) return FALSE; break; - /* The class comparisons work only when the class is the second item - of the pair, because there are at present no possessive forms of the - class opcodes. Note also that the "code" variable that is used below - points after the second item, and that the pointer for the first item - is not available, so even if there were possessive forms of the class - opcodes, the correct comparison could not be done. */ - case OP_NCLASS: if (chr > 255) return FALSE; /* Fall through */ case OP_CLASS: - if (list_ptr != list) return FALSE; /* Class is first opcode */ if (chr > 255) break; - if ((((pcre_uint8 *)(code - list_ptr[2]))[chr >> 3] & (1 << (chr & 7))) != 0) + class_bits = (pcre_uint8 *)((list_ptr == list ? code : base_end) - list_ptr[2]); + if ((class_bits[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE; break; @@ -3380,14 +3391,15 @@ Returns: nothing static void auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd) { -register pcre_uchar c; +register pcre_uchar c, d; const pcre_uchar *end; +pcre_uchar *repeat_code; pcre_uint32 list[8]; for (;;) { c = *code; - + if (c >= OP_STAR && c <= OP_TYPEPOSUPTO) { c -= get_repeat_base(c) - OP_STAR; @@ -3395,7 +3407,7 @@ for (;;) get_chr_property_list(code, utf, cd->fcc, list) : NULL; list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO; - if (end != NULL && compare_opcodes(end, utf, cd, list)) + if (end != NULL && compare_opcodes(end, utf, cd, list, end)) { switch(c) { @@ -3434,6 +3446,47 @@ for (;;) } c = *code; } + else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS) + { +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 + if (c == OP_XCLASS) + repeat_code = code + 1 + GET(code, 1); + else +#endif + repeat_code = code + 1 + (32 / sizeof(pcre_uchar)); + + d = *repeat_code; + if (d >= OP_CRSTAR && d <= OP_CRMINRANGE) + { + /* end must not be NULL. */ + end = get_chr_property_list(code, utf, cd->fcc, list); + + list[1] = d == OP_CRSTAR || d == OP_CRPLUS || d == OP_CRQUERY || + d == OP_CRRANGE; + + if (compare_opcodes(end, utf, cd, list, end)) + { + switch (d) + { + case OP_CRSTAR: + *repeat_code = OP_CRPOSSTAR; + break; + + case OP_CRPLUS: + *repeat_code = OP_CRPOSPLUS; + break; + + case OP_CRQUERY: + *repeat_code = OP_CRPOSQUERY; + break; + + case OP_CRRANGE: + *repeat_code = OP_CRPOSRANGE; + break; + } + } + } + } switch(c) { @@ -3460,9 +3513,11 @@ for (;;) code += 2; break; +#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 case OP_XCLASS: code += GET(code, 1); break; +#endif case OP_MARK: case OP_PRUNE_ARG: diff --git a/pcre_dfa_exec.c b/pcre_dfa_exec.c index 19fba5b..216a515 100644 --- a/pcre_dfa_exec.c +++ b/pcre_dfa_exec.c @@ -151,6 +151,7 @@ static const pcre_uint8 coptable[] = { /* Character class & ref repeats */ 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */ 0, 0, /* CRRANGE, CRMINRANGE */ + 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */ 0, /* CLASS */ 0, /* NCLASS */ 0, /* XCLASS - variable length */ @@ -222,6 +223,7 @@ static const pcre_uint8 poptable[] = { /* Character class & ref repeats */ 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ 1, 1, /* CRRANGE, CRMINRANGE */ + 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */ 1, /* CLASS */ 1, /* NCLASS */ 1, /* XCLASS - variable length */ @@ -1101,7 +1103,7 @@ for (;;) /* Perl space used to exclude VT, but from Perl 5.18 it is included, which means that Perl space and POSIX space are now identical. PCRE was changed at release 8.34. */ - + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ switch(c) @@ -1110,11 +1112,11 @@ for (;;) VSPACE_CASES: OK = TRUE; break; - - default: + + default: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; break; - } + } break; case PT_WORD: @@ -1359,7 +1361,7 @@ for (;;) /* Perl space used to exclude VT, but from Perl 5.18 it is included, which means that Perl space and POSIX space are now identical. PCRE was changed at release 8.34. */ - + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ switch(c) @@ -1368,11 +1370,11 @@ for (;;) VSPACE_CASES: OK = TRUE; break; - - default: + + default: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; break; - } + } break; case PT_WORD: @@ -1611,7 +1613,7 @@ for (;;) /* Perl space used to exclude VT, but from Perl 5.18 it is included, which means that Perl space and POSIX space are now identical. PCRE was changed at release 8.34. */ - + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ switch(c) @@ -1620,11 +1622,11 @@ for (;;) VSPACE_CASES: OK = TRUE; break; - - default: + + default: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; break; - } + } break; case PT_WORD: @@ -1888,7 +1890,7 @@ for (;;) /* Perl space used to exclude VT, but from Perl 5.18 it is included, which means that Perl space and POSIX space are now identical. PCRE was changed at release 8.34. */ - + case PT_SPACE: /* Perl space */ case PT_PXSPACE: /* POSIX space */ switch(c) @@ -1897,11 +1899,11 @@ for (;;) VSPACE_CASES: OK = TRUE; break; - - default: + + default: OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z; break; - } + } break; case PT_WORD: @@ -2569,31 +2571,65 @@ for (;;) { case OP_CRSTAR: case OP_CRMINSTAR: + case OP_CRPOSSTAR: ADD_ACTIVE(next_state_offset + 1, 0); - if (isinclass) { ADD_NEW(state_offset, 0); } + if (isinclass) + { + if (*ecode == OP_CRPOSSTAR) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(state_offset, 0); + } break; case OP_CRPLUS: case OP_CRMINPLUS: + case OP_CRPOSPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } - if (isinclass) { count++; ADD_NEW(state_offset, count); } + if (isinclass) + { + if (count > 0 && *ecode == OP_CRPOSPLUS) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + count++; + ADD_NEW(state_offset, count); + } break; case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSQUERY: ADD_ACTIVE(next_state_offset + 1, 0); - if (isinclass) { ADD_NEW(next_state_offset + 1, 0); } + if (isinclass) + { + if (*ecode == OP_CRPOSQUERY) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } + ADD_NEW(next_state_offset + 1, 0); + } break; case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: count = current_state->count; /* Already matched */ if (count >= (int)GET2(ecode, 1)) { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } if (isinclass) { int max = (int)GET2(ecode, 1 + IMM2_SIZE); + if (*ecode == OP_CRPOSRANGE) + { + active_count--; /* Remove non-match possibility */ + next_active_state--; + } if (++count >= max && max != 0) /* Max 0 => no limit */ { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); } else @@ -2696,7 +2732,7 @@ for (;;) /* Back reference conditions and duplicate named recursion conditions are not supported */ - if (condcode == OP_CREF || condcode == OP_DNCREF || + if (condcode == OP_CREF || condcode == OP_DNCREF || condcode == OP_DNRREF) return PCRE_ERROR_DFA_UCOND; diff --git a/pcre_exec.c b/pcre_exec.c index 7311aac..2470ee9 100644 --- a/pcre_exec.c +++ b/pcre_exec.c @@ -107,8 +107,8 @@ because the offset vector is always a multiple of 3 long. */ /* Min and max values for the common repeats; for the maxima, 0 => infinity */ -static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; -static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; +static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, }; +static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, }; #ifdef PCRE_DEBUG /************************************************* @@ -2864,8 +2864,12 @@ for (;;) case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: c = *ecode++ - OP_CRSTAR; - minimize = (c & 1) != 0; + if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; + else possessive = TRUE; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; @@ -2873,7 +2877,9 @@ for (;;) case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: minimize = (*ecode == OP_CRMINRANGE); + possessive = (*ecode == OP_CRPOSRANGE); min = GET2(ecode, 1); max = GET2(ecode, 1 + IMM2_SIZE); if (max == 0) max = INT_MAX; @@ -3015,6 +3021,9 @@ for (;;) if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; eptr += len; } + + if (possessive) continue; /* No backtracking */ + for (;;) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM18); @@ -3045,6 +3054,9 @@ for (;;) if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break; eptr++; } + + if (possessive) continue; /* No backtracking */ + while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM19); @@ -3078,8 +3090,12 @@ for (;;) case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: c = *ecode++ - OP_CRSTAR; - minimize = (c & 1) != 0; + if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0; + else possessive = TRUE; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; @@ -3087,7 +3103,9 @@ for (;;) case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: minimize = (*ecode == OP_CRMINRANGE); + possessive = (*ecode == OP_CRPOSRANGE); min = GET2(ecode, 1); max = GET2(ecode, 1 + IMM2_SIZE); if (max == 0) max = INT_MAX; @@ -3159,6 +3177,9 @@ for (;;) if (!PRIV(xclass)(c, data, utf)) break; eptr += len; } + + if (possessive) continue; /* No backtracking */ + for(;;) { RMATCH(eptr, ecode, offset_top, md, eptrb, RM21); diff --git a/pcre_internal.h b/pcre_internal.h index 124d28c..c483e4c 100644 --- a/pcre_internal.h +++ b/pcre_internal.h @@ -2070,91 +2070,96 @@ enum { OP_CRRANGE, /* 104 These are different to the three sets above. */ OP_CRMINRANGE, /* 105 */ + OP_CRPOSSTAR, /* 106 Possessified versions */ + OP_CRPOSPLUS, /* 107 */ + OP_CRPOSQUERY, /* 108 */ + OP_CRPOSRANGE, /* 109 */ + /* End of quantifier opcodes */ - OP_CLASS, /* 106 Match a character class, chars < 256 only */ - OP_NCLASS, /* 107 Same, but the bitmap was created from a negative + OP_CLASS, /* 110 Match a character class, chars < 256 only */ + OP_NCLASS, /* 111 Same, but the bitmap was created from a negative class - the difference is relevant only when a character > 255 is encountered. */ - OP_XCLASS, /* 108 Extended class for handling > 255 chars within the + OP_XCLASS, /* 112 Extended class for handling > 255 chars within the class. This does both positive and negative. */ - OP_REF, /* 109 Match a back reference, casefully */ - OP_REFI, /* 110 Match a back reference, caselessly */ - OP_DNREF, /* 111 Match a duplicate name backref, casefully */ - OP_DNREFI, /* 112 Match a duplicate name backref, caselessly */ - OP_RECURSE, /* 113 Match a numbered subpattern (possibly recursive) */ - OP_CALLOUT, /* 114 Call out to external function if provided */ - - OP_ALT, /* 115 Start of alternation */ - OP_KET, /* 116 End of group that doesn't have an unbounded repeat */ - OP_KETRMAX, /* 117 These two must remain together and in this */ - OP_KETRMIN, /* 118 order. They are for groups the repeat for ever. */ - OP_KETRPOS, /* 119 Possessive unlimited repeat. */ + OP_REF, /* 113 Match a back reference, casefully */ + OP_REFI, /* 114 Match a back reference, caselessly */ + OP_DNREF, /* 115 Match a duplicate name backref, casefully */ + OP_DNREFI, /* 116 Match a duplicate name backref, caselessly */ + OP_RECURSE, /* 117 Match a numbered subpattern (possibly recursive) */ + OP_CALLOUT, /* 118 Call out to external function if provided */ + + OP_ALT, /* 119 Start of alternation */ + OP_KET, /* 120 End of group that doesn't have an unbounded repeat */ + OP_KETRMAX, /* 121 These two must remain together and in this */ + OP_KETRMIN, /* 122 order. They are for groups the repeat for ever. */ + OP_KETRPOS, /* 123 Possessive unlimited repeat. */ /* The assertions must come before BRA, CBRA, ONCE, and COND, and the four asserts must remain in order. */ - OP_REVERSE, /* 129 Move pointer back - used in lookbehind assertions */ - OP_ASSERT, /* 121 Positive lookahead */ - OP_ASSERT_NOT, /* 122 Negative lookahead */ - OP_ASSERTBACK, /* 123 Positive lookbehind */ - OP_ASSERTBACK_NOT, /* 124 Negative lookbehind */ + OP_REVERSE, /* 124 Move pointer back - used in lookbehind assertions */ + OP_ASSERT, /* 125 Positive lookahead */ + OP_ASSERT_NOT, /* 126 Negative lookahead */ + OP_ASSERTBACK, /* 127 Positive lookbehind */ + OP_ASSERTBACK_NOT, /* 128 Negative lookbehind */ /* ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately after the assertions, with ONCE first, as there's a test for >= ONCE for a subpattern that isn't an assertion. The POS versions must immediately follow the non-POS versions in each case. */ - OP_ONCE, /* 125 Atomic group, contains captures */ - OP_ONCE_NC, /* 126 Atomic group containing no captures */ - OP_BRA, /* 127 Start of non-capturing bracket */ - OP_BRAPOS, /* 128 Ditto, with unlimited, possessive repeat */ - OP_CBRA, /* 129 Start of capturing bracket */ - OP_CBRAPOS, /* 130 Ditto, with unlimited, possessive repeat */ - OP_COND, /* 131 Conditional group */ + OP_ONCE, /* 129 Atomic group, contains captures */ + OP_ONCE_NC, /* 130 Atomic group containing no captures */ + OP_BRA, /* 131 Start of non-capturing bracket */ + OP_BRAPOS, /* 132 Ditto, with unlimited, possessive repeat */ + OP_CBRA, /* 133 Start of capturing bracket */ + OP_CBRAPOS, /* 134 Ditto, with unlimited, possessive repeat */ + OP_COND, /* 135 Conditional group */ /* These five must follow the previous five, in the same order. There's a check for >= SBRA to distinguish the two sets. */ - OP_SBRA, /* 132 Start of non-capturing bracket, check empty */ - OP_SBRAPOS, /* 133 Ditto, with unlimited, possessive repeat */ - OP_SCBRA, /* 134 Start of capturing bracket, check empty */ - OP_SCBRAPOS, /* 135 Ditto, with unlimited, possessive repeat */ - OP_SCOND, /* 136 Conditional group, check empty */ + OP_SBRA, /* 136 Start of non-capturing bracket, check empty */ + OP_SBRAPOS, /* 137 Ditto, with unlimited, possessive repeat */ + OP_SCBRA, /* 138 Start of capturing bracket, check empty */ + OP_SCBRAPOS, /* 139 Ditto, with unlimited, possessive repeat */ + OP_SCOND, /* 140 Conditional group, check empty */ /* The next two pairs must (respectively) be kept together. */ - OP_CREF, /* 137 Used to hold a capture number as condition */ - OP_DNCREF, /* 138 Used to point to duplicate names as a condition */ - OP_RREF, /* 139 Used to hold a recursion number as condition */ - OP_DNRREF, /* 140 Used to point to duplicate names as a condition */ - OP_DEF, /* 141 The DEFINE condition */ + OP_CREF, /* 141 Used to hold a capture number as condition */ + OP_DNCREF, /* 142 Used to point to duplicate names as a condition */ + OP_RREF, /* 143 Used to hold a recursion number as condition */ + OP_DNRREF, /* 144 Used to point to duplicate names as a condition */ + OP_DEF, /* 145 The DEFINE condition */ - OP_BRAZERO, /* 142 These two must remain together and in this */ - OP_BRAMINZERO, /* 143 order. */ - OP_BRAPOSZERO, /* 144 */ + OP_BRAZERO, /* 146 These two must remain together and in this */ + OP_BRAMINZERO, /* 147 order. */ + OP_BRAPOSZERO, /* 148 */ /* These are backtracking control verbs */ - OP_MARK, /* 145 always has an argument */ - OP_PRUNE, /* 146 */ - OP_PRUNE_ARG, /* 147 same, but with argument */ - OP_SKIP, /* 148 */ - OP_SKIP_ARG, /* 149 same, but with argument */ - OP_THEN, /* 150 */ - OP_THEN_ARG, /* 151 same, but with argument */ - OP_COMMIT, /* 152 */ + OP_MARK, /* 149 always has an argument */ + OP_PRUNE, /* 150 */ + OP_PRUNE_ARG, /* 151 same, but with argument */ + OP_SKIP, /* 152 */ + OP_SKIP_ARG, /* 153 same, but with argument */ + OP_THEN, /* 154 */ + OP_THEN_ARG, /* 155 same, but with argument */ + OP_COMMIT, /* 156 */ /* These are forced failure and success verbs */ - OP_FAIL, /* 153 */ - OP_ACCEPT, /* 154 */ - OP_ASSERT_ACCEPT, /* 155 Used inside assertions */ - OP_CLOSE, /* 156 Used before OP_ACCEPT to close open captures */ + OP_FAIL, /* 157 */ + OP_ACCEPT, /* 158 */ + OP_ASSERT_ACCEPT, /* 159 Used inside assertions */ + OP_CLOSE, /* 160 Used before OP_ACCEPT to close open captures */ /* This is used to skip a subpattern with a {0} quantifier */ - OP_SKIPZERO, /* 157 */ + OP_SKIPZERO, /* 161 */ /* This is not an opcode, but is used to check that tables indexed by opcode are the correct length, in order to catch updating errors - there have been @@ -2194,6 +2199,7 @@ some cases doesn't actually use these names at all). */ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", \ "*+","++", "?+", "{", \ "*", "*?", "+", "+?", "?", "??", "{", "{", \ + "*+","++", "?+", "{", \ "class", "nclass", "xclass", "Ref", "Refi", "DnRef", "DnRefi", \ "Recurse", "Callout", \ "Alt", "Ket", "KetRmax", "KetRmin", "KetRpos", \ @@ -2259,6 +2265,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */ /* Character class & ref repeats */ \ 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */ \ 1+2*IMM2_SIZE, 1+2*IMM2_SIZE, /* CRRANGE, CRMINRANGE */ \ + 1, 1, 1, 1+2*IMM2_SIZE, /* Possessive *+, ++, ?+, CRPOSRANGE */ \ 1+(32/sizeof(pcre_uchar)), /* CLASS */ \ 1+(32/sizeof(pcre_uchar)), /* NCLASS */ \ 0, /* XCLASS - variable length */ \ diff --git a/pcre_jit_compile.c b/pcre_jit_compile.c index a8a3426..c62ef0d 100644 --- a/pcre_jit_compile.c +++ b/pcre_jit_compile.c @@ -585,6 +585,10 @@ switch(*cc) case OP_CRMINQUERY: case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: + case OP_CRPOSRANGE: case OP_CLASS: case OP_NCLASS: case OP_REF: @@ -6919,7 +6923,7 @@ count_match(common); return cc + 1 + LINK_SIZE; } -static SLJIT_INLINE pcre_uchar *get_iterator_parameters(compiler_common *common, pcre_uchar *cc, pcre_uchar *opcode, pcre_uchar *type, int *arg1, int *arg2, pcre_uchar **end) +static SLJIT_INLINE pcre_uchar *get_iterator_parameters(compiler_common *common, pcre_uchar *cc, pcre_uchar *opcode, pcre_uchar *type, int *max, int *min, pcre_uchar **end) { int class_len; @@ -6955,7 +6959,7 @@ else if (*opcode >= OP_TYPESTAR && *opcode <= OP_TYPEPOSUPTO) } else { - SLJIT_ASSERT(*opcode >= OP_CLASS || *opcode <= OP_XCLASS); + SLJIT_ASSERT(*opcode == OP_CLASS || *opcode == OP_NCLASS || *opcode == OP_XCLASS); *type = *opcode; cc++; class_len = (*type < OP_XCLASS) ? (int)(1 + (32 / sizeof(pcre_uchar))) : GET(cc, 0); @@ -6966,18 +6970,24 @@ else if (end != NULL) *end = cc + class_len; } + else if (*opcode >= OP_CRPOSSTAR && *opcode <= OP_CRPOSQUERY) + { + *opcode -= OP_CRPOSSTAR - OP_POSSTAR; + if (end != NULL) + *end = cc + class_len; + } else { - SLJIT_ASSERT(*opcode == OP_CRRANGE || *opcode == OP_CRMINRANGE); - *arg1 = GET2(cc, (class_len + IMM2_SIZE)); - *arg2 = GET2(cc, class_len); + SLJIT_ASSERT(*opcode == OP_CRRANGE || *opcode == OP_CRMINRANGE || *opcode == OP_CRPOSRANGE); + *max = GET2(cc, (class_len + IMM2_SIZE)); + *min = GET2(cc, class_len); - if (*arg2 == 0) + if (*min == 0) { - SLJIT_ASSERT(*arg1 != 0); - *opcode = (*opcode == OP_CRRANGE) ? OP_UPTO : OP_MINUPTO; + SLJIT_ASSERT(*max != 0); + *opcode = (*opcode == OP_CRRANGE) ? OP_UPTO : (*opcode == OP_CRMINRANGE ? OP_MINUPTO : OP_POSUPTO); } - if (*arg1 == *arg2) + if (*max == *min) *opcode = OP_EXACT; if (end != NULL) @@ -6988,7 +6998,7 @@ else if (*opcode == OP_UPTO || *opcode == OP_MINUPTO || *opcode == OP_EXACT || *opcode == OP_POSUPTO) { - *arg1 = GET2(cc, 0); + *max = GET2(cc, 0); cc += IMM2_SIZE; } @@ -7017,7 +7027,7 @@ DEFINE_COMPILER; backtrack_common *backtrack; pcre_uchar opcode; pcre_uchar type; -int arg1 = -1, arg2 = -1; +int max = -1, min = -1; pcre_uchar* end; jump_list *nomatch = NULL; struct sljit_jump *jump = NULL; @@ -7030,7 +7040,7 @@ int tmp_base, tmp_offset; PUSH_BACKTRACK(sizeof(iterator_backtrack), cc, NULL); -cc = get_iterator_parameters(common, cc, &opcode, &type, &arg1, &arg2, &end); +cc = get_iterator_parameters(common, cc, &opcode, &type, &max, &min, &end); switch(type) { @@ -7101,10 +7111,10 @@ switch(opcode) { OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_LOCALS_REG), POSSESSIVE0); OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); - if (opcode == OP_CRRANGE && arg2 > 0) - CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, arg2, label); - if (opcode == OP_UPTO || (opcode == OP_CRRANGE && arg1 > 0)) - jump = CMP(SLJIT_C_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, arg1); + if (opcode == OP_CRRANGE && min > 0) + CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, min, label); + if (opcode == OP_UPTO || (opcode == OP_CRRANGE && max > 0)) + jump = CMP(SLJIT_C_GREATER_EQUAL, TMP1, 0, SLJIT_IMM, max); OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), POSSESSIVE0, TMP1, 0); } @@ -7131,7 +7141,7 @@ switch(opcode) OP1(SLJIT_MOV, base, offset0, STR_PTR, 0); if (opcode <= OP_PLUS) JUMPTO(SLJIT_JUMP, label); - else if (opcode == OP_CRRANGE && arg1 == 0) + else if (opcode == OP_CRRANGE && max == 0) { OP2(SLJIT_ADD, base, offset1, base, offset1, SLJIT_IMM, 1); JUMPTO(SLJIT_JUMP, label); @@ -7141,11 +7151,11 @@ switch(opcode) OP1(SLJIT_MOV, TMP1, 0, base, offset1); OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 1); OP1(SLJIT_MOV, base, offset1, TMP1, 0); - CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, arg1 + 1, label); + CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, max + 1, label); } set_jumps(nomatch, LABEL()); if (opcode == OP_CRRANGE) - add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_C_LESS, base, offset1, SLJIT_IMM, arg2 + 1)); + add_jump(compiler, &backtrack->topbacktracks, CMP(SLJIT_C_LESS, base, offset1, SLJIT_IMM, min + 1)); OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); } BACKTRACK_AS(iterator_backtrack)->matchingpath = LABEL(); @@ -7183,7 +7193,7 @@ switch(opcode) break; case OP_EXACT: - OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, arg1); + OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, max); label = LABEL(); compile_char1_matchingpath(common, type, cc, &backtrack->topbacktracks); OP2(SLJIT_SUB | SLJIT_SET_E, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1); @@ -7196,7 +7206,7 @@ switch(opcode) if (opcode == OP_POSPLUS) compile_char1_matchingpath(common, type, cc, &backtrack->topbacktracks); if (opcode == OP_POSUPTO) - OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), POSSESSIVE1, SLJIT_IMM, arg1); + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), POSSESSIVE1, SLJIT_IMM, max); OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0); label = LABEL(); compile_char1_matchingpath(common, type, cc, &nomatch); @@ -7220,6 +7230,34 @@ switch(opcode) OP1(SLJIT_MOV, STR_PTR, 0, tmp_base, tmp_offset); break; + case OP_CRPOSRANGE: + /* Combination of OP_EXACT and OP_POSSTAR or OP_POSUPTO */ + OP1(SLJIT_MOV, tmp_base, tmp_offset, SLJIT_IMM, min); + label = LABEL(); + compile_char1_matchingpath(common, type, cc, &backtrack->topbacktracks); + OP2(SLJIT_SUB | SLJIT_SET_E, tmp_base, tmp_offset, tmp_base, tmp_offset, SLJIT_IMM, 1); + JUMPTO(SLJIT_C_NOT_ZERO, label); + + if (max != 0) + { + SLJIT_ASSERT(max - min > 0); + OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_LOCALS_REG), POSSESSIVE1, SLJIT_IMM, max - min); + } + OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0); + label = LABEL(); + compile_char1_matchingpath(common, type, cc, &nomatch); + OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0); + if (max == 0) + JUMPTO(SLJIT_JUMP, label); + else + { + OP2(SLJIT_SUB | SLJIT_SET_E, SLJIT_MEM1(SLJIT_LOCALS_REG), POSSESSIVE1, SLJIT_MEM1(SLJIT_LOCALS_REG), POSSESSIVE1, SLJIT_IMM, 1); + JUMPTO(SLJIT_C_NOT_ZERO, label); + } + set_jumps(nomatch, LABEL()); + OP1(SLJIT_MOV, STR_PTR, 0, tmp_base, tmp_offset); + break; + default: SLJIT_ASSERT_STOP(); break; @@ -7497,7 +7535,7 @@ while (cc < ccend) case OP_CLASS: case OP_NCLASS: - if (cc[1 + (32 / sizeof(pcre_uchar))] >= OP_CRSTAR && cc[1 + (32 / sizeof(pcre_uchar))] <= OP_CRMINRANGE) + if (cc[1 + (32 / sizeof(pcre_uchar))] >= OP_CRSTAR && cc[1 + (32 / sizeof(pcre_uchar))] <= OP_CRPOSRANGE) cc = compile_iterator_matchingpath(common, cc, parent); else cc = compile_char1_matchingpath(common, *cc, cc + 1, parent->top != NULL ? &parent->top->nextbacktracks : &parent->topbacktracks); @@ -7505,7 +7543,7 @@ while (cc < ccend) #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32 case OP_XCLASS: - if (*(cc + GET(cc, 1)) >= OP_CRSTAR && *(cc + GET(cc, 1)) <= OP_CRMINRANGE) + if (*(cc + GET(cc, 1)) >= OP_CRSTAR && *(cc + GET(cc, 1)) <= OP_CRPOSRANGE) cc = compile_iterator_matchingpath(common, cc, parent); else cc = compile_char1_matchingpath(common, *cc, cc + 1, parent->top != NULL ? &parent->top->nextbacktracks : &parent->topbacktracks); @@ -7514,7 +7552,7 @@ while (cc < ccend) case OP_REF: case OP_REFI: - if (cc[1 + IMM2_SIZE] >= OP_CRSTAR && cc[1 + IMM2_SIZE] <= OP_CRMINRANGE) + if (cc[1 + IMM2_SIZE] >= OP_CRSTAR && cc[1 + IMM2_SIZE] <= OP_CRPOSRANGE) cc = compile_ref_iterator_matchingpath(common, cc, parent); else { @@ -7525,7 +7563,7 @@ while (cc < ccend) case OP_DNREF: case OP_DNREFI: - if (cc[1 + 2 * IMM2_SIZE] >= OP_CRSTAR && cc[1 + 2 * IMM2_SIZE] <= OP_CRMINRANGE) + if (cc[1 + 2 * IMM2_SIZE] >= OP_CRSTAR && cc[1 + 2 * IMM2_SIZE] <= OP_CRPOSRANGE) cc = compile_ref_iterator_matchingpath(common, cc, parent); else { @@ -7685,7 +7723,7 @@ DEFINE_COMPILER; pcre_uchar *cc = current->cc; pcre_uchar opcode; pcre_uchar type; -int arg1 = -1, arg2 = -1; +int max = -1, min = -1; struct sljit_label *label = NULL; struct sljit_jump *jump = NULL; jump_list *jumplist = NULL; @@ -7694,7 +7732,7 @@ int base = (private_data_ptr == 0) ? SLJIT_MEM1(STACK_TOP) : SLJIT_MEM1(SLJIT_LO int offset0 = (private_data_ptr == 0) ? STACK(0) : private_data_ptr; int offset1 = (private_data_ptr == 0) ? STACK(1) : private_data_ptr + (int)sizeof(sljit_sw); -cc = get_iterator_parameters(common, cc, &opcode, &type, &arg1, &arg2, NULL); +cc = get_iterator_parameters(common, cc, &opcode, &type, &max, &min, NULL); switch(opcode) { @@ -7713,7 +7751,7 @@ switch(opcode) else { if (opcode == OP_UPTO) - arg2 = 0; + min = 0; if (opcode <= OP_PLUS) { OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); @@ -7723,7 +7761,7 @@ switch(opcode) { OP1(SLJIT_MOV, TMP1, 0, base, offset1); OP1(SLJIT_MOV, STR_PTR, 0, base, offset0); - jump = CMP(SLJIT_C_LESS_EQUAL, TMP1, 0, SLJIT_IMM, arg2 + 1); + jump = CMP(SLJIT_C_LESS_EQUAL, TMP1, 0, SLJIT_IMM, min + 1); OP2(SLJIT_SUB, base, offset1, TMP1, 0, SLJIT_IMM, 1); } skip_char_back(common); @@ -7768,12 +7806,12 @@ switch(opcode) OP1(SLJIT_MOV, base, offset1, TMP1, 0); if (opcode == OP_CRMINRANGE) - CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, arg2 + 1, label); + CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, min + 1, label); - if (opcode == OP_CRMINRANGE && arg1 == 0) + if (opcode == OP_CRMINRANGE && max == 0) JUMPTO(SLJIT_JUMP, CURRENT_AS(iterator_backtrack)->matchingpath); else - CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, arg1 + 2, CURRENT_AS(iterator_backtrack)->matchingpath); + CMPTO(SLJIT_C_LESS, TMP1, 0, SLJIT_IMM, max + 2, CURRENT_AS(iterator_backtrack)->matchingpath); set_jumps(jumplist, LABEL()); if (private_data_ptr == 0) @@ -7808,6 +7846,7 @@ switch(opcode) case OP_EXACT: case OP_POSPLUS: + case OP_CRPOSRANGE: set_jumps(current->topbacktracks, LABEL()); break; diff --git a/pcre_printint.c b/pcre_printint.c index 5e6a1ae..65b8e7d 100644 --- a/pcre_printint.c +++ b/pcre_printint.c @@ -735,17 +735,22 @@ for(;;) case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSPLUS: + case OP_CRPOSQUERY: fprintf(f, "%s", priv_OP_names[*ccode]); extra += priv_OP_lengths[*ccode]; break; case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: min = GET2(ccode,1); max = GET2(ccode,1 + IMM2_SIZE); if (max == 0) fprintf(f, "{%u,}", min); else fprintf(f, "{%u,%u}", min, max); if (*ccode == OP_CRMINRANGE) fprintf(f, "?"); + else if (*ccode == OP_CRPOSRANGE) fprintf(f, "+"); extra += priv_OP_lengths[*ccode]; break; diff --git a/pcre_study.c b/pcre_study.c index e180caf..7e53bdb 100644 --- a/pcre_study.c +++ b/pcre_study.c @@ -342,6 +342,7 @@ for (;;) { case OP_CRPLUS: case OP_CRMINPLUS: + case OP_CRPOSPLUS: branchlength++; /* Fall through */ @@ -349,11 +350,14 @@ for (;;) case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSQUERY: cc++; break; case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: branchlength += GET2(cc,1); cc += 1 + 2 * IMM2_SIZE; break; @@ -436,18 +440,22 @@ for (;;) case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSQUERY: min = 0; cc++; break; case OP_CRPLUS: case OP_CRMINPLUS: + case OP_CRPOSPLUS: min = 1; cc++; break; case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: min = GET2(cc, 1); cc += 1 + 2 * IMM2_SIZE; break; @@ -1305,11 +1313,14 @@ do case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: + case OP_CRPOSSTAR: + case OP_CRPOSQUERY: tcode++; break; case OP_CRRANGE: case OP_CRMINRANGE: + case OP_CRPOSRANGE: if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE; else try_next = FALSE; break; diff --git a/testdata/saved16BE-1 b/testdata/saved16BE-1 Binary files differindex 1bd9fa5..e6edddc 100644 --- a/testdata/saved16BE-1 +++ b/testdata/saved16BE-1 diff --git a/testdata/saved16BE-2 b/testdata/saved16BE-2 Binary files differindex 063d6bc..c91ce37 100644 --- a/testdata/saved16BE-2 +++ b/testdata/saved16BE-2 diff --git a/testdata/saved16LE-1 b/testdata/saved16LE-1 Binary files differindex 65f9d1c..5035ec0 100644 --- a/testdata/saved16LE-1 +++ b/testdata/saved16LE-1 diff --git a/testdata/saved16LE-2 b/testdata/saved16LE-2 Binary files differindex b74d9a0..656c058 100644 --- a/testdata/saved16LE-2 +++ b/testdata/saved16LE-2 diff --git a/testdata/saved32BE-1 b/testdata/saved32BE-1 Binary files differindex 2573d72..b4c2ffe 100644 --- a/testdata/saved32BE-1 +++ b/testdata/saved32BE-1 diff --git a/testdata/saved32BE-2 b/testdata/saved32BE-2 Binary files differindex ae6c18f..79bb5e8 100644 --- a/testdata/saved32BE-2 +++ b/testdata/saved32BE-2 diff --git a/testdata/saved32LE-1 b/testdata/saved32LE-1 Binary files differindex 7f6bddb..49392b8 100644 --- a/testdata/saved32LE-1 +++ b/testdata/saved32LE-1 diff --git a/testdata/saved32LE-2 b/testdata/saved32LE-2 Binary files differindex d260260..5f64af9 100644 --- a/testdata/saved32LE-2 +++ b/testdata/saved32LE-2 diff --git a/testdata/testinput2 b/testdata/testinput2 index 3604a8e..bf3d926 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -3898,6 +3898,36 @@ backtracking verbs. --/ /a+(?:bb)?a#a+(?:|||)#a+(?:|b)a#a+(?:|||)?a/BZ +/[ab]*/BZ + aaaa + +/[ab]*?/BZ + aaaa + +/[ab]?/BZ + aaaa + +/[ab]??/BZ + aaaa + +/[ab]+/BZ + aaaa + +/[ab]+?/BZ + aaaa + +/[ab]{2,3}/BZ + aaaa + +/[ab]{2,3}?/BZ + aaaa + +/[ab]{2,}/BZ + aaaa + +/[ab]{2,}?/BZ + aaaa + /-- End of special auto-possessive tests --/ /^A\o{1239}B/ diff --git a/testdata/testinput8 b/testdata/testinput8 index fe836d4..1a74eb7 100644 --- a/testdata/testinput8 +++ b/testdata/testinput8 @@ -4803,4 +4803,36 @@ /abcd/ abcd\O0 +/-- These tests show up auto-possessification --/ + +/[ab]*/ + aaaa + +/[ab]*?/ + aaaa + +/[ab]?/ + aaaa + +/[ab]??/ + aaaa + +/[ab]+/ + aaaa + +/[ab]+?/ + aaaa + +/[ab]{2,3}/ + aaaa + +/[ab]{2,3}?/ + aaaa + +/[ab]{2,}/ + aaaa + +/[ab]{2,}?/ + aaaa + /-- End of testinput8 --/ diff --git a/testdata/testoutput14 b/testdata/testoutput14 index 1a94420..8ef235e 100644 --- a/testdata/testoutput14 +++ b/testdata/testoutput14 @@ -432,7 +432,7 @@ Starting byte set: \x0a \x0b \x0c \x0d \x85 /[\h]+/BZ ------------------------------------------------------------------ Bra - [\x09 \xa0]+ + [\x09 \xa0]++ Ket End ------------------------------------------------------------------ diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 7a070dc..690226e 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -4591,7 +4591,7 @@ No need char ------------------------------------------------------------------ Bra Once - [ab]{1,1} + [ab]{1,1}+ Ket Ket End @@ -5304,7 +5304,7 @@ No match Callout 255 0 21 CBra 1 Callout 255 1 9 - [ab]{1,4} + [ab]{1,4}+ Callout 255 10 1 c Callout 255 11 0 @@ -5317,7 +5317,7 @@ No match Ket CBra 1 Callout 255 1 9 - [ab]{1,4} + [ab]{1,4}+ Callout 255 10 1 c Callout 255 11 0 @@ -5330,7 +5330,7 @@ No match Ket CBra 1 Callout 255 1 9 - [ab]{1,4} + [ab]{1,4}+ Callout 255 10 1 c Callout 255 11 0 @@ -5343,7 +5343,7 @@ No match Ket CBra 1 Callout 255 1 9 - [ab]{1,4} + [ab]{1,4}+ Callout 255 10 1 c Callout 255 11 0 @@ -5357,7 +5357,7 @@ No match Braminzero CBra 1 Callout 255 1 9 - [ab]{1,4} + [ab]{1,4}+ Callout 255 10 1 c Callout 255 11 0 @@ -7644,7 +7644,7 @@ No match ------------------------------------------------------------------ Bra ^ - [a-z]+ + [a-z]++ Ket End ------------------------------------------------------------------ @@ -12757,7 +12757,7 @@ No set of starting bytes /[bcd]*a/BZ ------------------------------------------------------------------ Bra - [b-d]* + [b-d]*+ a Ket End @@ -13647,6 +13647,106 @@ No set of starting bytes End ------------------------------------------------------------------ +/[ab]*/BZ +------------------------------------------------------------------ + Bra + [ab]*+ + Ket + End +------------------------------------------------------------------ + aaaa + 0: aaaa + +/[ab]*?/BZ +------------------------------------------------------------------ + Bra + [ab]*? + Ket + End +------------------------------------------------------------------ + aaaa + 0: + +/[ab]?/BZ +------------------------------------------------------------------ + Bra + [ab]?+ + Ket + End +------------------------------------------------------------------ + aaaa + 0: a + +/[ab]??/BZ +------------------------------------------------------------------ + Bra + [ab]?? + Ket + End +------------------------------------------------------------------ + aaaa + 0: + +/[ab]+/BZ +------------------------------------------------------------------ + Bra + [ab]++ + Ket + End +------------------------------------------------------------------ + aaaa + 0: aaaa + +/[ab]+?/BZ +------------------------------------------------------------------ + Bra + [ab]+? + Ket + End +------------------------------------------------------------------ + aaaa + 0: a + +/[ab]{2,3}/BZ +------------------------------------------------------------------ + Bra + [ab]{2,3}+ + Ket + End +------------------------------------------------------------------ + aaaa + 0: aaa + +/[ab]{2,3}?/BZ +------------------------------------------------------------------ + Bra + [ab]{2,3}? + Ket + End +------------------------------------------------------------------ + aaaa + 0: aa + +/[ab]{2,}/BZ +------------------------------------------------------------------ + Bra + [ab]{2,}+ + Ket + End +------------------------------------------------------------------ + aaaa + 0: aaaa + +/[ab]{2,}?/BZ +------------------------------------------------------------------ + Bra + [ab]{2,}? + Ket + End +------------------------------------------------------------------ + aaaa + 0: aa + /-- End of special auto-possessive tests --/ /^A\o{1239}B/ diff --git a/testdata/testoutput8 b/testdata/testoutput8 index 5ca6b45..9cb06fb 100644 --- a/testdata/testoutput8 +++ b/testdata/testoutput8 @@ -1549,18 +1549,6 @@ No match /^[.^$|()*+?{,}]+/ .^\$(*+)|{?,?} 0: .^$(*+)|{?,?} - 1: .^$(*+)|{?,? - 2: .^$(*+)|{?, - 3: .^$(*+)|{? - 4: .^$(*+)|{ - 5: .^$(*+)| - 6: .^$(*+) - 7: .^$(*+ - 8: .^$(* - 9: .^$( -10: .^$ -11: .^ -12: . /^a*\w/ z @@ -2790,8 +2778,6 @@ No match /[-az]+/ az- 0: az- - 1: az - 2: a *** Failers 0: a b @@ -2800,8 +2786,6 @@ No match /[az-]+/ za- 0: za- - 1: za - 2: z *** Failers 0: a b @@ -2810,8 +2794,6 @@ No match /[a\-z]+/ a-z 0: a-z - 1: a- - 2: a *** Failers 0: a b @@ -2820,20 +2802,10 @@ No match /[a-z]+/ abcdxyz 0: abcdxyz - 1: abcdxy - 2: abcdx - 3: abcd - 4: abc - 5: ab - 6: a /[\d-]+/ 12-34 0: 12-34 - 1: 12-3 - 2: 12- - 3: 12 - 4: 1 *** Failers No match aaa @@ -2842,11 +2814,6 @@ No match /[\d-z]+/ 12-34z 0: 12-34z - 1: 12-34 - 2: 12-3 - 3: 12- - 4: 12 - 5: 1 *** Failers No match aaa @@ -4782,9 +4749,6 @@ No match /[^ab]*/ cde 0: cde - 1: cd - 2: c - 3: /abc/ *** Failers @@ -4884,10 +4848,6 @@ No match /[a-zA-Z_][a-zA-Z0-9_]*/ alpha 0: alpha - 1: alph - 2: alp - 3: al - 4: a /^a(bc+|b[eh])g|.h$/ abh @@ -5244,9 +5204,6 @@ No match /[^ab]*/i CDE 0: CDE - 1: CD - 2: C - 3: /abc/i @@ -5337,10 +5294,6 @@ No match /[a-zA-Z_][a-zA-Z0-9_]*/i ALPHA 0: ALPHA - 1: ALPH - 2: ALP - 3: AL - 4: A /^a(bc+|b[eh])g|.h$/i ABH @@ -5897,17 +5850,14 @@ No match /([[:]+)/ a:[b]: 0: :[ - 1: : /([[=]+)/ a=[b]= 0: =[ - 1: = /([[.]+)/ a.[b]. 0: .[ - 1: . /((?>a+)b)/ aaab @@ -6074,25 +6024,14 @@ No match /[[:space:]]+/ > \x09\x0a\x0c\x0d\x0b< 0: \x09\x0a\x0c\x0d\x0b - 1: \x09\x0a\x0c\x0d - 2: \x09\x0a\x0c - 3: \x09\x0a - 4: \x09 - 5: /[[:blank:]]+/ > \x09\x0a\x0c\x0d\x0b< 0: \x09 - 1: /[\s]+/ > \x09\x0a\x0c\x0d\x0b< 0: \x09\x0a\x0c\x0d\x0b - 1: \x09\x0a\x0c\x0d - 2: \x09\x0a\x0c - 3: \x09\x0a - 4: \x09 - 5: /\s+/ > \x09\x0a\x0c\x0d\x0b< @@ -6403,8 +6342,6 @@ Partial match: 123 /Content-Type\x3A[^\r\n]{6,}/ Content-Type:xxxxxyyy 0: Content-Type:xxxxxyyy - 1: Content-Type:xxxxxyy - 2: Content-Type:xxxxxy /Content-Type\x3A[^\r\n]{6,}z/ Content-Type:xxxxxyyyz @@ -7354,8 +7291,6 @@ Partial match: abc1 /abc[de]*/ xxxxabcde\P 0: abcde - 1: abcd - 2: abc xxxxabcde\P\P Partial match: abcde @@ -7798,4 +7733,57 @@ Error -30 (invalid data in workspace for DFA restart) abcd\O0 Matched, but offsets vector is too small to show all matches +/-- These tests show up auto-possessification --/ + +/[ab]*/ + aaaa + 0: aaaa + +/[ab]*?/ + aaaa + 0: aaaa + 1: aaa + 2: aa + 3: a + 4: + +/[ab]?/ + aaaa + 0: a + +/[ab]??/ + aaaa + 0: a + 1: + +/[ab]+/ + aaaa + 0: aaaa + +/[ab]+?/ + aaaa + 0: aaaa + 1: aaa + 2: aa + 3: a + +/[ab]{2,3}/ + aaaa + 0: aaa + +/[ab]{2,3}?/ + aaaa + 0: aaa + 1: aa + +/[ab]{2,}/ + aaaa + 0: aaaa + +/[ab]{2,}?/ + aaaa + 0: aaaa + 1: aaa + 2: aa + /-- End of testinput8 --/ diff --git a/testdata/testoutput9 b/testdata/testoutput9 index 1b38337..797d9ac 100644 --- a/testdata/testoutput9 +++ b/testdata/testoutput9 @@ -1227,8 +1227,6 @@ Partial match: abc1 /abc[de]*/8 xxxxabcde\P 0: abcde - 1: abcd - 2: abc xxxxabcde\P\P Partial match: abcde |