diff options
-rw-r--r-- | pod/perl595delta.pod | 4 | ||||
-rw-r--r-- | pod/perlre.pod | 260 | ||||
-rw-r--r-- | regcomp.c | 49 | ||||
-rw-r--r-- | regcomp.h | 1 | ||||
-rw-r--r-- | regcomp.pl | 9 | ||||
-rw-r--r-- | regcomp.sym | 30 | ||||
-rw-r--r-- | regexec.c | 221 | ||||
-rw-r--r-- | regexp.h | 11 | ||||
-rw-r--r-- | regnodes.h | 55 | ||||
-rwxr-xr-x | t/op/pat.t | 86 | ||||
-rw-r--r-- | win32/Makefile | 18 |
11 files changed, 481 insertions, 263 deletions
diff --git a/pod/perl595delta.pod b/pod/perl595delta.pod index d072de028e..af76cf68ee 100644 --- a/pod/perl595delta.pod +++ b/pod/perl595delta.pod @@ -112,8 +112,8 @@ quantifiers. (Yves Orton) =item Backtracking control verbs The regex engine now supports a number of special purpose backtrack -control verbs: (*COMMIT), (*MARK), (*CUT), (*ERROR), (*FAIL) and -(*ACCEPT). See L<perlre> for their descriptions. (Yves Orton) +control verbs: (*THEN), (*PRUNE), (*MARK), (*SKIP), (*COMMIT), (*FAIL) +and (*ACCEPT). See L<perlre> for their descriptions. =back diff --git a/pod/perlre.pod b/pod/perlre.pod index fcf3d510e5..0323a97405 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -5,7 +5,7 @@ perlre - Perl regular expressions =head1 DESCRIPTION -This page describes the syntax of regular expressions in Perl. +This page describes the syntax of regular expressions in Perl. If you haven't used regular expressions before, a quick-start introduction is available in L<perlrequick>, and a longer tutorial @@ -19,7 +19,7 @@ Operators">. Matching operations can have various modifiers. Modifiers that relate to the interpretation of the regular expression inside are listed below. Modifiers that alter the way a regular expression -is used by Perl are detailed in L<perlop/"Regexp Quote-Like Operators"> and +is used by Perl are detailed in L<perlop/"Regexp Quote-Like Operators"> and L<perlop/"Gory details of parsing quoted constructs">. =over 4 @@ -245,10 +245,10 @@ X<word> X<whitespace> NOTE: breaks up characters into their UTF-8 bytes, so you may end up with malformed pieces of UTF-8. Unsupported in lookbehind. - \1 Backreference to a a specific group. - '1' may actually be any positive integer + \1 Backreference to a specific group. + '1' may actually be any positive integer. \k<name> Named backreference - \N{name} Named unicode character, or unicode escape. + \N{name} Named unicode character, or unicode escape \x12 Hexadecimal escape sequence \x{1234} Long hexadecimal escape sequence @@ -607,12 +607,12 @@ sensitive and some do not. The case insensitive ones need to include merely C<(?i)> at the front of the pattern. For example: $pattern = "foobar"; - if ( /$pattern/i ) { } + if ( /$pattern/i ) { } # more flexible: $pattern = "(?i)foobar"; - if ( /$pattern/ ) { } + if ( /$pattern/ ) { } These modifiers are restored at the end of the enclosing group. For example, @@ -640,7 +640,7 @@ but doesn't spit out extra fields. It's also cheaper not to capture characters if you don't need to. Any letters between C<?> and C<:> act as flags modifiers as with -C<(?imsx-imsx)>. For example, +C<(?imsx-imsx)>. For example, /(?s-i:more.*than).*million/i @@ -759,14 +759,14 @@ is backtracked (compare L<"Backtracking">), all changes introduced after C<local>ization are undone, so that $_ = 'a' x 8; - m< + m< (?{ $cnt = 0 }) # Initialize $cnt. ( - a + a (?{ local $cnt = $cnt + 1; # Update $cnt, backtracking-safe. }) - )* + )* aaaa (?{ $res = $cnt }) # On success copy to non-localized # location. @@ -797,7 +797,7 @@ For reasons of security, this construct is forbidden if the regular expression involves run-time interpolation of variables, unless the perilous C<use re 'eval'> pragma has been used (see L<re>), or the variables contain results of C<qr//> operator (see -L<perlop/"qr/STRING/imosx">). +L<perlop/"qr/STRING/imosx">). This restriction is because of the wide-spread and remarkably convenient custom of using run-time determined strings as patterns. For example: @@ -814,7 +814,7 @@ so you should only do so if you are also using taint checking. Better yet, use the carefully constrained evaluation within a Safe compartment. See L<perlsec> for details about both these mechanisms. -Because perl's regex engine is not currently re-entrant, interpolated +Because perl's regex engine is not currently re-entrant, interpolated code may not invoke the regex engine either directly with C<m//> or C<s///>), or indirectly with functions such as C<split>. @@ -858,12 +858,12 @@ The following pattern matches a parenthesized group: See also C<(?PARNO)> for a different, more efficient way to accomplish the same task. -Because perl's regex engine is not currently re-entrant, delayed +Because perl's regex engine is not currently re-entrant, delayed code may not invoke the regex engine either directly with C<m//> or C<s///>), or indirectly with functions such as C<split>. -Recursing deeper than 50 times without consuming any input string will -result in a fatal error. The maximum depth is compiled into perl, so +Recursing deeper than 50 times without consuming any input string will +result in a fatal error. The maximum depth is compiled into perl, so changing it requires a custom build. =item C<(?PARNO)> C<(?R)> C<(?0)> @@ -1147,22 +1147,27 @@ forbidden. Any pattern containing a special backtracking verb that allows an argument has the special behaviour that when executed it sets the current packages' -C<$REGERROR> variable. In this case, the following rules apply: +C<$REGERROR> and C<$REGMARK> variables. When doing so the following +rules apply: -On failure, this variable will be set to the ARG value of the verb -pattern, if the verb was involved in the failure of the match. If the ARG -part of the pattern was omitted, then C<$REGERROR> will be set to TRUE. +On failure, the C<$REGERROR> variable will be set to the ARG value of the +verb pattern, if the verb was involved in the failure of the match. If the +ARG part of the pattern was omitted, then C<$REGERROR> will be set to the +name of the last C<(*MARK:NAME)> pattern executed, or to TRUE if there was +none. Also, the C<$REGMARK> variable will be set to FALSE. -On a successful match this variable will be set to FALSE. +On a successful match, the C<$REGERROR> variable will be set to FALSE, and +the C<$REGMARK> variable will be set to the name of the last +C<(*MARK:NAME)> pattern executed. See the explanation for the +C<(*MARK:NAME)> verb below for more details. -B<NOTE:> C<$REGERROR> is not a magic variable in the same sense than -C<$1> and most other regex related variables. It is not local to a -scope, nor readonly but instead a volatile package variable similar to -C<$AUTOLOAD>. Use C<local> to localize changes to it to a specific scope -if necessary. +B<NOTE:> C<$REGERROR> and C<$REGMARK> are not magic variables like C<$1> +and most other regex related variables. They are not local to a scope, nor +readonly, but instead are volatile package variables similar to C<$AUTOLOAD>. +Use C<local> to localize changes to them to a specific scope if necessary. If a pattern does not contain a special backtracking verb that allows an -argument, then C<$REGERROR> is not touched at all. +argument, then C<$REGERROR> and C<$REGMARK> are not touched at all. =over 4 @@ -1170,16 +1175,16 @@ argument, then C<$REGERROR> is not touched at all. =over 4 -=item C<(*NOMATCH)> C<(*NOMATCH:NAME)> -X<(*NOMATCH)> X<(*NOMATCH:NAME)> +=item C<(*PRUNE)> C<(*PRUNE:NAME)> +X<(*PRUNE)> X<(*PRUNE:NAME)> -This zero-width pattern commits the match at the current point, preventing -the engine from backtracking on failure to the left of the this point. -Consider the pattern C<A (*NOMATCH) B>, where A and B are complex patterns. -Until the C<(*NOMATCH)> is reached, A may backtrack as necessary to match. -Once it is reached, matching continues in B, which may also backtrack as -necessary; however, should B not match, then no further backtracking will -take place, and the pattern will fail outright at that starting position. +This zero-width pattern prunes the backtracking tree at the current point +when backtracked into on failure. Consider the pattern C<A (*PRUNE) B>, +where A and B are complex patterns. Until the C<(*PRUNE)> verb is reached, +A may backtrack as necessary to match. Once it is reached, matching +continues in B, which may also backtrack as necessary; however, should B +not match, then no further backtracking will take place, and the pattern +will fail outright at the current starting position. The following example counts all the possible matching strings in a pattern (without actually matching any of them). @@ -1200,9 +1205,9 @@ which produces: a Count=9 -If we add a C<(*NOMATCH)> before the count like the following +If we add a C<(*PRUNE)> before the count like the following - 'aaab' =~ /a+b?(*NOMATCH)(?{print "$&\n"; $count++})(*FAIL)/; + 'aaab' =~ /a+b?(*PRUNE)(?{print "$&\n"; $count++})(*FAIL)/; print "Count=$count\n"; we prevent backtracking and find the count of the longest matching @@ -1213,47 +1218,36 @@ at each matching startpoint like so: ab Count=3 -Any number of C<(*NOMATCH)> assertions may be used in a pattern. +Any number of C<(*PRUNE)> assertions may be used in a pattern. -See also C<< (?>pattern) >> and possessive quantifiers for other -ways to control backtracking. +See also C<< (?>pattern) >> and possessive quantifiers for other ways to +control backtracking. In some cases, the use of C<(*PRUNE)> can be +replaced with a C<< (?>pattern) >> with no functional difference; however, +C<(*PRUNE)> can be used to handle cases that cannot be expressed using a +C<< (?>pattern) >> alone. -=item C<(*MARK)> C<(*MARK:NAME)> -X<(*MARK)> -This zero-width pattern can be used to mark the point in a string -reached when a certain part of the pattern has been successfully -matched. This mark may be given a name. A later C<(*CUT)> pattern -will then cut at that point if backtracked into on failure. Any -number of (*MARK) patterns are allowed, and the NAME portion is -optional and may be duplicated. +=item C<(*SKIP)> C<(*SKIP:NAME)> +X<(*SKIP)> -See C<*CUT> for more detail. - -=item C<(*CUT)> C<(*CUT:NAME)> -X<(*CUT)> - -This zero-width pattern is similar to C<(*NOMATCH)>, except that on +This zero-width pattern is similar to C<(*PRUNE)>, except that on failure it also signifies that whatever text that was matched leading up -to the C<(*CUT)> pattern being executed cannot be part of a match, I<even -if started from a later point>. This effectively means that the regex -engine moves forward to this position on failure and tries to match -again, (assuming that there is sufficient room to match). - -The name of the C<(*CUT:NAME)> pattern has special significance. If a -C<(*MARK:NAME)> was encountered while matching, then it is the position -where that pattern was executed that is used for the "cut point" in the -string. If no mark of that name was encountered, then the cut is done at -the point where the C<(*CUT)> was. Similarly if no NAME is specified in -the C<(*CUT)>, and if a C<(*MARK)> with any name (or none) is encountered, -then that C<(*MARK)>'s cursor point will be used. If the C<(*CUT)> is not -preceded by a C<(*MARK)>, then the cut point is where the string was when -the C<(*CUT)> was encountered. - -Compare the following to the examples in C<(*NOMATCH)>, note the string +to the C<(*SKIP)> pattern being executed cannot be part of I<any> match +of this pattern. This effectively means that the regex engine "skips" forward +to this position on failure and tries to match again, (assuming that +there is sufficient room to match). + +The name of the C<(*SKIP:NAME)> pattern has special significance. If a +C<(*MARK:NAME)> was encountered while matching, then it is that position +which is used as the "skip point". If no C<(*MARK)> of that name was +encountered, then the C<(*SKIP)> operator has no effect. When used +without a name the "skip point" is where the match point was when +executing the (*SKIP) pattern. + +Compare the following to the examples in C<(*PRUNE)>, note the string is twice as long: - 'aaabaaab' =~ /a+b?(*CUT)(?{print "$&\n"; $count++})(*FAIL)/; + 'aaabaaab' =~ /a+b?(*SKIP)(?{print "$&\n"; $count++})(*FAIL)/; print "Count=$count\n"; outputs @@ -1262,15 +1256,85 @@ outputs aaab Count=2 -Once the 'aaab' at the start of the string has matched, and the C<(*CUT)> +Once the 'aaab' at the start of the string has matched, and the C<(*SKIP)> executed, the next startpoint will be where the cursor was when the -C<(*CUT)> was executed. +C<(*SKIP)> was executed. + +As a shortcut C<(*MARK:NAME)> can be written C<(*:NAME)>. + +=item C<(*MARK:NAME)> C<(*:NAME)> +X<(*MARK)> C<(*MARK:NAME)> C<(*:NAME)> + +This zero-width pattern can be used to mark the point reached in a string +when a certain part of the pattern has been successfully matched. This +mark may be given a name. A later C<(*SKIP)> pattern will then skip +forward to that point if backtracked into on failure. Any number of +C<(*MARK)> patterns are allowed, and the NAME portion is optional and may +be duplicated. + +In addition to interacting with the C<(*SKIP)> pattern, C<(*MARK:NAME)> +can be used to "label" a pattern branch, so that after matching, the +program can determine which branches of the pattern were involved in the +match. + +When a match is successful, the C<$REGMARK> variable will be set to the +name of the most recently executed C<(*MARK:NAME)> that was involved +in the match. + +This can be used to determine which branch of a pattern was matched +without using a seperate capture buffer for each branch, which in turn +can result in a performance improvement, as perl cannot optimize +C</(?:(x)|(y)|(z))/> as efficiently as something like +C</(?:x(*MARK:x)|y(*MARK:y)|z(*MARK:z))/>. + +When a match has failed, and unless another verb has been involved in +failing the match and has provided its own name to use, the C<$REGERROR> +variable will be set to the name of the most recently executed +C<(*MARK:NAME)>. + +See C<(*SKIP)> for more details. + +=item C<(*THEN)> C<(*THEN:NAME)> + +This is similar to the "cut group" operator C<::> from Perl6. Like +C<(*PRUNE)>, this verb always matches, and when backtracked into on +failure, it causes the regex engine to try the next alternation in the +innermost enclosing group (capturing or otherwise). + +Its name comes from the observation that this operation combined with the +alternation operator (C<|>) can be used to create what is essentially a +pattern-based if/then/else block: + + ( COND (*THEN) FOO | COND2 (*THEN) BAR | COND3 (*THEN) BAZ ) + +Note that if this operator is used and NOT inside of an alternation then +it acts exactly like the C<(*PRUNE)> operator. + + / A (*PRUNE) B / + +is the same as + + / A (*THEN) B / + +but + + / ( A (*THEN) B | C (*THEN) D ) / + +is not the same as + + / ( A (*PRUNE) B | C (*PRUNE) D ) / + +as after matching the A but failing on the B the C<(*THEN)> verb will +backtrack and try C; but the C<(*PRUNE)> verb will simply fail. =item C<(*COMMIT)> X<(*COMMIT)> -This zero-width pattern is similar to C<(*CUT)> except that it causes -the match to fail outright. No attempts to match will occur again. +This is the Perl6 "commit pattern" C<< <commit> >> or C<:::>. It's a +zero-width pattern similar to C<(*SKIP)>, except that when backtracked +into on failure it causes the match to fail outright. No further attempts +to find a valid match by advancing the start pointer will occur again. +For example, 'aaabaaab' =~ /a+b?(*COMMIT)(?{print "$&\n"; $count++})(*FAIL)/; print "Count=$count\n"; @@ -1527,7 +1591,7 @@ A powerful tool for optimizing such beasts is what is known as an "independent group", which does not backtrack (see L<C<< (?>pattern) >>>). Note also that zero-length look-ahead/look-behind assertions will not backtrack to make -the tail match, since they are in "logical" context: only +the tail match, since they are in "logical" context: only whether they match is considered relevant. For an example where side-effects of look-ahead I<might> have influenced the following match, see L<C<< (?>pattern) >>>. @@ -1547,7 +1611,7 @@ series of characters in the target string, so the pattern C<blurfl> would match "blurfl" in the target string. You can specify a character class, by enclosing a list of characters -in C<[]>, which will match any one character from the list. If the +in C<[]>, which will match any character from the list. If the first character after the "[" is "^", the class matches any character not in the list. Within a list, the "-" character specifies a range, so that C<a-z> represents all characters between "a" and "z", @@ -1557,10 +1621,10 @@ escape it with a backslash. "-" is also taken literally when it is at the end of the list, just before the closing "]". (The following all specify the same class of three characters: C<[-az]>, C<[az-]>, and C<[a\-z]>. All are different from C<[a-z]>, which -specifies a class containing twenty-six characters, even on EBCDIC -based coded character sets.) Also, if you try to use the character -classes C<\w>, C<\W>, C<\s>, C<\S>, C<\d>, or C<\D> as endpoints of -a range, that's not a range, the "-" is understood literally. +specifies a class containing twenty-six characters, even on EBCDIC-based +character sets.) Also, if you try to use the character +classes C<\w>, C<\W>, C<\s>, C<\S>, C<\d>, or C<\D> as endpoints of +a range, the "-" is understood literally. Note also that the whole range idea is rather unportable between character sets--and even within character sets they may cause results @@ -1572,10 +1636,10 @@ spell out the character sets in full. Characters may be specified using a metacharacter syntax much like that used in C: "\n" matches a newline, "\t" a tab, "\r" a carriage return, "\f" a form feed, etc. More generally, \I<nnn>, where I<nnn> is a string -of octal digits, matches the character whose coded character set value -is I<nnn>. Similarly, \xI<nn>, where I<nn> are hexadecimal digits, -matches the character whose numeric value is I<nn>. The expression \cI<x> -matches the character control-I<x>. Finally, the "." metacharacter +of octal digits, matches the character whose coded character set value +is I<nnn>. Similarly, \xI<nn>, where I<nn> are hexadecimal digits, +matches the character whose numeric value is I<nn>. The expression \cI<x> +matches the character control-I<x>. Finally, the "." metacharacter matches any character except "\n" (unless you use C</s>). You can specify a series of alternatives for a pattern using "|" to @@ -1679,17 +1743,17 @@ zero-length substring. Thus m{ (?: NON_ZERO_LENGTH | ZERO_LENGTH )* }x; -is made equivalent to +is made equivalent to - m{ (?: NON_ZERO_LENGTH )* - | - (?: ZERO_LENGTH )? + m{ (?: NON_ZERO_LENGTH )* + | + (?: ZERO_LENGTH )? }x; The higher level-loops preserve an additional state between iterations: -whether the last match was zero-length. To break the loop, the following +whether the last match was zero-length. To break the loop, the following match after a zero-length match is prohibited to have a length of zero. -This prohibition interacts with backtracking (see L<"Backtracking">), +This prohibition interacts with backtracking (see L<"Backtracking">), and so the I<second best> match is chosen if the I<best> match is of zero length. @@ -1699,11 +1763,11 @@ For example: s/\w??/<$&>/g; results in C<< <><b><><a><><r><> >>. At each position of the string the best -match given by non-greedy C<??> is the zero-length match, and the I<second +match given by non-greedy C<??> is the zero-length match, and the I<second best> match is what is matched by C<\w>. Thus zero-length matches alternate with one-character-long matches. -Similarly, for repeated C<m/()/g> the second-best match is the match at the +Similarly, for repeated C<m/()/g> the second-best match is the match at the position one notch further in the string. The additional state of being I<matched with zero-length> is associated with @@ -1744,7 +1808,7 @@ below C<S> and C<T> are regular subexpressions. Consider two possible matches, C<AB> and C<A'B'>, C<A> and C<A'> are substrings which can be matched by C<S>, C<B> and C<B'> are substrings -which can be matched by C<T>. +which can be matched by C<T>. If C<A> is better match for C<S> than C<A'>, C<AB> is a better match than C<A'B'>. @@ -1837,14 +1901,14 @@ this: # We must also take care of not escaping the legitimate \\Y| # sequence, hence the presence of '\\' in the conversion rules. - my %rules = ( '\\' => '\\\\', + my %rules = ( '\\' => '\\\\', 'Y|' => qr/(?=\S)(?<!\S)|(?!\S)(?<=\S)/ ); sub convert { my $re = shift; - $re =~ s{ + $re =~ s{ \\ ( \\ | Y . ) } - { $rules{$1} or invalid($re,$1) }sgex; + { $rules{$1} or invalid($re,$1) }sgex; return $re; } @@ -2649,8 +2649,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( ((made == MADE_EXACT_TRIE && startbranch == first) || ( first_non_open == first )) && - depth==0 ) + depth==0 ) { flags |= SCF_TRIE_RESTUDY; + if ( startbranch == first + && scan == tail ) + { + RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + } + } #endif } } @@ -4062,8 +4068,14 @@ reStudy: #ifdef TRIE_STUDY_OPT if ( restudied ) { + U32 seen=RExC_seen; DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); - RExC_state=copyRExC_state; + + RExC_state = copyRExC_state; + if (seen & REG_TOP_LEVEL_BRANCHES) + RExC_seen |= REG_TOP_LEVEL_BRANCHES; + else + RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; if (data.last_found) { SvREFCNT_dec(data.longest_fixed); SvREFCNT_dec(data.longest_float); @@ -4072,7 +4084,7 @@ reStudy: StructCopy(&zero_scan_data, &data, scan_data_t); } else { StructCopy(&zero_scan_data, &data, scan_data_t); - copyRExC_state=RExC_state; + copyRExC_state = RExC_state; } #else StructCopy(&zero_scan_data, &data, scan_data_t); @@ -4400,7 +4412,7 @@ reStudy: struct regnode_charclass_class ch_class; I32 last_close = 0; - DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n")); + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); scan = r->program + 1; cl_init(pRExC_state, &ch_class); @@ -4455,6 +4467,8 @@ reStudy: r->reganch |= ROPT_CANY_SEEN; if (RExC_seen & REG_SEEN_VERBARG) r->reganch |= ROPT_VERBARG_SEEN; + if (RExC_seen & REG_SEEN_CUTGROUP) + r->reganch |= ROPT_CUTGROUP_SEEN; if (RExC_paren_names) r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); else @@ -4713,6 +4727,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if ( *RExC_parse != ')' ) vFAIL("Unterminated verb pattern"); } + switch ( *start_verb ) { case 'A': /* (*ACCEPT) */ if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) { @@ -4723,8 +4738,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case 'C': /* (*COMMIT) */ if ( CHECK_WORD("COMMIT",start_verb,verb_len) ) op = COMMIT; - else if ( CHECK_WORD("CUT",start_verb,verb_len) ) - op = CUT; break; case 'F': /* (*FAIL) */ if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) { @@ -4732,13 +4745,27 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) argok = 0; } break; - case 'M': - if ( CHECK_WORD("MARK",start_verb,verb_len) ) + case ':': /* (*:NAME) */ + case 'M': /* (*MARK:NAME) */ + if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) { op = MARKPOINT; + argok = -1; + } + break; + case 'P': /* (*PRUNE) */ + if ( CHECK_WORD("PRUNE",start_verb,verb_len) ) + op = PRUNE; break; - case 'N': /* (*NOMATCH) */ - if ( CHECK_WORD("NOMATCH",start_verb,verb_len) ) - op = NOMATCH; + case 'S': /* (*SKIP) */ + if ( CHECK_WORD("SKIP",start_verb,verb_len) ) + op = SKIP; + break; + case 'T': /* (*THEN) */ + /* [19:06] <TimToady> :: is then */ + if ( CHECK_WORD("THEN",start_verb,verb_len) ) { + op = CUTGROUP; + RExC_seen |= REG_SEEN_CUTGROUP; + } break; } if ( ! op ) { @@ -352,6 +352,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define REG_SEEN_RECURSE 0x00000020 #define REG_TOP_LEVEL_BRANCHES 0x00000040 #define REG_SEEN_VERBARG 0x00000080 +#define REG_SEEN_CUTGROUP 0x00000100 START_EXTERN_C diff --git a/regcomp.pl b/regcomp.pl index 700268d83d..14c2eb7be2 100644 --- a/regcomp.pl +++ b/regcomp.pl @@ -55,12 +55,9 @@ while (<DESC>) { } } -my ($width,$rwidth,$twidth)=(0,0,0); -for (1..@name) { - $width=length($name[$_]) if $name[$_] and $width<length($name[$_]); - $twidth=length($type[$_]) if $type[$_] and $twidth<length($type[$_]); - $rwidth=$width if $_ == $lastregop; -} +# use fixed width to keep the diffs between regcomp.pl recompiles +# as small as possible. +my ($width,$rwidth,$twidth)=(22,12,9); $lastregop ||= $ind; my $tot = $ind; close DESC; diff --git a/regcomp.sym b/regcomp.sym index 074af13284..d6b97d5c0b 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -146,21 +146,21 @@ RENUM BRANCHJ,off 1 1 Group with independently numbered parens. # inline charclass data (ascii only), the 'C' store it in the structure. # NOTE: the relative order of the TRIE-like regops is signifigant -TRIE TRIE, trie 1 Match many EXACT(FL?)? at once. flags==type -TRIEC TRIE, trie charclass Same as TRIE, but with embedded charclass data +TRIE TRIE, trie 1 Match many EXACT(FL?)? at once. flags==type +TRIEC TRIE,trie charclass Same as TRIE, but with embedded charclass data # For start classes, contains an added fail table. -AHOCORASICK TRIE, trie 1 Aho Corasick stclass. flags==type -AHOCORASICKC TRIE, trie charclass Same as AHOCORASICK, but with embedded charclass data +AHOCORASICK TRIE, trie 1 Aho Corasick stclass. flags==type +AHOCORASICKC TRIE,trie charclass Same as AHOCORASICK, but with embedded charclass data #*Regex Subroutines (65..66) -GOSUB GOSUB, num/ofs 2L recurse to paren arg1 at (signed) ofs arg2 +GOSUB GOSUB, num/ofs 2L recurse to paren arg1 at (signed) ofs arg2 GOSTART GOSTART, no recurse to start of pattern #*Named references (67..69) -NREF NREF, no-sv 1 Match some already matched string -NREFF NREF, no-sv 1 Match already matched string, folded -NREFFL NREF, no-sv 1 Match already matched string, folded in loc. +NREF NREF, no-sv 1 Match some already matched string +NREFF NREF, no-sv 1 Match already matched string, folded +NREFFL NREF, no-sv 1 Match already matched string, folded in loc. #*Special conditionals (70..72) @@ -168,16 +168,19 @@ NGROUPP NGROUPP, no-sv 1 Whether the group matched. INSUBP INSUBP, num 1 Whether we are in a specific recurse. DEFINEP DEFINEP, none 1 Never execute directly. -#*Bactracking +#*Bactracking Verbs ENDLIKE ENDLIKE, none Used only for the type field of verbs OPFAIL ENDLIKE, none Same as (?!) ACCEPT ENDLIKE, parno 1 Accepts the current matched string. + + +#*Verbs With Arguments VERB VERB, no-sv 1 Used only for the type field of verbs -NOMATCH VERB, no-sv 1 Pattern fails at this startpoint if no-backtracking through this +PRUNE VERB, no-sv 1 Pattern fails at this startpoint if no-backtracking through this MARKPOINT VERB, no-sv 1 Push the current location for rollback by cut. -CUT VERB, no-sv 1 On failure cut the string at the mark. +SKIP VERB, no-sv 1 On failure skip forward (to the mark) before retrying COMMIT VERB, no-sv 1 Pattern fails outright if backtracking through this - +CUTGROUP VERB, no-sv 1 On failure go to the next alternation in the group # NEW STUFF ABOVE THIS LINE -- Please update counts below. @@ -217,4 +220,5 @@ IFMATCH A:FAIL CURLY B_min_known,B_min,B_max:FAIL COMMIT next:FAIL MARKPOINT next:FAIL -CUT next:FAIL +SKIP next:FAIL +CUTGROUP next:FAIL @@ -2418,9 +2418,14 @@ regmatch(), slabs allocated since entry are freed. DEBUG_STATE_r({ \ DUMP_EXEC_POS(locinput, scan, do_utf8); \ PerlIO_printf(Perl_debug_log, \ - " %*s"pp" %s\n", \ + " %*s"pp" %s%s%s%s%s\n", \ depth*2, "", \ - reg_name[st->resume_state] ); \ + reg_name[st->resume_state], \ + ((st==yes_state||st==mark_state) ? "[" : ""), \ + ((st==yes_state) ? "Y" : ""), \ + ((st==mark_state) ? "M" : ""), \ + ((st==yes_state||st==mark_state) ? "]" : "") \ + ); \ }); @@ -2574,14 +2579,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* mark_state piggy backs on the yes_state logic so that when we unwind the stack on success we can update the mark_state as we go */ regmatch_state *mark_state = NULL; /* last mark state we have seen */ + regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */ U32 state_num; - bool no_final = 0; + bool no_final = 0; /* prevent failure from backtracking? */ + bool do_cutgroup = 0; /* no_final only until next branch/trie entry */ char *startpoint = PL_reginput; - SV *popmark = NULL; - SV *sv_commit = NULL; - unsigned int lastopen = 0; + SV *popmark = NULL; /* are we looking for a mark? */ + SV *sv_commit = NULL; /* last mark name seen in failure */ + SV *sv_yes_mark = NULL; /* last mark name we have seen + during a successfull match */ + U32 lastopen = 0; /* last open we saw */ + bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; + /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop * iteration, and are not preserved or restored by state pushes/pops @@ -2881,9 +2892,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) }} /* FALL THROUGH */ - case TRIE_next_fail: /* we failed - try next alterative */ - + if (do_cutgroup) { + do_cutgroup = 0; + no_final = 0; + } if ( ST.accepted == 1 ) { /* only one choice left - just continue */ DEBUG_EXECUTE_r({ @@ -2902,23 +2915,35 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) PL_reginput = (char *)ST.accept_buff[ 0 ].endpos; /* in this case we free tmps/leave before we call regmatch as we wont be using accept_buff again. */ - FREETMPS; - LEAVE; + locinput = PL_reginput; nextchr = UCHARAT(locinput); - - if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum]) - scan = ST.B; - else - scan = ST.me + ST.jump[ST.accept_buff[0].wordnum]; + if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum]) + scan = ST.B; + else + scan = ST.me + ST.jump[ST.accept_buff[0].wordnum]; + if (!has_cutgroup) { + FREETMPS; + LEAVE; + } else { + ST.accepted--; + PUSH_YES_STATE_GOTO(TRIE_next, scan); + } continue; /* execute rest of RE */ } if (!ST.accepted-- ) { + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sTRIE failed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + PL_colors[5] ); + }); FREETMPS; LEAVE; - sayNO; + sayNO_SILENT; } /* @@ -2976,16 +3001,26 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } PL_reginput = (char *)ST.accept_buff[ best ].endpos; if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) { - PUSH_STATE_GOTO(TRIE_next, ST.B); + scan = ST.B; /* NOTREACHED */ } else { - PUSH_STATE_GOTO(TRIE_next, ST.me + ST.jump[ST.accept_buff[best].wordnum]); + scan = ST.me + ST.jump[ST.accept_buff[best].wordnum]; /* NOTREACHED */ } + if (has_cutgroup) { + PUSH_YES_STATE_GOTO(TRIE_next, scan); + /* NOTREACHED */ + } else { + PUSH_STATE_GOTO(TRIE_next, scan); + /* NOTREACHED */ + } /* NOTREACHED */ } /* NOTREACHED */ - + case TRIE_next: + FREETMPS; + LEAVE; + sayYES; #undef ST case EXACT: { @@ -4024,19 +4059,45 @@ NULL case BRANCH: /* /(...|A|...)/ */ scan = NEXTOPER(scan); /* scan now points to inner node */ - if (!next || (OP(next) != BRANCH && OP(next) != BRANCHJ)) + if ((!next || (OP(next) != BRANCH && OP(next) != BRANCHJ)) + && !has_cutgroup) + { /* last branch; skip state push and jump direct to node */ continue; + } ST.lastparen = *PL_reglastparen; ST.next_branch = next; REGCP_SET(ST.cp); PL_reginput = locinput; /* Now go into the branch */ - PUSH_STATE_GOTO(BRANCH_next, scan); + if (has_cutgroup) { + PUSH_YES_STATE_GOTO(BRANCH_next, scan); + } else { + PUSH_STATE_GOTO(BRANCH_next, scan); + } /* NOTREACHED */ - + case CUTGROUP: + PL_reginput = locinput; + sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL : + (SV*)rex->data->data[ ARG( scan ) ]; + PUSH_STATE_GOTO(CUTGROUP_next,next); + /* NOTREACHED */ + case CUTGROUP_next_fail: + do_cutgroup = 1; + no_final = 1; + if (st->u.mark.mark_name) + sv_commit = st->u.mark.mark_name; + sayNO; + /* NOTREACHED */ + case BRANCH_next: + sayYES; + /* NOTREACHED */ case BRANCH_next_fail: /* that branch failed; try the next, if any */ + if (do_cutgroup) { + do_cutgroup = 0; + no_final = 0; + } REGCP_UNWIND(ST.cp); for (n = *PL_reglastparen; n > ST.lastparen; n--) PL_regendp[n] = -1; @@ -4044,8 +4105,16 @@ NULL /*dmq: *PL_reglastcloseparen = n; */ scan = ST.next_branch; /* no more branches? */ - if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) - sayNO; + if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { + DEBUG_EXECUTE_r({ + PerlIO_printf( Perl_debug_log, + "%*s %sBRANCH failed...%s\n", + REPORT_CODE_OFF+depth*2, "", + PL_colors[4], + PL_colors[5] ); + }); + sayNO_SILENT; + } continue; /* execute next BRANCH[J] op */ /* NOTREACHED */ @@ -4658,10 +4727,10 @@ NULL case COMMIT: reginfo->cutpoint = PL_regeol; /* FALLTHROUGH */ - case NOMATCH: + case PRUNE: PL_reginput = locinput; if (!scan->flags) - sv_commit = (SV*)rex->data->data[ ARG( scan ) ]; + sv_yes_mark = sv_commit = (SV*)rex->data->data[ ARG( scan ) ]; PUSH_STATE_GOTO(COMMIT_next,next); /* NOTREACHED */ case COMMIT_next_fail: @@ -4674,8 +4743,8 @@ NULL #define ST st->u.mark case MARKPOINT: ST.prev_mark = mark_state; - ST.mark_name = scan->flags ? &PL_sv_yes : - (SV*)rex->data->data[ ARG( scan ) ]; + ST.mark_name = sv_commit = sv_yes_mark + = (SV*)rex->data->data[ ARG( scan ) ]; mark_state = st; ST.mark_loc = PL_reginput = locinput; PUSH_YES_STATE_GOTO(MARKPOINT_next,next); @@ -4685,9 +4754,7 @@ NULL sayYES; /* NOTREACHED */ case MARKPOINT_next_fail: - if (popmark && ( popmark == &PL_sv_yes || - (ST.mark_name != &PL_sv_yes && - sv_eq(ST.mark_name,popmark)))) + if (popmark && sv_eq(ST.mark_name,popmark)) { if (ST.mark_loc > startpoint) reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); @@ -4695,40 +4762,58 @@ NULL sv_commit = ST.mark_name; DEBUG_EXECUTE_r({ - if (sv_commit != &PL_sv_yes) - PerlIO_printf(Perl_debug_log, + PerlIO_printf(Perl_debug_log, "%*s %ssetting cutpoint to mark:%"SVf"...%s\n", REPORT_CODE_OFF+depth*2, "", PL_colors[4], sv_commit, PL_colors[5]); - else - PerlIO_printf(Perl_debug_log, - "%*s %ssetting cutpoint to mark...%s\n", - REPORT_CODE_OFF+depth*2, "", - PL_colors[4], PL_colors[5]); }); } mark_state = ST.prev_mark; + sv_yes_mark = mark_state ? + mark_state->u.mark.mark_name : NULL; sayNO; /* NOTREACHED */ - case CUT: - ST.mark_name = scan->flags ? &PL_sv_yes : - (SV*)rex->data->data[ ARG( scan ) ]; - if (mark_state) { - ST.mark_loc = NULL; - } else { + case SKIP: + PL_reginput = locinput; + if (scan->flags) { + /* (*CUT) : if we fail we cut here*/ + ST.mark_name = NULL; ST.mark_loc = locinput; + PUSH_STATE_GOTO(SKIP_next,next); + } else { + /* (*CUT:NAME) : if there is a (*MARK:NAME) fail where it was, + otherwise do nothing. Meaning we need to scan + */ + regmatch_state *cur = mark_state; + SV *find = (SV*)rex->data->data[ ARG( scan ) ]; + + while (cur) { + if ( sv_eq( cur->u.mark.mark_name, + find ) ) + { + ST.mark_name = find; + PUSH_STATE_GOTO( SKIP_next, next ); + } + cur = cur->u.mark.prev_mark; + } } - PL_reginput = locinput; - PUSH_STATE_GOTO(CUT_next,next); - /* NOTREACHED */ - case CUT_next_fail: - if (ST.mark_loc) { + /* Didn't find our (*MARK:NAME) so ignore this (*CUT:NAME) */ + break; + case SKIP_next_fail: + if (ST.mark_name) { + /* (*CUT:NAME) - Set up to search for the name as we + collapse the stack*/ + popmark = ST.mark_name; + } else { + /* (*CUT) - No name, we cut here.*/ if (ST.mark_loc > startpoint) reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); - sv_commit = ST.mark_name; - } else { - popmark = ST.mark_name; - } + /* but we set sv_commit to latest mark_name if there + is one so they can test to see how things lead to this + cut */ + if (mark_state) + sv_commit=mark_state->u.mark.mark_name; + } no_final = 1; sayNO; /* NOTREACHED */ @@ -4738,10 +4823,12 @@ NULL PerlIO_printf(Perl_error_log, "%"UVxf" %d\n", PTR2UV(scan), OP(scan)); Perl_croak(aTHX_ "regexp memory corruption"); - } + + } /* end switch */ - scan = next; - continue; + /* switch break jumps here */ + scan = next; /* prepare to execute the next op and ... */ + continue; /* ... jump back to the top, reusing st */ /* NOTREACHED */ push_yes_state: @@ -4834,7 +4921,10 @@ yes: yes_state = st->u.yes.prev_yes_state; PL_regmatch_state = st; - + if (no_final) { + locinput= st->locinput; + nextchr = UCHARAT(locinput); + } state_num = st->resume_state + no_final; goto reenter_switch; } @@ -4884,12 +4974,19 @@ no_silent: final_exit: if (rex->reganch & ROPT_VERBARG_SEEN) { - SV *sv = get_sv("REGERROR", 1); - if (result) + SV *sv_err = get_sv("REGERROR", 1); + SV *sv_mrk = get_sv("REGMARK", 1); + if (result) { sv_commit = &PL_sv_no; - else if (!sv_commit) - sv_commit = &PL_sv_yes; - sv_setsv(sv, sv_commit); + if (!sv_yes_mark) + sv_yes_mark = &PL_sv_yes; + } else { + if (!sv_commit) + sv_commit = &PL_sv_yes; + sv_yes_mark = &PL_sv_no; + } + sv_setsv(sv_err, sv_commit); + sv_setsv(sv_mrk, sv_yes_mark); } /* restore original high-water mark */ PL_regmatch_slab = orig_slab; @@ -100,15 +100,15 @@ typedef struct regexp_engine { #define ROPT_SANY_SEEN ROPT_CANY_SEEN /* src bckwrd cmpt */ #define ROPT_GPOS_CHECK (ROPT_GPOS_SEEN|ROPT_ANCH_GPOS) -/* 0xf800 of reganch is used by PMf_COMPILETIME */ +/* 0xF800 of reganch is used by PMf_COMPILETIME */ #define ROPT_UTF8 0x00010000 #define ROPT_NAUGHTY 0x00020000 /* how exponential is this pattern? */ #define ROPT_COPY_DONE 0x00040000 /* subbeg is a copy of the string */ #define ROPT_TAINTED_SEEN 0x00080000 #define ROPT_MATCH_UTF8 0x10000000 /* subbeg is utf-8 */ -#define ROPT_RECURSE_SEEN 0x20000000 -#define ROPT_VERBARG_SEEN 0x40000000 +#define ROPT_VERBARG_SEEN 0x20000000 +#define ROPT_CUTGROUP_SEEN 0x40000000 #define RE_USE_INTUIT_NOML 0x00100000 /* Best to intuit before matching */ #define RE_USE_INTUIT_ML 0x00200000 @@ -124,6 +124,7 @@ typedef struct regexp_engine { #define REINT_AUTORITATIVE (REINT_AUTORITATIVE_NOML|REINT_AUTORITATIVE_ML) #define REINT_ONCE (REINT_ONCE_NOML|REINT_ONCE_ML) +#define RX_HAS_CUTGROUP(prog) ((prog)->reganch & ROPT_CUTGROUP_SEEN) #define RX_MATCH_TAINTED(prog) ((prog)->reganch & ROPT_TAINTED_SEEN) #define RX_MATCH_TAINTED_on(prog) ((prog)->reganch |= ROPT_TAINTED_SEEN) #define RX_MATCH_TAINTED_off(prog) ((prog)->reganch &= ~ROPT_TAINTED_SEEN) @@ -229,6 +230,8 @@ typedef struct regmatch_state { } yes; struct { + /* this first element must match u.yes */ + struct regmatch_state *prev_yes_state; reg_trie_accepted *accept_buff; U32 accepted; /* how many accepting states we have seen */ U16 *jump; /* positive offsets from me */ @@ -279,6 +282,8 @@ typedef struct regmatch_state { } whilem; struct { + /* this first element must match u.yes */ + struct regmatch_state *prev_yes_state; U32 lastparen; regnode *next_branch; /* next branch node */ CHECKPOINT cp; diff --git a/regnodes.h b/regnodes.h index 005e409ab5..bbb49db983 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 82 -#define REGMATCH_STATE_MAX 118 +#define REGNODE_MAX 83 +#define REGMATCH_STATE_MAX 121 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -86,12 +86,13 @@ #define OPFAIL 74 /* 0x4a Same as (?!) */ #define ACCEPT 75 /* 0x4b Accepts the current matched string. */ #define VERB 76 /* 0x4c no-sv 1 Used only for the type field of verbs */ -#define NOMATCH 77 /* 0x4d Pattern fails at this startpoint if no-backtracking through this */ +#define PRUNE 77 /* 0x4d Pattern fails at this startpoint if no-backtracking through this */ #define MARKPOINT 78 /* 0x4e Push the current location for rollback by cut. */ -#define CUT 79 /* 0x4f On failure cut the string at the mark. */ +#define SKIP 79 /* 0x4f On failure skip forward (to the mark) before retrying */ #define COMMIT 80 /* 0x50 Pattern fails outright if backtracking through this */ -#define OPTIMIZED 81 /* 0x51 Placeholder for dump. */ -#define PSEUDO 82 /* 0x52 Pseudo opcode for internal use. */ +#define CUTGROUP 81 /* 0x51 On failure go to the next alternation in the group */ +#define OPTIMIZED 82 /* 0x52 Placeholder for dump. */ +#define PSEUDO 83 /* 0x53 Pseudo opcode for internal use. */ /* ------------ States ------------- */ #define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */ #define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */ @@ -127,8 +128,10 @@ #define COMMIT_next_fail (REGNODE_MAX + 32) /* state for COMMIT */ #define MARKPOINT_next (REGNODE_MAX + 33) /* state for MARKPOINT */ #define MARKPOINT_next_fail (REGNODE_MAX + 34) /* state for MARKPOINT */ -#define CUT_next (REGNODE_MAX + 35) /* state for CUT */ -#define CUT_next_fail (REGNODE_MAX + 36) /* state for CUT */ +#define SKIP_next (REGNODE_MAX + 35) /* state for SKIP */ +#define SKIP_next_fail (REGNODE_MAX + 36) /* state for SKIP */ +#define CUTGROUP_next (REGNODE_MAX + 37) /* state for CUTGROUP */ +#define CUTGROUP_next_fail (REGNODE_MAX + 38) /* state for CUTGROUP */ /* PL_regkind[] What type of regop or state is this. */ @@ -213,10 +216,11 @@ EXTCONST U8 PL_regkind[] = { ENDLIKE, /* OPFAIL */ ENDLIKE, /* ACCEPT */ VERB, /* VERB */ - VERB, /* NOMATCH */ + VERB, /* PRUNE */ VERB, /* MARKPOINT */ - VERB, /* CUT */ + VERB, /* SKIP */ VERB, /* COMMIT */ + VERB, /* CUTGROUP */ NOTHING, /* OPTIMIZED */ PSEUDO, /* PSEUDO */ /* ------------ States ------------- */ @@ -254,8 +258,10 @@ EXTCONST U8 PL_regkind[] = { COMMIT, /* COMMIT_next_fail */ MARKPOINT, /* MARKPOINT_next */ MARKPOINT, /* MARKPOINT_next_fail */ - CUT, /* CUT_next */ - CUT, /* CUT_next_fail */ + SKIP, /* SKIP_next */ + SKIP, /* SKIP_next_fail */ + CUTGROUP, /* CUTGROUP_next */ + CUTGROUP, /* CUTGROUP_next_fail */ }; #endif @@ -340,10 +346,11 @@ static const U8 regarglen[] = { 0, /* OPFAIL */ EXTRA_SIZE(struct regnode_1), /* ACCEPT */ 0, /* VERB */ - EXTRA_SIZE(struct regnode_1), /* NOMATCH */ + EXTRA_SIZE(struct regnode_1), /* PRUNE */ EXTRA_SIZE(struct regnode_1), /* MARKPOINT */ - EXTRA_SIZE(struct regnode_1), /* CUT */ + EXTRA_SIZE(struct regnode_1), /* SKIP */ EXTRA_SIZE(struct regnode_1), /* COMMIT */ + EXTRA_SIZE(struct regnode_1), /* CUTGROUP */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -428,10 +435,11 @@ static const char reg_off_by_arg[] = { 0, /* OPFAIL */ 0, /* ACCEPT */ 0, /* VERB */ - 0, /* NOMATCH */ + 0, /* PRUNE */ 0, /* MARKPOINT */ - 0, /* CUT */ + 0, /* SKIP */ 0, /* COMMIT */ + 0, /* CUTGROUP */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -517,12 +525,13 @@ const char * reg_name[] = { "OPFAIL", /* 0x4a */ "ACCEPT", /* 0x4b */ "VERB", /* 0x4c */ - "NOMATCH", /* 0x4d */ + "PRUNE", /* 0x4d */ "MARKPOINT", /* 0x4e */ - "CUT", /* 0x4f */ + "SKIP", /* 0x4f */ "COMMIT", /* 0x50 */ - "OPTIMIZED", /* 0x51 */ - "PSEUDO", /* 0x52 */ + "CUTGROUP", /* 0x51 */ + "OPTIMIZED", /* 0x52 */ + "PSEUDO", /* 0x53 */ /* ------------ States ------------- */ "TRIE_next", /* REGNODE_MAX +0x01 */ "TRIE_next_fail", /* REGNODE_MAX +0x02 */ @@ -558,8 +567,10 @@ const char * reg_name[] = { "COMMIT_next_fail", /* REGNODE_MAX +0x20 */ "MARKPOINT_next", /* REGNODE_MAX +0x21 */ "MARKPOINT_next_fail", /* REGNODE_MAX +0x22 */ - "CUT_next", /* REGNODE_MAX +0x23 */ - "CUT_next_fail", /* REGNODE_MAX +0x24 */ + "SKIP_next", /* REGNODE_MAX +0x23 */ + "SKIP_next_fail", /* REGNODE_MAX +0x24 */ + "CUTGROUP_next", /* REGNODE_MAX +0x25 */ + "CUTGROUP_next_fail", /* REGNODE_MAX +0x26 */ }; #endif /* DEBUGGING */ #else diff --git a/t/op/pat.t b/t/op/pat.t index 0de3b14b41..0bc0eb675c 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -3851,65 +3851,65 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { iseq($count,1,"should have matched once only [RT#36046]"); } -{ # Test the (*NOMATCH) pattern +{ # Test the (*PRUNE) pattern our $count = 0; 'aaab'=~/a+b?(?{$count++})(*FAIL)/; - iseq($count,9,"expect 9 for no (*NOMATCH)"); + iseq($count,9,"expect 9 for no (*PRUNE)"); $count = 0; - 'aaab'=~/a+b?(*NOMATCH)(?{$count++})(*FAIL)/; - iseq($count,3,"expect 3 with (*NOMATCH)"); + 'aaab'=~/a+b?(*PRUNE)(?{$count++})(*FAIL)/; + iseq($count,3,"expect 3 with (*PRUNE)"); local $_='aaab'; $count=0; - 1 while /.(*NOMATCH)(?{$count++})(*FAIL)/g; - iseq($count,4,"/.(*NOMATCH)/"); + 1 while /.(*PRUNE)(?{$count++})(*FAIL)/g; + iseq($count,4,"/.(*PRUNE)/"); $count = 0; - 'aaab'=~/a+b?(??{'(*NOMATCH)'})(?{$count++})(*FAIL)/; - iseq($count,3,"expect 3 with (*NOMATCH)"); + 'aaab'=~/a+b?(??{'(*PRUNE)'})(?{$count++})(*FAIL)/; + iseq($count,3,"expect 3 with (*PRUNE)"); local $_='aaab'; $count=0; - 1 while /.(??{'(*NOMATCH)'})(?{$count++})(*FAIL)/g; - iseq($count,4,"/.(*NOMATCH)/"); + 1 while /.(??{'(*PRUNE)'})(?{$count++})(*FAIL)/g; + iseq($count,4,"/.(*PRUNE)/"); } -{ # Test the (*CUT) pattern +{ # Test the (*SKIP) pattern our $count = 0; - 'aaab'=~/a+b?(*CUT)(?{$count++})(*FAIL)/; - iseq($count,1,"expect 1 with (*CUT)"); + 'aaab'=~/a+b?(*SKIP)(?{$count++})(*FAIL)/; + iseq($count,1,"expect 1 with (*SKIP)"); local $_='aaab'; $count=0; - 1 while /.(*CUT)(?{$count++})(*FAIL)/g; - iseq($count,4,"/.(*CUT)/"); + 1 while /.(*SKIP)(?{$count++})(*FAIL)/g; + iseq($count,4,"/.(*SKIP)/"); $_='aaabaaab'; $count=0; our @res=(); - 1 while /(a+b?)(*CUT)(?{$count++; push @res,$1})(*FAIL)/g; - iseq($count,2,"Expect 2 with (*CUT)" ); - iseq("@res","aaab aaab","adjacent (*CUT) works as expected" ); + 1 while /(a+b?)(*SKIP)(?{$count++; push @res,$1})(*FAIL)/g; + iseq($count,2,"Expect 2 with (*SKIP)" ); + iseq("@res","aaab aaab","adjacent (*SKIP) works as expected" ); } -{ # Test the (*CUT) pattern +{ # Test the (*SKIP) pattern our $count = 0; - 'aaab'=~/a+b?(*MARK)(*CUT)(?{$count++})(*FAIL)/; - iseq($count,1,"expect 1 with (*CUT)"); + 'aaab'=~/a+b?(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/; + iseq($count,1,"expect 1 with (*SKIP)"); local $_='aaab'; $count=0; - 1 while /.(*MARK)(*CUT)(?{$count++})(*FAIL)/g; - iseq($count,4,"/.(*CUT)/"); + 1 while /.(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/g; + iseq($count,4,"/.(*SKIP)/"); $_='aaabaaab'; $count=0; our @res=(); - 1 while /(a+b?)(*MARK)(*CUT)(?{$count++; push @res,$1})(*FAIL)/g; - iseq($count,2,"Expect 2 with (*CUT)" ); - iseq("@res","aaab aaab","adjacent (*CUT) works as expected" ); + 1 while /(a+b?)(*MARK:foo)(*SKIP)(?{$count++; push @res,$1})(*FAIL)/g; + iseq($count,2,"Expect 2 with (*SKIP)" ); + iseq("@res","aaab aaab","adjacent (*SKIP) works as expected" ); } -{ # Test the (*CUT) pattern +{ # Test the (*SKIP) pattern our $count = 0; - 'aaab'=~/a*(*MARK:a)b?(*MARK:b)(*CUT:a)(?{$count++})(*FAIL)/; - iseq($count,3,"expect 3 with *MARK:a)b?(*MARK:b)(*CUT:a)"); + 'aaab'=~/a*(*MARK:a)b?(*MARK:b)(*SKIP:a)(?{$count++})(*FAIL)/; + iseq($count,3,"expect 3 with *MARK:a)b?(*MARK:b)(*SKIP:a)"); local $_='aaabaaab'; $count=0; our @res=(); - 1 while /(a*(*MARK:a)b?)(*MARK)(*CUT:a)(?{$count++; push @res,$1})(*FAIL)/g; - iseq($count,5,"Expect 5 with (*MARK:a)b?)(*MARK)(*CUT:a)" ); - iseq("@res","aaab b aaab b ","adjacent (*MARK:a)b?)(*MARK)(*CUT:a) works as expected" ); + 1 while /(a*(*MARK:a)b?)(*MARK:x)(*SKIP:a)(?{$count++; push @res,$1})(*FAIL)/g; + iseq($count,5,"Expect 5 with (*MARK:a)b?)(*MARK:x)(*SKIP:a)" ); + iseq("@res","aaab b aaab b ","adjacent (*MARK:a)b?)(*MARK:x)(*SKIP:a) works as expected" ); } { # Test the (*COMMIT) pattern our $count = 0; @@ -3931,8 +3931,10 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { our $REGERROR; for my $name ('',':foo') { - for my $pat ("(*NOMATCH$name)","(*MARK$name)(*CUT)", - "(*CUT$name)","(*COMMIT$name)") + for my $pat ("(*PRUNE$name)", + ($name? "(*MARK$name)" : "") + . "(*SKIP$name)", + "(*COMMIT$name)") { for my $suffix ('(*FAIL)','') { @@ -3952,8 +3954,10 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { our $REGERROR; for my $name ('',':foo') { - for my $pat ("(*NOMATCH$name)","(*MARK$name)(*CUT)", - "(*CUT$name)","(*COMMIT$name)") + for my $pat ("(*PRUNE$name)", + ($name? "(*MARK$name)" : "") + . "(*SKIP$name)", + "(*COMMIT$name)") { for my $suffix ('(*FAIL)','') { @@ -3982,6 +3986,13 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) { ok($s =~ m/$rex/); ok($s =~ m/^abc$/m); } +{ + #Mindnumbingly simple test of (*THEN) + for ("ABC","BAX") { + ok(/A (*THEN) X | B (*THEN) C/x,"Simple (*THEN) test"); + } +} + #------------------------------------------------------------------- # Keep the following tests last -- they may crash perl @@ -4008,5 +4019,4 @@ ok((q(a)x 100) =~ /^(??{'(.)'x 100})/, # Put new tests above the line, not here. # Don't forget to update this! -BEGIN{print "1..1347\n"}; - +BEGIN { print "1..1341\n" }; diff --git a/win32/Makefile b/win32/Makefile index a7e6431b91..e5faa76890 100644 --- a/win32/Makefile +++ b/win32/Makefile @@ -496,6 +496,7 @@ $(o).dll: .rc.res: $(RSC) -i.. $< + # # various targets @@ -922,7 +923,14 @@ all : .\config.h $(GLOBEXE) $(MINIMOD) $(CONFIGPM) $(UNIDATAFILES) $(PERLEXE) \ $(X2P) MakePPPort Extensions @echo Everything is up to date. '$(MAKE_BARE) test' to run test suite. -reonly : .\config.h $(GLOBEXE) $(MINIMOD) $(CONFIGPM) $(UNIDATAFILES) $(PERLEXE) \ +..\regnodes.h : ..\regcomp.sym + cd .. + regcomp.pl + cd win32 + +regnodes : ..\regnodes.h + +reonly : regnodes .\config.h $(GLOBEXE) $(MINIMOD) $(CONFIGPM) $(UNIDATAFILES) $(PERLEXE) \ $(X2P) Extensions_reonly @echo Perl and 're' are up to date. @@ -1302,17 +1310,11 @@ test-reonly : reonly utils $(PERLEXE) -I..\lib harness $(OPT) -re \bpat\b \breg \bre\b $(EXTRA) cd ..\win32 -regen : +regen : cd .. regen.pl cd win32 -regnodes : - cd .. - regcomp.pl - cd win32 - - test-notty : test-prep set PERL_SKIP_TTY_TEST=1 cd ..\t |