summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2007-01-10 21:33:39 +0100
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>2007-01-11 14:47:01 +0000
commitee9b8eaedac8053a01cc9281ada34dd182a8f7d0 (patch)
tree129df8e187e17fb664051b0f7f9a1b55d46fcecd
parentb4390064818aaae08b8f53f740ea62f7dd8517a1 (diff)
downloadperl-ee9b8eaedac8053a01cc9281ada34dd182a8f7d0.tar.gz
Add Regexp::Keep \K functionality to regex engine as well as add \v and \V, cleanup and more docs for regatom()
Message-ID: <9b18b3110701101133i46dc5fd0p1476a0f1dd1e9c5a@mail.gmail.com> (plus POD nits by Merijn and myself) p4raw-id: //depot/perl@29756
-rw-r--r--pod/perl595delta.pod19
-rw-r--r--pod/perlre.pod49
-rw-r--r--regcomp.c210
-rw-r--r--regcomp.sym10
-rw-r--r--regexec.c67
-rw-r--r--regexp.h4
-rw-r--r--regnodes.h29
-rwxr-xr-xt/op/pat.t52
8 files changed, 311 insertions, 129 deletions
diff --git a/pod/perl595delta.pod b/pod/perl595delta.pod
index 0497d55781..47fbaf51a1 100644
--- a/pod/perl595delta.pod
+++ b/pod/perl595delta.pod
@@ -107,9 +107,9 @@ would expect. This is considered a feature. :-) (Yves Orton)
=item Possessive Quantifiers
-Perl now supports the "possessive quantifier" syntax of the "atomic match"
+Perl now supports the "possessive quantifier" syntax of the "atomic match"
pattern. Basically a possessive quantifier matches as much as it can and never
-gives any back. Thus it can be used to control backtracking. The syntax is
+gives any back. Thus it can be used to control backtracking. The syntax is
similar to non-greedy matching, except instead of using a '?' as the modifier
the '+' is used. Thus C<?+>, C<*+>, C<++>, C<{min,max}+> are now legal
quantifiers. (Yves Orton)
@@ -129,6 +129,21 @@ that contain backreferences. (Yves Orton)
=back
+=item Regexp::Keep internalized
+
+The functionality of Jeff Pinyan's module Regexp::Keep has been added to
+the core. You can now use in regular expressions the special escape C<\K>
+as a way to do something like floating length positive lookbehind. It is
+also useful in substitutions like:
+
+ s/(foo)bar/$1/g
+
+that can now be converted to
+
+ s/foo\Kbar//g
+
+which is much more efficient.
+
=head2 The C<_> prototype
A new prototype character has been added. C<_> is equivalent to C<$> (it
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 6c2049628c..7133a02c96 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -255,6 +255,9 @@ X<word> X<whitespace>
\N{name} Named unicode character, or unicode escape
\x12 Hexadecimal escape sequence
\x{1234} Long hexadecimal escape sequence
+ \K Keep the stuff left of the \K, don't include it in $&
+ \v Shortcut for (*PRUNE)
+ \V Shortcut for (*SKIP)
A C<\w> matches a single alphanumeric character (an alphabetic
character, or a decimal digit) or C<_>, not a whole word. Use C<\w+>
@@ -690,6 +693,17 @@ is equivalent to the more verbose
/(?:(?s-i)more.*than).*million/i
+=item Look-Around Assertions
+X<look-around assertion> X<lookaround assertion> X<look-around> X<lookaround>
+
+Look-around assertions are zero width patterns which match a specific
+pattern without including it in C<$&>. Positive assertions match when
+their subpattern matches, negative assertions match when their subpattern
+fails. Look-behind matches text up to the current match position,
+look-ahead matches text following the current match position.
+
+=over 4
+
=item C<(?=pattern)>
X<(?=)> X<look-ahead, positive> X<lookahead, positive>
@@ -716,13 +730,30 @@ Sometimes it's still easier just to say:
For look-behind see below.
-=item C<(?<=pattern)>
-X<(?<=)> X<look-behind, positive> X<lookbehind, positive>
+=item C<(?<=pattern)> C<\K>
+X<(?<=)> X<look-behind, positive> X<lookbehind, positive> X<\K>
A zero-width positive look-behind assertion. For example, C</(?<=\t)\w+/>
matches a word that follows a tab, without including the tab in C<$&>.
Works only for fixed-width look-behind.
+There is a special form of this construct, called C<\K>, which causes the
+regex engine to "keep" everything it had matched prior to the C<\K> and
+not include it in C<$&>. This effectively provides variable length
+look-behind. The use of C<\K> inside of another look-around assertion
+is allowed, but the behaviour is currently not well defined.
+
+For various reasons C<\K> may be signifigantly more efficient than the
+equivalent C<< (?<=...) >> construct, and it is especially useful in
+situations where you want to efficiently remove something following
+something else in a string. For instance
+
+ s/(foo)bar/$1/g;
+
+can be rewritten as the much more efficient
+
+ s/foo\Kbar//g;
+
=item C<(?<!pattern)>
X<(?<!)> X<look-behind, negative> X<lookbehind, negative>
@@ -730,6 +761,8 @@ A zero-width negative look-behind assertion. For example C</(?<!bar)foo/>
matches any occurrence of "foo" that does not follow "bar". Works
only for fixed-width look-behind.
+=back
+
=item C<(?'NAME'pattern)>
=item C<< (?<NAME>pattern) >>
@@ -761,7 +794,7 @@ its Unicode extension (see L<utf8>),
though it isn't extended by the locale (see L<perllocale>).
B<NOTE:> In order to make things easier for programmers with experience
-with the Python or PCRE regex engines the pattern C<< (?P<NAME>pattern) >>
+with the Python or PCRE regex engines the pattern C<< (?PE<lt>NAMEE<gt>pattern) >>
maybe be used instead of C<< (?<NAME>pattern) >>; however this form does not
support the use of single quotes as a delimiter for the name. This is
only available in Perl 5.10 or later.
@@ -1251,7 +1284,7 @@ argument, then C<$REGERROR> and C<$REGMARK> are not touched at all.
=over 4
=item C<(*PRUNE)> C<(*PRUNE:NAME)>
-X<(*PRUNE)> X<(*PRUNE:NAME)>
+X<(*PRUNE)> X<(*PRUNE:NAME)> X<\v>
This zero-width pattern prunes the backtracking tree at the current point
when backtracked into on failure. Consider the pattern C<A (*PRUNE) B>,
@@ -1261,6 +1294,8 @@ continues in B, which may also backtrack as necessary; however, should B
not match, then no further backtracking will take place, and the pattern
will fail outright at the current starting position.
+As a shortcut, X<\v> is exactly equivalent to C<(*PRUNE)>.
+
The following example counts all the possible matching strings in a
pattern (without actually matching any of them).
@@ -1312,6 +1347,8 @@ of this pattern. This effectively means that the regex engine "skips" forward
to this position on failure and tries to match again, (assuming that
there is sufficient room to match).
+As a shortcut X<\V> is exactly equivalent to C<(*SKIP)>.
+
The name of the C<(*SKIP:NAME)> pattern has special significance. If a
C<(*MARK:NAME)> was encountered while matching, then it is that position
which is used as the "skip point". If no C<(*MARK)> of that name was
@@ -2008,7 +2045,7 @@ Perl specific syntax, the following are legal in Perl 5.10:
=over 4
-=item C<< (?P<NAME>pattern) >>
+=item C<< (?PE<lt>NAMEE<gt>pattern) >>
Define a named capture buffer. Equivalent to C<< (?<NAME>pattern) >>.
@@ -2020,7 +2057,7 @@ Backreference to a named capture buffer. Equivalent to C<< \g{NAME} >>.
Subroutine call to a named capture buffer. Equivalent to C<< (?&NAME) >>.
-=back 4
+=back
=head1 BUGS
diff --git a/regcomp.c b/regcomp.c
index c1c141a0ab..05d2c091b1 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -6223,15 +6223,26 @@ S_reg_recode(pTHX_ const char value, SV **encp)
/*
- regatom - the lowest level
- *
- * Optimization: gobbles an entire sequence of ordinary characters so that
- * it can turn them into a single node, which is smaller to store and
- * faster to run. Backslashed characters are exceptions, each becoming a
- * separate node; the code is simpler that way and it's not worth fixing.
- *
- * [Yes, it is worth fixing, some scripts can run twice the speed.]
- * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
- */
+
+ Try to identify anything special at the start of the pattern. If there
+ is, then handle it as required. This may involve generating a single regop,
+ such as for an assertion; or it may involve recursing, such as to
+ handle a () structure.
+
+ If the string doesn't start with something special then we gobble up
+ as much literal text as we can.
+
+ Once we have been able to handle whatever type of thing started the
+ sequence, we return.
+
+ Note: we have to be careful with escapes, as they can be both literal
+ and special, and in the case of \10 and friends can either, depending
+ on context. Specifically there are two seperate switches for handling
+ escape sequences, with the one for handling literal escapes requiring
+ a dummy entry for all of the special escapes that are actually handled
+ by the other.
+*/
+
STATIC regnode *
S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
@@ -6243,6 +6254,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
DEBUG_PARSE("atom");
*flagp = WORST; /* Tentatively. */
+
tryagain:
switch (*RExC_parse) {
case '^':
@@ -6329,99 +6341,103 @@ tryagain:
vFAIL("Quantifier follows nothing");
break;
case '\\':
+ /* Special Escapes
+
+ This switch handles escape sequences that resolve to some kind
+ of special regop and not to literal text. Escape sequnces that
+ resolve to literal text are handled below in the switch marked
+ "Literal Escapes".
+
+ Every entry in this switch *must* have a corresponding entry
+ in the literal escape switch. However, the opposite is not
+ required, as the default for this switch is to jump to the
+ literal text handling code.
+ */
switch (*++RExC_parse) {
+ /* Special Escapes */
case 'A':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, SBOL);
*flagp |= SIMPLE;
- nextchar(pRExC_state);
- Set_Node_Length(ret, 2); /* MJD */
- break;
+ goto finish_meta_pat;
case 'G':
ret = reg_node(pRExC_state, GPOS);
RExC_seen |= REG_SEEN_GPOS;
*flagp |= SIMPLE;
- nextchar(pRExC_state);
- Set_Node_Length(ret, 2); /* MJD */
- break;
+ goto finish_meta_pat;
+ case 'K':
+ RExC_seen_zerolen++;
+ ret = reg_node(pRExC_state, KEEPS);
+ *flagp |= SIMPLE;
+ goto finish_meta_pat;
case 'Z':
ret = reg_node(pRExC_state, SEOL);
*flagp |= SIMPLE;
RExC_seen_zerolen++; /* Do not optimize RE away */
- nextchar(pRExC_state);
- break;
+ goto finish_meta_pat;
case 'z':
ret = reg_node(pRExC_state, EOS);
*flagp |= SIMPLE;
RExC_seen_zerolen++; /* Do not optimize RE away */
- nextchar(pRExC_state);
- Set_Node_Length(ret, 2); /* MJD */
- break;
+ goto finish_meta_pat;
case 'C':
ret = reg_node(pRExC_state, CANY);
RExC_seen |= REG_SEEN_CANY;
*flagp |= HASWIDTH|SIMPLE;
- nextchar(pRExC_state);
- Set_Node_Length(ret, 2); /* MJD */
- break;
+ goto finish_meta_pat;
case 'X':
ret = reg_node(pRExC_state, CLUMP);
*flagp |= HASWIDTH;
- nextchar(pRExC_state);
- Set_Node_Length(ret, 2); /* MJD */
- break;
+ goto finish_meta_pat;
case 'w':
ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
*flagp |= HASWIDTH|SIMPLE;
- nextchar(pRExC_state);
- Set_Node_Length(ret, 2); /* MJD */
- break;
+ goto finish_meta_pat;
case 'W':
ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
*flagp |= HASWIDTH|SIMPLE;
- nextchar(pRExC_state);
- Set_Node_Length(ret, 2); /* MJD */
- break;
+ goto finish_meta_pat;
case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
*flagp |= SIMPLE;
- nextchar(pRExC_state);
- Set_Node_Length(ret, 2); /* MJD */
- break;
+ goto finish_meta_pat;
case 'B':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
*flagp |= SIMPLE;
- nextchar(pRExC_state);
- Set_Node_Length(ret, 2); /* MJD */
- break;
+ goto finish_meta_pat;
case 's':
ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
*flagp |= HASWIDTH|SIMPLE;
- nextchar(pRExC_state);
- Set_Node_Length(ret, 2); /* MJD */
- break;
+ goto finish_meta_pat;
case 'S':
ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
*flagp |= HASWIDTH|SIMPLE;
- nextchar(pRExC_state);
- Set_Node_Length(ret, 2); /* MJD */
- break;
+ goto finish_meta_pat;
case 'd':
ret = reg_node(pRExC_state, DIGIT);
*flagp |= HASWIDTH|SIMPLE;
- nextchar(pRExC_state);
- Set_Node_Length(ret, 2); /* MJD */
- break;
+ goto finish_meta_pat;
case 'D':
ret = reg_node(pRExC_state, NDIGIT);
*flagp |= HASWIDTH|SIMPLE;
+ goto finish_meta_pat;
+ case 'v':
+ ret = reganode(pRExC_state, PRUNE, 0);
+ ret->flags = 1;
+ *flagp |= SIMPLE;
+ goto finish_meta_pat;
+ case 'V':
+ ret = reganode(pRExC_state, SKIP, 0);
+ ret->flags = 1;
+ *flagp |= SIMPLE;
+ finish_meta_pat:
nextchar(pRExC_state);
Set_Node_Length(ret, 2); /* MJD */
- break;
+ break;
case 'p':
case 'P':
{
@@ -6503,16 +6519,6 @@ tryagain:
}
break;
}
- case 'n':
- case 'r':
- case 't':
- case 'f':
- case 'e':
- case 'a':
- case 'x':
- case 'c':
- case '0':
- goto defchar;
case 'g':
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
@@ -6629,29 +6635,40 @@ tryagain:
case '|':
goto loopdone;
case '\\':
+ /* Literal Escapes Switch
+
+ This switch is meant to handle escape sequences that
+ resolve to a literal character.
+
+ Every escape sequence that represents something
+ else, like an assertion or a char class, is handled
+ in the switch marked 'Special Escapes' above in this
+ routine, but also has an entry here as anything that
+ isn't explicitly mentioned here will be treated as
+ an unescaped equivalent literal.
+ */
+
switch (*++p) {
- case 'A':
- case 'C':
- case 'X':
- case 'G':
- case 'g':
- case 'Z':
- case 'z':
- case 'w':
- case 'W':
- case 'b':
- case 'B':
- case 's':
- case 'S':
- case 'd':
- case 'D':
- case 'p':
- case 'P':
- case 'N':
- case 'R':
- case 'k':
+ /* These are all the special escapes. */
+ case 'A': /* Start assertion */
+ case 'b': case 'B': /* Word-boundary assertion*/
+ case 'C': /* Single char !DANGEROUS! */
+ case 'd': case 'D': /* digit class */
+ case 'g': case 'G': /* generic-backref, pos assertion */
+ case 'k': case 'K': /* named backref, keep marker */
+ case 'N': /* named char sequence */
+ case 'p': case 'P': /* unicode property */
+ case 's': case 'S': /* space class */
+ case 'v': case 'V': /* (*PRUNE) and (*SKIP) */
+ case 'w': case 'W': /* word class */
+ case 'X': /* eXtended Unicode "combining character sequence" */
+ case 'z': case 'Z': /* End of line/string assertion */
--p;
goto loopdone;
+
+ /* Anything after here is an escape that resolves to a
+ literal. (Except digits, which may or may not)
+ */
case 'n':
ender = '\n';
p++;
@@ -8213,26 +8230,27 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
if ( prog->paren_names ) {
- AV *list= (AV *)progi->data->data[progi->name_list_idx];
- SV **name= av_fetch(list, ARG(o), 0 );
- if (name)
- Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
- }
- } else if (k == NREF) {
- if ( prog->paren_names ) {
- AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
- SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
- I32 *nums=(I32*)SvPVX(sv_dat);
- SV **name= av_fetch(list, nums[0], 0 );
- I32 n;
- if (name) {
- for ( n=0; n<SvIVX(sv_dat); n++ ) {
- Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
- (n ? "," : ""), (IV)nums[n]);
+ if ( k != REF || OP(o) < NREF) {
+ AV *list= (AV *)progi->data->data[progi->name_list_idx];
+ SV **name= av_fetch(list, ARG(o), 0 );
+ if (name)
+ Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
+ }
+ else {
+ AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
+ SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
+ I32 *nums=(I32*)SvPVX(sv_dat);
+ SV **name= av_fetch(list, nums[0], 0 );
+ I32 n;
+ if (name) {
+ for ( n=0; n<SvIVX(sv_dat); n++ ) {
+ Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
+ (n ? "," : ""), (IV)nums[n]);
+ }
+ Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
}
- Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
}
- }
+ }
} else if (k == GOSUB)
Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
else if (k == VERB) {
diff --git a/regcomp.sym b/regcomp.sym
index 656988e10b..c57a386af7 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -158,9 +158,9 @@ GOSUB GOSUB, num/ofs 2L recurse to paren arg1 at (signed) ofs arg2
GOSTART GOSTART, no recurse to start of pattern
#*Named references (67..69)
-NREF NREF, no-sv 1 Match some already matched string
-NREFF NREF, no-sv 1 Match already matched string, folded
-NREFFL NREF, no-sv 1 Match already matched string, folded in loc.
+NREF REF, no-sv 1 Match some already matched string
+NREFF REF, no-sv 1 Match already matched string, folded
+NREFFL REF, no-sv 1 Match already matched string, folded in loc.
#*Special conditionals (70..72)
@@ -182,6 +182,9 @@ SKIP VERB, no-sv 1 On failure skip forward (to the mark) before retrying
COMMIT VERB, no-sv 1 Pattern fails outright if backtracking through this
CUTGROUP VERB, no-sv 1 On failure go to the next alternation in the group
+#*Control what to keep in $&.
+KEEPS KEEPS, no $& begins here.
+
# NEW STUFF ABOVE THIS LINE -- Please update counts below.
################################################################################
@@ -221,3 +224,4 @@ COMMIT next:FAIL
MARKPOINT next:FAIL
SKIP next:FAIL
CUTGROUP next:FAIL
+KEEPS next:FAIL
diff --git a/regexec.c b/regexec.c
index cae3244936..c475b9a55c 100644
--- a/regexec.c
+++ b/regexec.c
@@ -126,12 +126,27 @@
OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
OP(rn) == PLUS || OP(rn) == MINMOD || \
+ OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
(PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
)
+#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
-#define HAS_TEXT(rn) ( \
- PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
-)
+#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
+
+#if 0
+/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
+ we don't need this definition. */
+#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
+#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
+#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
+
+#else
+/* ... so we use this as its faster. */
+#define IS_TEXT(rn) ( OP(rn)==EXACT )
+#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
+#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
+
+#endif
/*
Search for mandatory following text node; for lookahead, the text must
@@ -2726,6 +2741,19 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
if (locinput == reginfo->ganch)
break;
sayNO;
+
+ case KEEPS:
+ /* update the startpoint */
+ st->u.keeper.val = PL_regstartp[0];
+ PL_reginput = locinput;
+ PL_regstartp[0] = locinput - PL_bostr;
+ PUSH_STATE_GOTO(KEEPS_next, next);
+ /*NOT-REACHED*/
+ case KEEPS_next_fail:
+ /* rollback the start point change */
+ PL_regstartp[0] = st->u.keeper.val;
+ sayNO_SILENT;
+ /*NOT-REACHED*/
case EOL:
goto seol;
case MEOL:
@@ -4292,14 +4320,23 @@ NULL
regnode *text_node = ST.B;
if (! HAS_TEXT(text_node))
FIND_NEXT_IMPT(text_node);
- if (HAS_TEXT(text_node)
- && PL_regkind[OP(text_node)] != REF)
+ /* this used to be
+
+ (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
+
+ But the former is redundant in light of the latter.
+
+ if this changes back then the macro for
+ IS_TEXT and friends need to change.
+ */
+ if (PL_regkind[OP(text_node)] == EXACT)
{
+
ST.c1 = (U8)*STRING(text_node);
ST.c2 =
- (OP(text_node) == EXACTF || OP(text_node) == REFF)
+ (IS_TEXTF(text_node))
? PL_fold[ST.c1]
- : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
+ : (IS_TEXTFL(text_node))
? PL_fold_locale[ST.c1]
: ST.c1;
}
@@ -4427,22 +4464,28 @@ NULL
if (! HAS_TEXT(text_node))
ST.c1 = ST.c2 = CHRTEST_VOID;
else {
- if (PL_regkind[OP(text_node)] == REF) {
+ if ( PL_regkind[OP(text_node)] != EXACT ) {
ST.c1 = ST.c2 = CHRTEST_VOID;
goto assume_ok_easy;
}
else
s = (U8*)STRING(text_node);
-
+
+ /* Currently we only get here when
+
+ PL_rekind[OP(text_node)] == EXACT
+
+ if this changes back then the macro for IS_TEXT and
+ friends need to change. */
if (!UTF) {
ST.c2 = ST.c1 = *s;
- if (OP(text_node) == EXACTF || OP(text_node) == REFF)
+ if (IS_TEXTF(text_node))
ST.c2 = PL_fold[ST.c1];
- else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
+ else if (IS_TEXTFL(text_node))
ST.c2 = PL_fold_locale[ST.c1];
}
else { /* UTF */
- if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
+ if (IS_TEXTF(text_node)) {
STRLEN ulen1, ulen2;
U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
diff --git a/regexp.h b/regexp.h
index d02b3210fa..c28c78e07a 100644
--- a/regexp.h
+++ b/regexp.h
@@ -371,6 +371,10 @@ typedef struct regmatch_state {
SV* mark_name;
char *mark_loc;
} mark;
+
+ struct {
+ int val;
+ } keeper;
} u;
} regmatch_state;
diff --git a/regnodes.h b/regnodes.h
index bbb49db983..8727a01392 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
/* Regops and State definitions */
-#define REGNODE_MAX 83
-#define REGMATCH_STATE_MAX 121
+#define REGNODE_MAX 84
+#define REGMATCH_STATE_MAX 124
#define END 0 /* 0000 End of program. */
#define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */
@@ -91,8 +91,9 @@
#define SKIP 79 /* 0x4f On failure skip forward (to the mark) before retrying */
#define COMMIT 80 /* 0x50 Pattern fails outright if backtracking through this */
#define CUTGROUP 81 /* 0x51 On failure go to the next alternation in the group */
-#define OPTIMIZED 82 /* 0x52 Placeholder for dump. */
-#define PSEUDO 83 /* 0x53 Pseudo opcode for internal use. */
+#define KEEPS 82 /* 0x52 $& begins here. */
+#define OPTIMIZED 83 /* 0x53 Placeholder for dump. */
+#define PSEUDO 84 /* 0x54 Pseudo opcode for internal use. */
/* ------------ States ------------- */
#define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */
#define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */
@@ -132,6 +133,8 @@
#define SKIP_next_fail (REGNODE_MAX + 36) /* state for SKIP */
#define CUTGROUP_next (REGNODE_MAX + 37) /* state for CUTGROUP */
#define CUTGROUP_next_fail (REGNODE_MAX + 38) /* state for CUTGROUP */
+#define KEEPS_next (REGNODE_MAX + 39) /* state for KEEPS */
+#define KEEPS_next_fail (REGNODE_MAX + 40) /* state for KEEPS */
/* PL_regkind[] What type of regop or state is this. */
@@ -206,9 +209,9 @@ EXTCONST U8 PL_regkind[] = {
TRIE, /* AHOCORASICKC */
GOSUB, /* GOSUB */
GOSTART, /* GOSTART */
- NREF, /* NREF */
- NREF, /* NREFF */
- NREF, /* NREFFL */
+ REF, /* NREF */
+ REF, /* NREFF */
+ REF, /* NREFFL */
NGROUPP, /* NGROUPP */
INSUBP, /* INSUBP */
DEFINEP, /* DEFINEP */
@@ -221,6 +224,7 @@ EXTCONST U8 PL_regkind[] = {
VERB, /* SKIP */
VERB, /* COMMIT */
VERB, /* CUTGROUP */
+ KEEPS, /* KEEPS */
NOTHING, /* OPTIMIZED */
PSEUDO, /* PSEUDO */
/* ------------ States ------------- */
@@ -262,6 +266,8 @@ EXTCONST U8 PL_regkind[] = {
SKIP, /* SKIP_next_fail */
CUTGROUP, /* CUTGROUP_next */
CUTGROUP, /* CUTGROUP_next_fail */
+ KEEPS, /* KEEPS_next */
+ KEEPS, /* KEEPS_next_fail */
};
#endif
@@ -351,6 +357,7 @@ static const U8 regarglen[] = {
EXTRA_SIZE(struct regnode_1), /* SKIP */
EXTRA_SIZE(struct regnode_1), /* COMMIT */
EXTRA_SIZE(struct regnode_1), /* CUTGROUP */
+ 0, /* KEEPS */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
@@ -440,6 +447,7 @@ static const char reg_off_by_arg[] = {
0, /* SKIP */
0, /* COMMIT */
0, /* CUTGROUP */
+ 0, /* KEEPS */
0, /* OPTIMIZED */
0, /* PSEUDO */
};
@@ -530,8 +538,9 @@ const char * reg_name[] = {
"SKIP", /* 0x4f */
"COMMIT", /* 0x50 */
"CUTGROUP", /* 0x51 */
- "OPTIMIZED", /* 0x52 */
- "PSEUDO", /* 0x53 */
+ "KEEPS", /* 0x52 */
+ "OPTIMIZED", /* 0x53 */
+ "PSEUDO", /* 0x54 */
/* ------------ States ------------- */
"TRIE_next", /* REGNODE_MAX +0x01 */
"TRIE_next_fail", /* REGNODE_MAX +0x02 */
@@ -571,6 +580,8 @@ const char * reg_name[] = {
"SKIP_next_fail", /* REGNODE_MAX +0x24 */
"CUTGROUP_next", /* REGNODE_MAX +0x25 */
"CUTGROUP_next_fail", /* REGNODE_MAX +0x26 */
+ "KEEPS_next", /* REGNODE_MAX +0x27 */
+ "KEEPS_next_fail", /* REGNODE_MAX +0x28 */
};
#endif /* DEBUGGING */
#else
diff --git a/t/op/pat.t b/t/op/pat.t
index 24aa38a6df..94703c15ca 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -3913,6 +3913,25 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) {
1 while /.(??{'(*PRUNE)'})(?{$count++})(*FAIL)/g;
iseq($count,4,"/.(*PRUNE)/");
}
+{ # Test the \v form of the (*PRUNE) pattern
+ our $count = 0;
+ 'aaab'=~/a+b?(?{$count++})(*FAIL)/;
+ iseq($count,9,"expect 9 for no \\v");
+ $count = 0;
+ 'aaab'=~/a+b?\v(?{$count++})(*FAIL)/;
+ iseq($count,3,"expect 3 with \\v");
+ local $_='aaab';
+ $count=0;
+ 1 while /.\v(?{$count++})(*FAIL)/g;
+ iseq($count,4,"/.\\v/");
+ $count = 0;
+ 'aaab'=~/a+b?(??{'\v'})(?{$count++})(*FAIL)/;
+ iseq($count,3,"expect 3 with \\v");
+ local $_='aaab';
+ $count=0;
+ 1 while /.(??{'\v'})(?{$count++})(*FAIL)/g;
+ iseq($count,4,"/.\\v/");
+}
{ # Test the (*SKIP) pattern
our $count = 0;
'aaab'=~/a+b?(*SKIP)(?{$count++})(*FAIL)/;
@@ -3928,6 +3947,21 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) {
iseq($count,2,"Expect 2 with (*SKIP)" );
iseq("@res","aaab aaab","adjacent (*SKIP) works as expected" );
}
+{ # Test the \V form of the (*SKIP) pattern
+ our $count = 0;
+ 'aaab'=~/a+b?\V(?{$count++})(*FAIL)/;
+ iseq($count,1,"expect 1 with \\V");
+ local $_='aaab';
+ $count=0;
+ 1 while /.\V(?{$count++})(*FAIL)/g;
+ iseq($count,4,"/.\\V/");
+ $_='aaabaaab';
+ $count=0;
+ our @res=();
+ 1 while /(a+b?)\V(?{$count++; push @res,$1})(*FAIL)/g;
+ iseq($count,2,"Expect 2 with \\V" );
+ iseq("@res","aaab aaab","adjacent \\V works as expected" );
+}
{ # Test the (*SKIP) pattern
our $count = 0;
'aaab'=~/a+b?(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/;
@@ -4208,6 +4242,22 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) {
ok(!$REGMARK);
iseq($REGERROR,'foo');
}
+{
+ my $x;
+ $x = "abc.def.ghi.jkl";
+ $x =~ s/.*\K\..*//;
+ ok($x eq "abc.def.ghi");
+
+ $x = "one two three four";
+ $x =~ s/o+ \Kthree//g;
+ ok($x eq "one two four");
+
+ $x = "abcde";
+ $x =~ s/(.)\K/$1/g;
+ ok($x eq "aabbccddee");
+}
+
+
# Test counter is at bottom of file. Put new tests above here.
#-------------------------------------------------------------------
# Keep the following tests last -- they may crash perl
@@ -4257,7 +4307,7 @@ ok($@=~/\QSequence \k... not terminated in regex;\E/);
iseq(0+$::test,$::TestCount,"Got the right number of tests!");
# Don't forget to update this!
BEGIN {
- $::TestCount = 1608;
+ $::TestCount = 1620;
print "1..$::TestCount\n";
}