Add Regexp::Keep \K functionality to regex engine as well as add \v and \V, cleanup and more docs for regatom()

Message-ID: <9b18b3110701101133i46dc5fd0p1476a0f1dd1e9c5a@mail.gmail.com> (plus POD nits by Merijn and myself) p4raw-id: //depot/perl@29756
author: Yves Orton <demerphq@gmail.com> 2007-01-10 21:33:39 +0100
committer: Rafael Garcia-Suarez <rgarciasuarez@gmail.com> 2007-01-11 14:47:01 +0000
commit: ee9b8eaedac8053a01cc9281ada34dd182a8f7d0 (patch)
tree: 129df8e187e17fb664051b0f7f9a1b55d46fcecd
parent: b4390064818aaae08b8f53f740ea62f7dd8517a1 (diff)
download: perl-ee9b8eaedac8053a01cc9281ada34dd182a8f7d0.tar.gz
8 files changed, 311 insertions, 129 deletions
diff --git a/pod/perl595delta.pod b/pod/perl595delta.pod
index 0497d55781..47fbaf51a1 100644
--- a/pod/perl595delta.pod
+++ b/pod/perl595delta.pod
@@ -107,9 +107,9 @@ would expect. This is considered a feature. :-) (Yves Orton)
 
 =item Possessive Quantifiers
 
-Perl now supports the "possessive quantifier" syntax of the "atomic match" 
+Perl now supports the "possessive quantifier" syntax of the "atomic match"
 pattern. Basically a possessive quantifier matches as much as it can and never
-gives any back. Thus it can be used to control backtracking. The syntax is 
+gives any back. Thus it can be used to control backtracking. The syntax is
 similar to non-greedy matching, except instead of using a '?' as the modifier
 the '+' is used. Thus C<?+>, C<*+>, C<++>, C<{min,max}+> are now legal
 quantifiers. (Yves Orton)
@@ -129,6 +129,21 @@ that contain backreferences. (Yves Orton)
 
 =back
 
+=item Regexp::Keep internalized
+
+The functionality of Jeff Pinyan's module Regexp::Keep has been added to
+the core. You can now use in regular expressions the special escape C<\K>
+as a way to do something like floating length positive lookbehind. It is
+also useful in substitutions like:
+
+  s/(foo)bar/$1/g
+
+that can now be converted to
+
+  s/foo\Kbar//g
+
+which is much more efficient.
+
 =head2 The C<_> prototype
 
 A new prototype character has been added. C<_> is equivalent to C<$> (it
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 6c2049628c..7133a02c96 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -255,6 +255,9 @@ X<word> X<whitespace>
     \N{name} Named unicode character, or unicode escape
     \x12     Hexadecimal escape sequence
     \x{1234} Long hexadecimal escape sequence
+    \K       Keep the stuff left of the \K, don't include it in $&
+    \v       Shortcut for (*PRUNE)
+    \V       Shortcut for (*SKIP)
 
 A C<\w> matches a single alphanumeric character (an alphabetic
 character, or a decimal digit) or C<_>, not a whole word.  Use C<\w+>
@@ -690,6 +693,17 @@ is equivalent to the more verbose
 
     /(?:(?s-i)more.*than).*million/i
 
+=item Look-Around Assertions
+X<look-around assertion> X<lookaround assertion> X<look-around> X<lookaround>
+
+Look-around assertions are zero width patterns which match a specific
+pattern without including it in C<$&>. Positive assertions match when
+their subpattern matches, negative assertions match when their subpattern
+fails. Look-behind matches text up to the current match position,
+look-ahead matches text following the current match position.
+
+=over 4
+
 =item C<(?=pattern)>
 X<(?=)> X<look-ahead, positive> X<lookahead, positive>
 
@@ -716,13 +730,30 @@ Sometimes it's still easier just to say:
 
 For look-behind see below.
 
-=item C<(?<=pattern)>
-X<(?<=)> X<look-behind, positive> X<lookbehind, positive>
+=item C<(?<=pattern)> C<\K>
+X<(?<=)> X<look-behind, positive> X<lookbehind, positive> X<\K>
 
 A zero-width positive look-behind assertion.  For example, C</(?<=\t)\w+/>
 matches a word that follows a tab, without including the tab in C<$&>.
 Works only for fixed-width look-behind.
 
+There is a special form of this construct, called C<\K>, which causes the
+regex engine to "keep" everything it had matched prior to the C<\K> and
+not include it in C<$&>. This effectively provides variable length
+look-behind. The use of C<\K> inside of another look-around assertion
+is allowed, but the behaviour is currently not well defined.
+
+For various reasons C<\K> may be signifigantly more efficient than the
+equivalent C<< (?<=...) >> construct, and it is especially useful in
+situations where you want to efficiently remove something following
+something else in a string. For instance
+
+  s/(foo)bar/$1/g;
+
+can be rewritten as the much more efficient
+
+  s/foo\Kbar//g;
+
 =item C<(?<!pattern)>
 X<(?<!)> X<look-behind, negative> X<lookbehind, negative>
 
@@ -730,6 +761,8 @@ A zero-width negative look-behind assertion.  For example C</(?<!bar)foo/>
 matches any occurrence of "foo" that does not follow "bar".  Works
 only for fixed-width look-behind.
 
+=back
+
 =item C<(?'NAME'pattern)>
 
 =item C<< (?<NAME>pattern) >>
@@ -761,7 +794,7 @@ its Unicode extension (see L<utf8>),
 though it isn't extended by the locale (see L<perllocale>).
 
 B<NOTE:> In order to make things easier for programmers with experience
-with the Python or PCRE regex engines the pattern C<< (?P<NAME>pattern) >>
+with the Python or PCRE regex engines the pattern C<< (?PE<lt>NAMEE<gt>pattern) >>
 maybe be used instead of C<< (?<NAME>pattern) >>; however this form does not
 support the use of single quotes as a delimiter for the name. This is
 only available in Perl 5.10 or later.
@@ -1251,7 +1284,7 @@ argument, then C<$REGERROR> and C<$REGMARK> are not touched at all.
 =over 4
 
 =item C<(*PRUNE)> C<(*PRUNE:NAME)>
-X<(*PRUNE)> X<(*PRUNE:NAME)>
+X<(*PRUNE)> X<(*PRUNE:NAME)> X<\v>
 
 This zero-width pattern prunes the backtracking tree at the current point
 when backtracked into on failure. Consider the pattern C<A (*PRUNE) B>,
@@ -1261,6 +1294,8 @@ continues in B, which may also backtrack as necessary; however, should B
 not match, then no further backtracking will take place, and the pattern
 will fail outright at the current starting position.
 
+As a shortcut, X<\v> is exactly equivalent to C<(*PRUNE)>.
+
 The following example counts all the possible matching strings in a
 pattern (without actually matching any of them).
 
@@ -1312,6 +1347,8 @@ of this pattern. This effectively means that the regex engine "skips" forward
 to this position on failure and tries to match again, (assuming that
 there is sufficient room to match).
 
+As a shortcut X<\V> is exactly equivalent to C<(*SKIP)>.
+
 The name of the C<(*SKIP:NAME)> pattern has special significance. If a
 C<(*MARK:NAME)> was encountered while matching, then it is that position
 which is used as the "skip point". If no C<(*MARK)> of that name was
@@ -2008,7 +2045,7 @@ Perl specific syntax, the following are legal in Perl 5.10:
 
 =over 4
 
-=item C<< (?P<NAME>pattern) >>
+=item C<< (?PE<lt>NAMEE<gt>pattern) >>
 
 Define a named capture buffer. Equivalent to C<< (?<NAME>pattern) >>.
 
@@ -2020,7 +2057,7 @@ Backreference to a named capture buffer. Equivalent to C<< \g{NAME} >>.
 
 Subroutine call to a named capture buffer. Equivalent to C<< (?&NAME) >>.
 
-=back 4
+=back
 
 =head1 BUGS
 
diff --git a/regcomp.c b/regcomp.c
index c1c141a0ab..05d2c091b1 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -6223,15 +6223,26 @@ S_reg_recode(pTHX_ const char value, SV **encp)
 
 /*
  - regatom - the lowest level
- *
- * Optimization:  gobbles an entire sequence of ordinary characters so that
- * it can turn them into a single node, which is smaller to store and
- * faster to run.  Backslashed characters are exceptions, each becoming a
- * separate node; the code is simpler that way and it's not worth fixing.
- *
- * [Yes, it is worth fixing, some scripts can run twice the speed.]
- * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
- */
+
+   Try to identify anything special at the start of the pattern. If there
+   is, then handle it as required. This may involve generating a single regop,
+   such as for an assertion; or it may involve recursing, such as to
+   handle a () structure.
+
+   If the string doesn't start with something special then we gobble up
+   as much literal text as we can.
+
+   Once we have been able to handle whatever type of thing started the
+   sequence, we return.
+
+   Note: we have to be careful with escapes, as they can be both literal
+   and special, and in the case of \10 and friends can either, depending
+   on context. Specifically there are two seperate switches for handling
+   escape sequences, with the one for handling literal escapes requiring
+   a dummy entry for all of the special escapes that are actually handled
+   by the other.
+*/
+
 STATIC regnode *
 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 {
@@ -6243,6 +6254,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     DEBUG_PARSE("atom");
     *flagp = WORST;		/* Tentatively. */
 
+
 tryagain:
     switch (*RExC_parse) {
     case '^':
@@ -6329,99 +6341,103 @@ tryagain:
 	vFAIL("Quantifier follows nothing");
 	break;
     case '\\':
+	/* Special Escapes
+
+	   This switch handles escape sequences that resolve to some kind
+	   of special regop and not to literal text. Escape sequnces that
+	   resolve to literal text are handled below in the switch marked
+	   "Literal Escapes".
+
+	   Every entry in this switch *must* have a corresponding entry
+	   in the literal escape switch. However, the opposite is not
+	   required, as the default for this switch is to jump to the
+	   literal text handling code.
+	*/
 	switch (*++RExC_parse) {
+	/* Special Escapes */
 	case 'A':
 	    RExC_seen_zerolen++;
 	    ret = reg_node(pRExC_state, SBOL);
 	    *flagp |= SIMPLE;
-	    nextchar(pRExC_state);
-            Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    goto finish_meta_pat;
 	case 'G':
 	    ret = reg_node(pRExC_state, GPOS);
 	    RExC_seen |= REG_SEEN_GPOS;
 	    *flagp |= SIMPLE;
-	    nextchar(pRExC_state);
-            Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    goto finish_meta_pat;
+	case 'K':
+	    RExC_seen_zerolen++;
+	    ret = reg_node(pRExC_state, KEEPS);
+	    *flagp |= SIMPLE;
+	    goto finish_meta_pat;
 	case 'Z':
 	    ret = reg_node(pRExC_state, SEOL);
 	    *flagp |= SIMPLE;
 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
-	    nextchar(pRExC_state);
-	    break;
+	    goto finish_meta_pat;
 	case 'z':
 	    ret = reg_node(pRExC_state, EOS);
 	    *flagp |= SIMPLE;
 	    RExC_seen_zerolen++;		/* Do not optimize RE away */
-	    nextchar(pRExC_state);
-            Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    goto finish_meta_pat;
 	case 'C':
 	    ret = reg_node(pRExC_state, CANY);
 	    RExC_seen |= REG_SEEN_CANY;
 	    *flagp |= HASWIDTH|SIMPLE;
-	    nextchar(pRExC_state);
-            Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    goto finish_meta_pat;
 	case 'X':
 	    ret = reg_node(pRExC_state, CLUMP);
 	    *flagp |= HASWIDTH;
-	    nextchar(pRExC_state);
-            Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    goto finish_meta_pat;
 	case 'w':
 	    ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
 	    *flagp |= HASWIDTH|SIMPLE;
-	    nextchar(pRExC_state);
-            Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    goto finish_meta_pat;
 	case 'W':
 	    ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
 	    *flagp |= HASWIDTH|SIMPLE;
-	    nextchar(pRExC_state);
-            Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    goto finish_meta_pat;
 	case 'b':
 	    RExC_seen_zerolen++;
 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
 	    ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
 	    *flagp |= SIMPLE;
-	    nextchar(pRExC_state);
-            Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    goto finish_meta_pat;
 	case 'B':
 	    RExC_seen_zerolen++;
 	    RExC_seen |= REG_SEEN_LOOKBEHIND;
 	    ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
 	    *flagp |= SIMPLE;
-	    nextchar(pRExC_state);
-            Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    goto finish_meta_pat;
 	case 's':
 	    ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
 	    *flagp |= HASWIDTH|SIMPLE;
-	    nextchar(pRExC_state);
-            Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    goto finish_meta_pat;
 	case 'S':
 	    ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
 	    *flagp |= HASWIDTH|SIMPLE;
-	    nextchar(pRExC_state);
-            Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    goto finish_meta_pat;
 	case 'd':
 	    ret = reg_node(pRExC_state, DIGIT);
 	    *flagp |= HASWIDTH|SIMPLE;
-	    nextchar(pRExC_state);
-            Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    goto finish_meta_pat;
 	case 'D':
 	    ret = reg_node(pRExC_state, NDIGIT);
 	    *flagp |= HASWIDTH|SIMPLE;
+	    goto finish_meta_pat;
+	case 'v':
+	    ret = reganode(pRExC_state, PRUNE, 0);
+	    ret->flags = 1;
+	    *flagp |= SIMPLE;
+	    goto finish_meta_pat;
+	case 'V':
+	    ret = reganode(pRExC_state, SKIP, 0);
+	    ret->flags = 1;
+	    *flagp |= SIMPLE;
+         finish_meta_pat:	    
 	    nextchar(pRExC_state);
             Set_Node_Length(ret, 2); /* MJD */
-	    break;
+	    break;	    
 	case 'p':
 	case 'P':
 	    {	
@@ -6503,16 +6519,6 @@ tryagain:
             }
             break;
 	}
-	case 'n':
-	case 'r':
-	case 't':
-	case 'f':
-	case 'e':
-	case 'a':
-	case 'x':
-	case 'c':
-	case '0':
-	    goto defchar;
 	case 'g': 
 	case '1': case '2': case '3': case '4':
 	case '5': case '6': case '7': case '8': case '9':
@@ -6629,29 +6635,40 @@ tryagain:
 		case '|':
 		    goto loopdone;
 		case '\\':
+		    /* Literal Escapes Switch
+
+		       This switch is meant to handle escape sequences that
+		       resolve to a literal character.
+
+		       Every escape sequence that represents something
+		       else, like an assertion or a char class, is handled
+		       in the switch marked 'Special Escapes' above in this
+		       routine, but also has an entry here as anything that
+		       isn't explicitly mentioned here will be treated as
+		       an unescaped equivalent literal.
+		    */
+
 		    switch (*++p) {
-		    case 'A':
-		    case 'C':
-		    case 'X':
-		    case 'G':
-		    case 'g':
-		    case 'Z':
-		    case 'z':
-		    case 'w':
-		    case 'W':
-		    case 'b':
-		    case 'B':
-		    case 's':
-		    case 'S':
-		    case 'd':
-		    case 'D':
-		    case 'p':
-		    case 'P':
-                    case 'N':
-                    case 'R':
-                    case 'k':
+		    /* These are all the special escapes. */
+		    case 'A':             /* Start assertion */
+		    case 'b': case 'B':   /* Word-boundary assertion*/
+		    case 'C':             /* Single char !DANGEROUS! */
+		    case 'd': case 'D':   /* digit class */
+		    case 'g': case 'G':   /* generic-backref, pos assertion */
+		    case 'k': case 'K':   /* named backref, keep marker */
+		    case 'N':             /* named char sequence */
+		    case 'p': case 'P':   /* unicode property */
+		    case 's': case 'S':   /* space class */
+		    case 'v': case 'V':   /* (*PRUNE) and (*SKIP) */
+		    case 'w': case 'W':   /* word class */
+		    case 'X':             /* eXtended Unicode "combining character sequence" */
+		    case 'z': case 'Z':   /* End of line/string assertion */
 			--p;
 			goto loopdone;
+
+	            /* Anything after here is an escape that resolves to a
+	               literal. (Except digits, which may or may not)
+	             */
 		    case 'n':
 			ender = '\n';
 			p++;
@@ -8213,26 +8230,27 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
 	Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));	/* Parenth number */
 	if ( prog->paren_names ) {
-	    AV *list= (AV *)progi->data->data[progi->name_list_idx];
-	    SV **name= av_fetch(list, ARG(o), 0 );
-	    if (name)
-	        Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
-        }	    
-    } else if (k == NREF) {
-        if ( prog->paren_names ) {
-            AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
-            SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
-            I32 *nums=(I32*)SvPVX(sv_dat);
-            SV **name= av_fetch(list, nums[0], 0 );
-            I32 n;
-            if (name) {
-                for ( n=0; n<SvIVX(sv_dat); n++ ) {
-                    Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
-				   (n ? "," : ""), (IV)nums[n]);
+            if ( k != REF || OP(o) < NREF) {	    
+	        AV *list= (AV *)progi->data->data[progi->name_list_idx];
+	        SV **name= av_fetch(list, ARG(o), 0 );
+	        if (name)
+	            Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
+            }	    
+            else {
+                AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
+                SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
+                I32 *nums=(I32*)SvPVX(sv_dat);
+                SV **name= av_fetch(list, nums[0], 0 );
+                I32 n;
+                if (name) {
+                    for ( n=0; n<SvIVX(sv_dat); n++ ) {
+                        Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
+			   	    (n ? "," : ""), (IV)nums[n]);
+                    }
+                    Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
                 }
-                Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
             }
-        }
+        }            
     } else if (k == GOSUB) 
 	Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));	/* Paren and offset */
     else if (k == VERB) {
diff --git a/regcomp.sym b/regcomp.sym
index 656988e10b..c57a386af7 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -158,9 +158,9 @@ GOSUB		GOSUB,     num/ofs 2L	recurse to paren arg1 at (signed) ofs arg2
 GOSTART		GOSTART,   no   	recurse to start of pattern
 
 #*Named references (67..69)
-NREF		NREF,      no-sv 1	Match some already matched string
-NREFF		NREF,      no-sv 1	Match already matched string, folded
-NREFFL		NREF,      no-sv 1	Match already matched string, folded in loc.
+NREF		REF,       no-sv 1	Match some already matched string
+NREFF		REF,       no-sv 1	Match already matched string, folded
+NREFFL		REF,       no-sv 1	Match already matched string, folded in loc.
 
 
 #*Special conditionals  (70..72)
@@ -182,6 +182,9 @@ SKIP		VERB,      no-sv 1	On failure skip forward (to the mark) before retrying
 COMMIT		VERB,      no-sv 1	Pattern fails outright if backtracking through this
 CUTGROUP	VERB,      no-sv 1	On failure go to the next alternation in the group
 
+#*Control what to keep in $&.
+KEEPS		KEEPS,  no	$& begins here.
+
 # NEW STUFF ABOVE THIS LINE -- Please update counts below. 
 
 ################################################################################
@@ -221,3 +224,4 @@ COMMIT		next:FAIL
 MARKPOINT	next:FAIL
 SKIP		next:FAIL
 CUTGROUP	next:FAIL
+KEEPS		next:FAIL
diff --git a/regexec.c b/regexec.c
index cae3244936..c475b9a55c 100644
--- a/regexec.c
+++ b/regexec.c
@@ -126,12 +126,27 @@
     OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
     OP(rn) == PLUS || OP(rn) == MINMOD || \
+    OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 )
+#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 
-#define HAS_TEXT(rn) ( \
-    PL_regkind[OP(rn)] == EXACT || PL_regkind[OP(rn)] == REF \
-)
+#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
+
+#if 0 
+/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
+   we don't need this definition. */
+#define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
+#define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
+#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
+
+#else
+/* ... so we use this as its faster. */
+#define IS_TEXT(rn)   ( OP(rn)==EXACT   )
+#define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
+#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
+
+#endif
 
 /*
   Search for mandatory following text node; for lookahead, the text must
@@ -2726,6 +2741,19 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
 	    if (locinput == reginfo->ganch)
 		break;
 	    sayNO;
+
+	case KEEPS:
+	    /* update the startpoint */
+	    st->u.keeper.val = PL_regstartp[0];
+	    PL_reginput = locinput;
+	    PL_regstartp[0] = locinput - PL_bostr;
+	    PUSH_STATE_GOTO(KEEPS_next, next);
+	    /*NOT-REACHED*/
+	case KEEPS_next_fail:
+	    /* rollback the start point change */
+	    PL_regstartp[0] = st->u.keeper.val;
+	    sayNO_SILENT;
+	    /*NOT-REACHED*/
 	case EOL:
 		goto seol;
 	case MEOL:
@@ -4292,14 +4320,23 @@ NULL
 		    regnode *text_node = ST.B;
 		    if (! HAS_TEXT(text_node))
 			FIND_NEXT_IMPT(text_node);
-		    if (HAS_TEXT(text_node)
-			&& PL_regkind[OP(text_node)] != REF)
+	            /* this used to be 
+	                
+	                (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
+	                
+	            	But the former is redundant in light of the latter.
+	            	
+	            	if this changes back then the macro for 
+	            	IS_TEXT and friends need to change.
+	             */
+		    if (PL_regkind[OP(text_node)] == EXACT)
 		    {
+		        
 			ST.c1 = (U8)*STRING(text_node);
 			ST.c2 =
-			    (OP(text_node) == EXACTF || OP(text_node) == REFF)
+			    (IS_TEXTF(text_node))
 			    ? PL_fold[ST.c1]
-			    : (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
+			    : (IS_TEXTFL(text_node))
 				? PL_fold_locale[ST.c1]
 				: ST.c1;
 		    }
@@ -4427,22 +4464,28 @@ NULL
 		if (! HAS_TEXT(text_node))
 		    ST.c1 = ST.c2 = CHRTEST_VOID;
 		else {
-		    if (PL_regkind[OP(text_node)] == REF) {
+		    if ( PL_regkind[OP(text_node)] != EXACT ) {
 			ST.c1 = ST.c2 = CHRTEST_VOID;
 			goto assume_ok_easy;
 		    }
 		    else
 			s = (U8*)STRING(text_node);
-
+                    
+                    /*  Currently we only get here when 
+                        
+                        PL_rekind[OP(text_node)] == EXACT
+                    
+                        if this changes back then the macro for IS_TEXT and 
+                        friends need to change. */
 		    if (!UTF) {
 			ST.c2 = ST.c1 = *s;
-			if (OP(text_node) == EXACTF || OP(text_node) == REFF)
+			if (IS_TEXTF(text_node))
 			    ST.c2 = PL_fold[ST.c1];
-			else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
+			else if (IS_TEXTFL(text_node))
 			    ST.c2 = PL_fold_locale[ST.c1];
 		    }
 		    else { /* UTF */
-			if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
+			if (IS_TEXTF(text_node)) {
 			     STRLEN ulen1, ulen2;
 			     U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
 			     U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
diff --git a/regexp.h b/regexp.h
index d02b3210fa..c28c78e07a 100644
--- a/regexp.h
+++ b/regexp.h
@@ -371,6 +371,10 @@ typedef struct regmatch_state {
 	    SV* mark_name;
 	    char *mark_loc;
 	} mark;
+	
+	struct {
+	    int val;
+	} keeper;
     } u;
 } regmatch_state;
 
diff --git a/regnodes.h b/regnodes.h
index bbb49db983..8727a01392 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -6,8 +6,8 @@
 
 /* Regops and State definitions */
 
-#define REGNODE_MAX           	83
-#define REGMATCH_STATE_MAX    	121
+#define REGNODE_MAX           	84
+#define REGMATCH_STATE_MAX    	124
 
 #define	END                   	0	/* 0000 End of program. */
 #define	SUCCEED               	1	/* 0x01 Return from a subroutine, basically. */
@@ -91,8 +91,9 @@
 #define	SKIP                  	79	/* 0x4f On failure skip forward (to the mark) before retrying */
 #define	COMMIT                	80	/* 0x50 Pattern fails outright if backtracking through this */
 #define	CUTGROUP              	81	/* 0x51 On failure go to the next alternation in the group */
-#define	OPTIMIZED             	82	/* 0x52 Placeholder for dump. */
-#define	PSEUDO                	83	/* 0x53 Pseudo opcode for internal use. */
+#define	KEEPS                 	82	/* 0x52 $& begins here. */
+#define	OPTIMIZED             	83	/* 0x53 Placeholder for dump. */
+#define	PSEUDO                	84	/* 0x54 Pseudo opcode for internal use. */
 	/* ------------ States ------------- */
 #define	TRIE_next             	(REGNODE_MAX + 1)	/* state for TRIE */
 #define	TRIE_next_fail        	(REGNODE_MAX + 2)	/* state for TRIE */
@@ -132,6 +133,8 @@
 #define	SKIP_next_fail        	(REGNODE_MAX + 36)	/* state for SKIP */
 #define	CUTGROUP_next         	(REGNODE_MAX + 37)	/* state for CUTGROUP */
 #define	CUTGROUP_next_fail    	(REGNODE_MAX + 38)	/* state for CUTGROUP */
+#define	KEEPS_next            	(REGNODE_MAX + 39)	/* state for KEEPS */
+#define	KEEPS_next_fail       	(REGNODE_MAX + 40)	/* state for KEEPS */
 
 /* PL_regkind[] What type of regop or state is this. */
 
@@ -206,9 +209,9 @@ EXTCONST U8 PL_regkind[] = {
 	TRIE,     	/* AHOCORASICKC           */
 	GOSUB,    	/* GOSUB                  */
 	GOSTART,  	/* GOSTART                */
-	NREF,     	/* NREF                   */
-	NREF,     	/* NREFF                  */
-	NREF,     	/* NREFFL                 */
+	REF,      	/* NREF                   */
+	REF,      	/* NREFF                  */
+	REF,      	/* NREFFL                 */
 	NGROUPP,  	/* NGROUPP                */
 	INSUBP,   	/* INSUBP                 */
 	DEFINEP,  	/* DEFINEP                */
@@ -221,6 +224,7 @@ EXTCONST U8 PL_regkind[] = {
 	VERB,     	/* SKIP                   */
 	VERB,     	/* COMMIT                 */
 	VERB,     	/* CUTGROUP               */
+	KEEPS,    	/* KEEPS                  */
 	NOTHING,  	/* OPTIMIZED              */
 	PSEUDO,   	/* PSEUDO                 */
 	/* ------------ States ------------- */
@@ -262,6 +266,8 @@ EXTCONST U8 PL_regkind[] = {
 	SKIP,     	/* SKIP_next_fail         */
 	CUTGROUP, 	/* CUTGROUP_next          */
 	CUTGROUP, 	/* CUTGROUP_next_fail     */
+	KEEPS,    	/* KEEPS_next             */
+	KEEPS,    	/* KEEPS_next_fail        */
 };
 #endif
 
@@ -351,6 +357,7 @@ static const U8 regarglen[] = {
 	EXTRA_SIZE(struct regnode_1),        	/* SKIP         */
 	EXTRA_SIZE(struct regnode_1),        	/* COMMIT       */
 	EXTRA_SIZE(struct regnode_1),        	/* CUTGROUP     */
+	0,                                   	/* KEEPS        */
 	0,                                   	/* OPTIMIZED    */
 	0,                                   	/* PSEUDO       */
 };
@@ -440,6 +447,7 @@ static const char reg_off_by_arg[] = {
 	0,	/* SKIP         */
 	0,	/* COMMIT       */
 	0,	/* CUTGROUP     */
+	0,	/* KEEPS        */
 	0,	/* OPTIMIZED    */
 	0,	/* PSEUDO       */
 };
@@ -530,8 +538,9 @@ const char * reg_name[] = {
 	"SKIP",                  	/* 0x4f */
 	"COMMIT",                	/* 0x50 */
 	"CUTGROUP",              	/* 0x51 */
-	"OPTIMIZED",             	/* 0x52 */
-	"PSEUDO",                	/* 0x53 */
+	"KEEPS",                 	/* 0x52 */
+	"OPTIMIZED",             	/* 0x53 */
+	"PSEUDO",                	/* 0x54 */
 	/* ------------ States ------------- */
 	"TRIE_next",             	/* REGNODE_MAX +0x01 */
 	"TRIE_next_fail",        	/* REGNODE_MAX +0x02 */
@@ -571,6 +580,8 @@ const char * reg_name[] = {
 	"SKIP_next_fail",        	/* REGNODE_MAX +0x24 */
 	"CUTGROUP_next",         	/* REGNODE_MAX +0x25 */
 	"CUTGROUP_next_fail",    	/* REGNODE_MAX +0x26 */
+	"KEEPS_next",            	/* REGNODE_MAX +0x27 */
+	"KEEPS_next_fail",       	/* REGNODE_MAX +0x28 */
 };
 #endif /* DEBUGGING */
 #else
diff --git a/t/op/pat.t b/t/op/pat.t
index 24aa38a6df..94703c15ca 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -3913,6 +3913,25 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) {
     1 while /.(??{'(*PRUNE)'})(?{$count++})(*FAIL)/g;
     iseq($count,4,"/.(*PRUNE)/");
 }
+{   # Test the \v form of the (*PRUNE) pattern
+    our $count = 0;
+    'aaab'=~/a+b?(?{$count++})(*FAIL)/;
+    iseq($count,9,"expect 9 for no \\v");
+    $count = 0;
+    'aaab'=~/a+b?\v(?{$count++})(*FAIL)/;
+    iseq($count,3,"expect 3 with \\v");
+    local $_='aaab';
+    $count=0;
+    1 while /.\v(?{$count++})(*FAIL)/g;
+    iseq($count,4,"/.\\v/");
+    $count = 0;
+    'aaab'=~/a+b?(??{'\v'})(?{$count++})(*FAIL)/;
+    iseq($count,3,"expect 3 with \\v");
+    local $_='aaab';
+    $count=0;
+    1 while /.(??{'\v'})(?{$count++})(*FAIL)/g;
+    iseq($count,4,"/.\\v/");
+}
 {   # Test the (*SKIP) pattern
     our $count = 0;
     'aaab'=~/a+b?(*SKIP)(?{$count++})(*FAIL)/;
@@ -3928,6 +3947,21 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) {
     iseq($count,2,"Expect 2 with (*SKIP)" );
     iseq("@res","aaab aaab","adjacent (*SKIP) works as expected" );
 }
+{   # Test the \V form of the (*SKIP) pattern
+    our $count = 0;
+    'aaab'=~/a+b?\V(?{$count++})(*FAIL)/;
+    iseq($count,1,"expect 1 with \\V");
+    local $_='aaab';
+    $count=0;
+    1 while /.\V(?{$count++})(*FAIL)/g;
+    iseq($count,4,"/.\\V/");
+    $_='aaabaaab';
+    $count=0;
+    our @res=();
+    1 while /(a+b?)\V(?{$count++; push @res,$1})(*FAIL)/g;
+    iseq($count,2,"Expect 2 with \\V" );
+    iseq("@res","aaab aaab","adjacent \\V works as expected" );
+}
 {   # Test the (*SKIP) pattern
     our $count = 0;
     'aaab'=~/a+b?(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/;
@@ -4208,6 +4242,22 @@ for my $c ("z", "\0", "!", chr(254), chr(256)) {
     ok(!$REGMARK);
     iseq($REGERROR,'foo');
 }
+{
+    my $x;
+    $x = "abc.def.ghi.jkl";
+    $x =~ s/.*\K\..*//;
+    ok($x eq "abc.def.ghi");
+    
+    $x = "one two three four";
+    $x =~ s/o+ \Kthree//g;
+    ok($x eq "one two  four");
+    
+    $x = "abcde";
+    $x =~ s/(.)\K/$1/g;
+    ok($x eq "aabbccddee");
+}
+
+
 # Test counter is at bottom of file. Put new tests above here.
 #-------------------------------------------------------------------
 # Keep the following tests last -- they may crash perl
@@ -4257,7 +4307,7 @@ ok($@=~/\QSequence \k... not terminated in regex;\E/);
 iseq(0+$::test,$::TestCount,"Got the right number of tests!");
 # Don't forget to update this!
 BEGIN {
-    $::TestCount = 1608;
+    $::TestCount = 1620;
     print "1..$::TestCount\n";
 }
author	Yves Orton <demerphq@gmail.com>	2007-01-10 21:33:39 +0100
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>	2007-01-11 14:47:01 +0000
commit	ee9b8eaedac8053a01cc9281ada34dd182a8f7d0 (patch)
tree	129df8e187e17fb664051b0f7f9a1b55d46fcecd
parent	b4390064818aaae08b8f53f740ea62f7dd8517a1 (diff)
download	perl-ee9b8eaedac8053a01cc9281ada34dd182a8f7d0.tar.gz