diff options
-rw-r--r-- | MANIFEST | 2 | ||||
-rw-r--r-- | ext/re/t/re_funcs.t | 6 | ||||
-rw-r--r-- | pod/perlop.pod | 390 | ||||
-rw-r--r-- | pod/perlre.pod | 28 | ||||
-rw-r--r-- | pod/perlvar.pod | 6 | ||||
-rw-r--r-- | regexp.h | 8 | ||||
-rw-r--r-- | t/op/regexp_pmod.t (renamed from t/op/regexp_kmod.t) | 14 |
7 files changed, 232 insertions, 222 deletions
@@ -3591,7 +3591,7 @@ t/op/readline.t See if <> / readline / rcatline work t/op/read.t See if read() works t/op/recurse.t See if deep recursion works t/op/ref.t See if refs and objects work -t/op/regexp_kmod.t See if regexp /k modifier works as expected +t/op/regexp_pmod.t See if regexp /p modifier works as expected t/op/regexp_noamp.t See if regular expressions work with optimizations t/op/regexp_notrie.t See if regular expressions work without trie optimisation t/op/regexp_qr_embed.t See if regular expressions work with embedded qr// diff --git a/ext/re/t/re_funcs.t b/ext/re/t/re_funcs.t index bf8202aa44..6ac33d65cc 100644 --- a/ext/re/t/re_funcs.t +++ b/ext/re/t/re_funcs.t @@ -17,12 +17,12 @@ use re qw(is_regexp regexp_pattern regmust regname regnames regnames_count regnames_iterinit regnames_iternext); { - my $qr=qr/foo/ki; + my $qr=qr/foo/pi; ok(is_regexp($qr),'is_regexp($qr)'); ok(!is_regexp(''),'is_regexp("")'); is((regexp_pattern($qr))[0],'foo','regexp_pattern[0]'); - is((regexp_pattern($qr))[1],'ik','regexp_pattern[1]'); - is(regexp_pattern($qr),'(?ki-xsm:foo)','scalar regexp_pattern'); + is((regexp_pattern($qr))[1],'ip','regexp_pattern[1]'); + is(regexp_pattern($qr),'(?pi-xsm:foo)','scalar regexp_pattern'); ok(!regexp_pattern(''),'!regexp_pattern("")'); } { diff --git a/pod/perlop.pod b/pod/perlop.pod index 18851f04da..52ecddc4e8 100644 --- a/pod/perlop.pod +++ b/pod/perlop.pod @@ -1049,33 +1049,77 @@ matching and related activities. =over 8 -=item ?PATTERN? -X<?> +=item qr/STRING/msixpo +X<qr> X</i> X</m> X</o> X</s> X</x> -This is just like the C</pattern/> search, except that it matches only -once between calls to the reset() operator. This is a useful -optimization when you want to see only the first occurrence of -something in each file of a set of files, for instance. Only C<??> -patterns local to the current package are reset. +This operator quotes (and possibly compiles) its I<STRING> as a regular +expression. 
I<STRING> is interpolated the same way as I<PATTERN> +in C<m/PATTERN/>. If "'" is used as the delimiter, no interpolation +is done. Returns a Perl value which may be used instead of the +corresponding C</STRING/msixpo> expression. - while (<>) { - if (?^$?) { - # blank line between header and body - } - } continue { - reset if eof; # clear ?? status for next file +For example, + + $rex = qr/my.STRING/is; + s/$rex/foo/; + +is equivalent to + + s/my.STRING/foo/is; + +The result may be used as a subpattern in a match: + + $re = qr/$pattern/; + $string =~ /foo${re}bar/; # can be interpolated in other patterns + $string =~ $re; # or used standalone + $string =~ /$re/; # or this way + +Since Perl may compile the pattern at the moment of execution of qr() +operator, using qr() may have speed advantages in some situations, +notably if the result of qr() is used standalone: + + sub match { + my $patterns = shift; + my @compiled = map qr/$_/i, @$patterns; + grep { + my $success = 0; + foreach my $pat (@compiled) { + $success = 1, last if /$pat/; + } + $success; + } @_; } -This usage is vaguely deprecated, which means it just might possibly -be removed in some distant future version of Perl, perhaps somewhere -around the year 2168. +Precompilation of the pattern into an internal representation at +the moment of qr() avoids a need to recompile the pattern every +time a match C</$pat/> is attempted. (Perl has many other internal +optimizations, but none would be triggered in the above example if +we did not use qr() operator.) + +Options are: + + m Treat string as multiple lines. + s Treat string as single line. (Make . match a newline) + i Do case-insensitive pattern matching. + x Use extended regular expressions. + p When matching preserve a copy of the matched string so + that ${^PREMATCH}, ${^MATCH}, ${^POSTMATCH} will be defined. + o Compile pattern only once. 
+ +If a precompiled pattern is embedded in a larger pattern then the effect +of 'msixp' will be propagated appropriately. The effect of the 'o' +modifier is not propagated, being restricted to those patterns +explicitly using it. + +See L<perlre> for additional information on valid syntax for STRING, and +for a detailed look at the semantics of regular expressions. -=item m/PATTERN/cgimosxk +=item m/PATTERN/msixpogc X<m> X<operator, match> X<regexp, options> X<regexp> X<regex, options> X<regex> X</c> X</i> X</m> X</o> X</s> X</x> -=item /PATTERN/cgimosxk +=item /PATTERN/msixpogc Searches a string for a pattern match, and in scalar context returns true if it succeeds, false if it fails. If no string is specified @@ -1086,17 +1130,11 @@ rather tightly.) See also L<perlre>. See L<perllocale> for discussion of additional considerations that apply when C<use locale> is in effect. -Options are: +Options are as described in qr// in addition to the following match +process modifiers - i Do case-insensitive pattern matching. - m Treat string as multiple lines. - s Treat string as single line. - x Use extended regular expressions. g Match globally, i.e., find all occurrences. c Do not reset search position on a failed match when /g is in effect. - o Compile pattern only once. - k Keep a copy of the matched string so that ${^MATCH} and friends - will be defined. If "/" is the delimiter then the initial C<m> is optional. With the C<m> you can use any pair of non-alphanumeric, non-whitespace characters @@ -1256,6 +1294,137 @@ Here is the output (split into several lines): lowercase lowercase line-noise lowercase lowercase line-noise MiXeD line-noise. That's all! +=item ?PATTERN? +X<?> + +This is just like the C</pattern/> search, except that it matches only +once between calls to the reset() operator. This is a useful +optimization when you want to see only the first occurrence of +something in each file of a set of files, for instance. 
Only C<??> +patterns local to the current package are reset. + + while (<>) { + if (?^$?) { + # blank line between header and body + } + } continue { + reset if eof; # clear ?? status for next file + } + +This usage is vaguely deprecated, which means it just might possibly +be removed in some distant future version of Perl, perhaps somewhere +around the year 2168. + +=item s/PATTERN/REPLACEMENT/msixpogce +X<substitute> X<substitution> X<replace> X<regexp, replace> +X<regexp, substitute> X</e> X</g> X</i> X</m> X</o> X</s> X</x> + +Searches a string for a pattern, and if found, replaces that pattern +with the replacement text and returns the number of substitutions +made. Otherwise it returns false (specifically, the empty string). + +If no string is specified via the C<=~> or C<!~> operator, the C<$_> +variable is searched and modified. (The string specified with C<=~> must +be scalar variable, an array element, a hash element, or an assignment +to one of those, i.e., an lvalue.) + +If the delimiter chosen is a single quote, no interpolation is +done on either the PATTERN or the REPLACEMENT. Otherwise, if the +PATTERN contains a $ that looks like a variable rather than an +end-of-string test, the variable will be interpolated into the pattern +at run-time. If you want the pattern compiled only once the first time +the variable is interpolated, use the C</o> option. If the pattern +evaluates to the empty string, the last successfully executed regular +expression is used instead. See L<perlre> for further explanation on these. +See L<perllocale> for discussion of additional considerations that apply +when C<use locale> is in effect. + +Options are as with m// with the addition of the following replacement +specific options: + + e Evaluate the right side as an expression. + ee Evaluate the right side as a string then eval the result + +Any non-alphanumeric, non-whitespace delimiter may replace the +slashes. 
If single quotes are used, no interpretation is done on the +replacement string (the C</e> modifier overrides this, however). Unlike +Perl 4, Perl 5 treats backticks as normal delimiters; the replacement +text is not evaluated as a command. If the +PATTERN is delimited by bracketing quotes, the REPLACEMENT has its own +pair of quotes, which may or may not be bracketing quotes, e.g., +C<s(foo)(bar)> or C<< s<foo>/bar/ >>. A C</e> will cause the +replacement portion to be treated as a full-fledged Perl expression +and evaluated right then and there. It is, however, syntax checked at +compile-time. A second C<e> modifier will cause the replacement portion +to be C<eval>ed before being run as a Perl expression. + +Examples: + + s/\bgreen\b/mauve/g; # don't change wintergreen + + $path =~ s|/usr/bin|/usr/local/bin|; + + s/Login: $foo/Login: $bar/; # run-time pattern + + ($foo = $bar) =~ s/this/that/; # copy first, then change + + $count = ($paragraph =~ s/Mister\b/Mr./g); # get change-count + + $_ = 'abc123xyz'; + s/\d+/$&*2/e; # yields 'abc246xyz' + s/\d+/sprintf("%5d",$&)/e; # yields 'abc 246xyz' + s/\w/$& x 2/eg; # yields 'aabbcc 224466xxyyzz' + + s/%(.)/$percent{$1}/g; # change percent escapes; no /e + s/%(.)/$percent{$1} || $&/ge; # expr now, so /e + s/^=(\w+)/pod($1)/ge; # use function call + + # expand variables in $_, but dynamics only, using + # symbolic dereferencing + s/\$(\w+)/${$1}/g; + + # Add one to the value of any numbers in the string + s/(\d+)/1 + $1/eg; + + # This will expand any embedded scalar variable + # (including lexicals) in $_ : First $1 is interpolated + # to the variable name, and then evaluated + s/(\$\w+)/$1/eeg; + + # Delete (most) C comments. + $program =~ s { + /\* # Match the opening delimiter. + .*? # Match a minimal number of characters. + \*/ # Match the closing delimiter. 
+ } []gsx; + + s/^\s*(.*?)\s*$/$1/; # trim whitespace in $_, expensively + + for ($variable) { # trim whitespace in $variable, cheap + s/^\s+//; + s/\s+$//; + } + + s/([^ ]*) *([^ ]*)/$2 $1/; # reverse 1st two fields + +Note the use of $ instead of \ in the last example. Unlike +B<sed>, we use the \<I<digit>> form in only the left hand side. +Anywhere else it's $<I<digit>>. + +Occasionally, you can't use just a C</g> to get all the changes +to occur that you might want. Here are two common cases: + + # put commas in the right places in an integer + 1 while s/(\d)(\d\d\d)(?!\d)/$1,$2/g; + + # expand tabs to 8-column spacing + 1 while s/\t+/' ' x (length($&)*8 - length($`)%8)/e; + +=back + +=head2 Quote-Like Operators +X<operator, quote-like> + =item q/STRING/ X<q> X<quote, single> X<'> X<''> @@ -1281,64 +1450,6 @@ A double-quoted, interpolated string. if /\b(tcl|java|python)\b/i; # :-) $baz = "\n"; # a one-character string -=item qr/STRING/imosx -X<qr> X</i> X</m> X</o> X</s> X</x> - -This operator quotes (and possibly compiles) its I<STRING> as a regular -expression. I<STRING> is interpolated the same way as I<PATTERN> -in C<m/PATTERN/>. If "'" is used as the delimiter, no interpolation -is done. Returns a Perl value which may be used instead of the -corresponding C</STRING/imosx> expression. 
- -For example, - - $rex = qr/my.STRING/is; - s/$rex/foo/; - -is equivalent to - - s/my.STRING/foo/is; - -The result may be used as a subpattern in a match: - - $re = qr/$pattern/; - $string =~ /foo${re}bar/; # can be interpolated in other patterns - $string =~ $re; # or used standalone - $string =~ /$re/; # or this way - -Since Perl may compile the pattern at the moment of execution of qr() -operator, using qr() may have speed advantages in some situations, -notably if the result of qr() is used standalone: - - sub match { - my $patterns = shift; - my @compiled = map qr/$_/i, @$patterns; - grep { - my $success = 0; - foreach my $pat (@compiled) { - $success = 1, last if /$pat/; - } - $success; - } @_; - } - -Precompilation of the pattern into an internal representation at -the moment of qr() avoids a need to recompile the pattern every -time a match C</$pat/> is attempted. (Perl has many other internal -optimizations, but none would be triggered in the above example if -we did not use qr() operator.) - -Options are: - - i Do case-insensitive pattern matching. - m Treat string as multiple lines. - o Compile pattern only once. - s Treat string as single line. - x Use extended regular expressions. - -See L<perlre> for additional information on valid syntax for STRING, and -for a detailed look at the semantics of regular expressions. - =item qx/STRING/ X<qx> X<`> X<``> X<backtick> @@ -1459,117 +1570,6 @@ put comments into a multi-line C<qw>-string. For this reason, the C<use warnings> pragma and the B<-w> switch (that is, the C<$^W> variable) produces warnings if the STRING contains the "," or the "#" character. -=item s/PATTERN/REPLACEMENT/egimosxk -X<substitute> X<substitution> X<replace> X<regexp, replace> -X<regexp, substitute> X</e> X</g> X</i> X</m> X</o> X</s> X</x> - -Searches a string for a pattern, and if found, replaces that pattern -with the replacement text and returns the number of substitutions -made. 
Otherwise it returns false (specifically, the empty string). - -If no string is specified via the C<=~> or C<!~> operator, the C<$_> -variable is searched and modified. (The string specified with C<=~> must -be scalar variable, an array element, a hash element, or an assignment -to one of those, i.e., an lvalue.) - -If the delimiter chosen is a single quote, no interpolation is -done on either the PATTERN or the REPLACEMENT. Otherwise, if the -PATTERN contains a $ that looks like a variable rather than an -end-of-string test, the variable will be interpolated into the pattern -at run-time. If you want the pattern compiled only once the first time -the variable is interpolated, use the C</o> option. If the pattern -evaluates to the empty string, the last successfully executed regular -expression is used instead. See L<perlre> for further explanation on these. -See L<perllocale> for discussion of additional considerations that apply -when C<use locale> is in effect. - -Options are: - - i Do case-insensitive pattern matching. - m Treat string as multiple lines. - s Treat string as single line. - x Use extended regular expressions. - g Replace globally, i.e., all occurrences. - o Compile pattern only once. - k Keep a copy of the original string so ${^MATCH} and friends - will be defined. - e Evaluate the right side as an expression. - - -Any non-alphanumeric, non-whitespace delimiter may replace the -slashes. If single quotes are used, no interpretation is done on the -replacement string (the C</e> modifier overrides this, however). Unlike -Perl 4, Perl 5 treats backticks as normal delimiters; the replacement -text is not evaluated as a command. If the -PATTERN is delimited by bracketing quotes, the REPLACEMENT has its own -pair of quotes, which may or may not be bracketing quotes, e.g., -C<s(foo)(bar)> or C<< s<foo>/bar/ >>. A C</e> will cause the -replacement portion to be treated as a full-fledged Perl expression -and evaluated right then and there. 
It is, however, syntax checked at -compile-time. A second C<e> modifier will cause the replacement portion -to be C<eval>ed before being run as a Perl expression. - -Examples: - - s/\bgreen\b/mauve/g; # don't change wintergreen - - $path =~ s|/usr/bin|/usr/local/bin|; - - s/Login: $foo/Login: $bar/; # run-time pattern - - ($foo = $bar) =~ s/this/that/; # copy first, then change - - $count = ($paragraph =~ s/Mister\b/Mr./g); # get change-count - - $_ = 'abc123xyz'; - s/\d+/$&*2/e; # yields 'abc246xyz' - s/\d+/sprintf("%5d",$&)/e; # yields 'abc 246xyz' - s/\w/$& x 2/eg; # yields 'aabbcc 224466xxyyzz' - - s/%(.)/$percent{$1}/g; # change percent escapes; no /e - s/%(.)/$percent{$1} || $&/ge; # expr now, so /e - s/^=(\w+)/pod($1)/ge; # use function call - - # expand variables in $_, but dynamics only, using - # symbolic dereferencing - s/\$(\w+)/${$1}/g; - - # Add one to the value of any numbers in the string - s/(\d+)/1 + $1/eg; - - # This will expand any embedded scalar variable - # (including lexicals) in $_ : First $1 is interpolated - # to the variable name, and then evaluated - s/(\$\w+)/$1/eeg; - - # Delete (most) C comments. - $program =~ s { - /\* # Match the opening delimiter. - .*? # Match a minimal number of characters. - \*/ # Match the closing delimiter. - } []gsx; - - s/^\s*(.*?)\s*$/$1/; # trim whitespace in $_, expensively - - for ($variable) { # trim whitespace in $variable, cheap - s/^\s+//; - s/\s+$//; - } - - s/([^ ]*) *([^ ]*)/$2 $1/; # reverse 1st two fields - -Note the use of $ instead of \ in the last example. Unlike -B<sed>, we use the \<I<digit>> form in only the left hand side. -Anywhere else it's $<I<digit>>. - -Occasionally, you can't use just a C</g> to get all the changes -to occur that you might want. 
Here are two common cases: - - # put commas in the right places in an integer - 1 while s/(\d)(\d\d\d)(?!\d)/$1,$2/g; - - # expand tabs to 8-column spacing - 1 while s/\t+/' ' x (length($&)*8 - length($`)%8)/e; =item tr/SEARCHLIST/REPLACEMENTLIST/cds X<tr> X<y> X<transliterate> X</c> X</d> X</s> diff --git a/pod/perlre.pod b/pod/perlre.pod index aa861ae46e..99cba6889e 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -27,15 +27,6 @@ L<perlop/"Gory details of parsing quoted constructs">. =over 4 -=item i -X</i> X<regex, case-insensitive> X<regexp, case-insensitive> -X<regular expression, case-insensitive> - -Do case-insensitive pattern matching. - -If C<use locale> is in effect, the case map is taken from the current -locale. See L<perllocale>. - =item m X</m> X<regex, multiline> X<regexp, multiline> X<regular expression, multiline> @@ -54,11 +45,26 @@ Used together, as /ms, they let the "." match any character whatsoever, while still allowing "^" and "$" to match, respectively, just after and just before newlines within the string. +=item i +X</i> X<regex, case-insensitive> X<regexp, case-insensitive> +X<regular expression, case-insensitive> + +Do case-insensitive pattern matching. + +If C<use locale> is in effect, the case map is taken from the current +locale. See L<perllocale>. + =item x X</x> Extend your pattern's legibility by permitting whitespace and comments. +=item p +X</p> X<regex, preserve> X<regexp, preserve> + +Preserve the string matched such that ${^PREMATCH}, ${^MATCH}, and +${^POSTMATCH} are available for use after matching. + =back These are usually written as "the C</x> modifier", even though the delimiter @@ -593,11 +599,11 @@ X<$&> X<$`> X<$'> As a workaround for this problem, Perl 5.10 introduces C<${^PREMATCH}>, C<${^MATCH}> and C<${^POSTMATCH}>, which are equivalent to C<$`>, C<$&> and C<$'>, B<except> that they are only guaranteed to be defined after a -successful match that was executed with the C</k> (keep-copy) modifier. 
+successful match that was executed with the C</p> (preserve) modifier. The use of these variables incurs no global performance penalty, unlike their punctuation char equivalents, however at the trade-off that you have to tell perl when you want to use them. -X</k> X<k modifier> +X</p> X<p modifier> Backslashed metacharacters in Perl are alphanumeric, such as C<\b>, C<\w>, C<\n>. Unlike some other regular expression languages, there diff --git a/pod/perlvar.pod b/pod/perlvar.pod index b4db654178..fc738a0903 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -234,7 +234,7 @@ X<${^MATCH}> This is similar to C<$&> (C<$POSTMATCH>) except that it does not incur the performance penalty associated with that variable, and is only guaranteed to return a defined value when the pattern was compiled or executed with -the C</k> modifier. +the C</p> modifier. =item $PREMATCH @@ -257,7 +257,7 @@ X<${^PREMATCH}> This is similar to C<$`> ($PREMATCH) except that it does not incur the performance penalty associated with that variable, and is only guaranteed to return a defined value when the pattern was compiled or executed with -the C</k> modifier. +the C</p> modifier. =item $POSTMATCH @@ -286,7 +286,7 @@ X<${^POSTMATCH}> This is similar to C<$'> (C<$POSTMATCH>) except that it does not incur the performance penalty associated with that variable, and is only guaranteed to return a defined value when the pattern was compiled or executed with -the C</k> modifier. +the C</p> modifier. =item $LAST_PAREN_MATCH @@ -161,9 +161,13 @@ typedef struct regexp_engine { /* chars and strings used as regex pattern modifiers * Singlular is a 'c'har, plural is a "string" + * + * NOTE, KEEPCOPY was originally 'k', but was changed to 'p' for preserve + * for compatibility reasons with Regexp::Common which hijacked (?k:...) + * for its own uses. So 'k' is out as well. 
*/ #define EXEC_PAT_MOD 'e' -#define KEEPCOPY_PAT_MOD 'k' +#define KEEPCOPY_PAT_MOD 'p' #define ONCE_PAT_MOD 'o' #define GLOBAL_PAT_MOD 'g' #define CONTINUE_PAT_MOD 'c' @@ -173,7 +177,7 @@ typedef struct regexp_engine { #define XTENDED_PAT_MOD 'x' #define ONCE_PAT_MODS "o" -#define KEEPCOPY_PAT_MODS "k" +#define KEEPCOPY_PAT_MODS "p" #define EXEC_PAT_MODS "e" #define LOOP_PAT_MODS "gc" diff --git a/t/op/regexp_kmod.t b/t/op/regexp_pmod.t index 84efd83546..e20b859bef 100644 --- a/t/op/regexp_kmod.t +++ b/t/op/regexp_pmod.t @@ -10,8 +10,8 @@ use strict; use warnings; our @tests = ( - # /k Pattern PRE MATCH POST - [ 'k', "456", "123-", "456", "-789"], + # /p Pattern PRE MATCH POST + [ 'p', "456", "123-", "456", "-789"], [ '', "(456)", "123-", "456", "-789"], [ '', "456", undef, undef, undef ], ); @@ -24,11 +24,11 @@ sub _u($$) { "$_[0] is ".(defined $_[1] ? "'$_[1]'" : "undef") } $_ = '123-456-789'; foreach my $test (@tests) { - my ($k, $pat,$l,$m,$r) = @$test; - my $test_name = "/$pat/$k"; - my $ok = ok($k ? /$pat/k : /$pat/, $test_name); + my ($p, $pat,$l,$m,$r) = @$test; + my $test_name = "/$pat/$p"; + my $ok = ok($p ? /$pat/p : /$pat/, $test_name); SKIP: { - skip "/$pat/$k failed to match", 3 + skip "/$pat/$p failed to match", 3 unless $ok; is(${^PREMATCH}, $l,_u "$test_name: ^PREMATCH",$l); is(${^MATCH}, $m,_u "$test_name: ^MATCH",$m ); @@ -36,4 +36,4 @@ foreach my $test (@tests) { } } is($W,"","No warnings should be produced"); -ok(!defined ${^MATCH}, "No /k in scope so ^MATCH is undef"); +ok(!defined ${^MATCH}, "No /p in scope so ^MATCH is undef"); |