diff options
author | Karl Williamson <public@khwilliamson.com> | 2010-08-18 23:48:16 -0600 |
---|---|---|
committer | Florian Ragwitz <rafl@debian.org> | 2010-09-20 08:13:30 +0200 |
commit | fb85c0447bf1d343a9b4d4d7075184aeb4c9ae46 (patch) | |
tree | 47c8406e939af312e69568c7a9ced9ec7d3529ed | |
parent | 5c3fa2e7f75bb4370f758b363cec53992c7fd20a (diff) | |
download | perl-fb85c0447bf1d343a9b4d4d7075184aeb4c9ae46.tar.gz |
Add (?^...) regex construct
This adds (?^...) to signify to use the default regex modifiers for the
cluster or embedded pattern-match modifier change. The major purpose of
this is to simplify regex stringification, so that "^" is output in
place of "-xism". As a result, the stringification will not change in
the future when new regex modifiers are added, so tests, etc. that rely
on a particular stringification will have to change now, but never
again.
Code that needs to work properly with both old- and new-style regexes
can use something like the following:
# Accept both old and new-style stringification
my $modifiers = (qr/foobar/ =~ /\Q(?^/) ? '^' : '-xism';
This construct is Ben Morrow's idea.
-rw-r--r-- | dist/Data-Dumper/t/bless.t | 2 | ||||
-rw-r--r-- | ext/Devel-Peek/t/Peek.t | 6 | ||||
-rw-r--r-- | lib/Dumpvalue.t | 2 | ||||
-rw-r--r-- | pod/perldelta.pod | 29 | ||||
-rw-r--r-- | pod/perldiag.pod | 5 | ||||
-rw-r--r-- | pod/perlre.pod | 48 | ||||
-rw-r--r-- | regcomp.c | 29 | ||||
-rw-r--r-- | regexp.h | 4 | ||||
-rw-r--r-- | t/comp/parser.t | 2 | ||||
-rw-r--r-- | t/lib/warnings/regcomp | 5 | ||||
-rw-r--r-- | t/re/pat.t | 10 | ||||
-rw-r--r-- | t/re/pat_re_eval.t | 4 | ||||
-rw-r--r-- | t/re/re.t | 5 | ||||
-rw-r--r-- | t/run/fresh_perl.t | 2 |
14 files changed, 117 insertions, 36 deletions
diff --git a/dist/Data-Dumper/t/bless.t b/dist/Data-Dumper/t/bless.t index 1716d14eb0..8b9e0c3b4c 100644 --- a/dist/Data-Dumper/t/bless.t +++ b/dist/Data-Dumper/t/bless.t @@ -44,7 +44,7 @@ SKIP: { my $t = bless( qr//, 'foo'); my $dt = Dumper($t); my $o = <<'PERL'; -$VAR1 = bless( qr/(?-xism:)/, 'foo' ); +$VAR1 = bless( qr/(?^:)/, 'foo' ); PERL is($dt, $o, "We can dump blessed qr//'s properly"); diff --git a/ext/Devel-Peek/t/Peek.t b/ext/Devel-Peek/t/Peek.t index 0b9009a315..ef1e6ae340 100644 --- a/ext/Devel-Peek/t/Peek.t +++ b/ext/Devel-Peek/t/Peek.t @@ -329,8 +329,8 @@ do_test(15, SV = REGEXP\\($ADDR\\) at $ADDR REFCNT = 1 FLAGS = \\(OBJECT,POK,FAKE,pPOK\\) - PV = $ADDR "\\(\\?-xism:tic\\)" - CUR = 12 + PV = $ADDR "\\(\\?\\^:tic\\)" + CUR = 8 LEN = 0 STASH = $ADDR\\t"Regexp"'); } else { @@ -350,7 +350,7 @@ do_test(15, MG_VIRTUAL = $ADDR MG_TYPE = PERL_MAGIC_qr\(r\) MG_OBJ = $ADDR - PAT = "\(\?-xism:tic\)" # $] >= 5.009 + PAT = "\(\?^:tic\)" # $] >= 5.009 REFCNT = 2 # $] >= 5.009 STASH = $ADDR\\t"Regexp"'); } diff --git a/lib/Dumpvalue.t b/lib/Dumpvalue.t index 8eb70a34b8..6570e38a2e 100644 --- a/lib/Dumpvalue.t +++ b/lib/Dumpvalue.t @@ -130,7 +130,7 @@ is( $out->read, '', 'unwrap ignored glob on first try'); $d->unwrap(*FOO); is( $out->read, "*DUMPED_GLOB*\n", 'unwrap worked on glob'); $d->unwrap(qr/foo(.+)/); -is( $out->read, "-> qr/(?-xism:foo(.+))/\n", 'unwrap worked on Regexp' ); +is( $out->read, "-> qr/(?^:foo(.+))/\n", 'unwrap worked on Regexp' ); $d->unwrap( sub {} ); like( $out->read, qr/^-> &CODE/, 'unwrap worked on sub ref' ); diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 5597857c3e..7e2e2ee031 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -28,6 +28,18 @@ here, but most should go in the L</Performance Enhancements> section. [ List each enhancement as a =head2 entry ] +=head2 C<(?^...)> regex construct added to signify default modifiers + +A caret (also called a "cirumflex accent") C<"^"> immediately following +a C<"(?"> in a regular expression now means that the subexpression is to +not inherit the surrounding modifiers such as C</i>, but to revert to the +Perl defaults. Any modifiers following the caret override the defaults. + +The stringification of regular expressions now uses this notation. The +main purpose of this is to allow tests that rely on the stringification +to not have to change when new modifiers are added. See +L<perlre/Extended Patterns>. + =head1 Security XXX Any security-related notices go here. In particular, any security @@ -38,10 +50,21 @@ L</Selected Bug Fixes> section. =head1 Incompatible Changes -XXX For a release on a stable branch, this section aspires to be: +=head2 Stringification of regexes has changed + +Default regular expression modifiers are now notated by using +C<(?^...)>. Code relying on the old stringification will fail. The +purpose of this is so that when new modifiers are added, such code will +not have to change, as the stringification will automatically +incorporate the new modifiers. + +Code that needs to work properly with both old- and new-style regexes +can use something like the following: + + # Accept both old and new-style stringification + my $modifiers = (qr/foobar/ =~ /\Q(?^/) ? '^' : '-xism'; - There are no changes intentionally incompatible with 5.XXX.XXX. If any - exist, they are bugs and reports are welcome. +And then use C<$modifiers> instead of C<-xism>. [ List each incompatible change as a =head2 entry ] diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 7bd44980fd..f7693e6f24 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -4033,7 +4033,10 @@ where the problem was discovered. See L<perlre>. (F) You used a regular expression extension that doesn't make sense. The <-- HERE shows in the regular expression about where the problem was -discovered. See L<perlre>. +discovered. This happens when using the C<(?^...)> construct to tell +Perl to use the default regular expression modifiers, and you +redundantly specify a default modifier. For other causes, see +L<perlre>. =item Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ diff --git a/pod/perlre.pod b/pod/perlre.pod index de5b719772..6e68bcd1db 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -595,12 +595,20 @@ the comment as soon as it sees a C<)>, so there is no way to put a literal C<)> in the comment. =item C<(?pimsx-imsx)> -X<(?)> + +=item C<(?^pimsx)> +X<(?)> X<(?^)> One or more embedded pattern-match modifiers, to be turned on (or turned off, if preceded by C<->) for the remainder of the pattern or -the remainder of the enclosing pattern group (if any). This is -particularly useful for dynamic patterns, such as those read in from a +the remainder of the enclosing pattern group (if any). + +Starting in Perl 5.14, a C<"^"> (caret or circumflex accent) immediately +after the C<"?"> is a shorthand equivalent to C<-imsx> and compiling the +regex under C<no locale>. Flags may follow the caret to override it. +But a minus sign is not legal with it. + +This is particularly useful for dynamic patterns, such as those read in from a configuration file, taken from an argument, or specified in a table somewhere. Consider the case where some patterns want to be case sensitive and some do not: The case insensitive ones merely need to @@ -636,6 +644,9 @@ X<(?:)> =item C<(?imsx-imsx:pattern)> +=item C<(?^imsx:pattern)> +X<(?^:)> + This is for clustering, not capturing; it groups subexpressions like "()", but doesn't make backreferences as "()" does. So @@ -657,6 +668,37 @@ is equivalent to the more verbose /(?:(?s-i)more.*than).*million/i +Starting in Perl 5.14, a C<"^"> (caret or circumflex accent) immediately +after the C<"?"> is a shorthand equivalent to C<-imsx> and compiling the +regex under C<no locale>. Any positive flags may follow the caret, so + + (?^x:foo) + +is equivalent to + + (?x-ims:foo) + +The caret tells Perl that this cluster doesn't inherit the flags of any +surrounding pattern, but to go back to the system defaults (C<-imsx>), +modified by any flags specified. + +The caret allows for simpler stringification of compiled regular +expressions. These look like + + (?^:pattern) + +with any non-default flags appearing between the caret and the colon. +A test that looks at such stringification thus doesn't need to have the +system default flags hard-coded in it, just the caret. If new flags are +added to Perl, the meaning of the caret's expansion will change to include +the default for those flags, so the test will still work, unchanged. + +Specifying a negative flag after the caret is an error, as the flag is +redundant. + +Mnemonic for C<(?^...)>: A fresh beginning since the usual use of a caret is +to match at the beginning. + =item C<(?|pattern)> X<(?|)> X<Branch reset> @@ -4428,33 +4428,29 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ char *p; - const STRLEN wraplen = plen + has_minus + has_p + has_runon + /* Allocate for the worst case, which is all the std flags are turned + * on, but this means no caret. We never output a minus, as all those + * are defaults, so are covered by the caret */ + const STRLEN wraplen = plen + has_p + has_runon + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); p = sv_grow(MUTABLE_SV(rx), wraplen + 1); - SvCUR_set(rx, wraplen); SvPOK_on(rx); SvFLAGS(rx) |= SvUTF8(pattern); *p++='('; *p++='?'; + if (has_minus) { /* If a default, cover it using the caret */ + *p++='^'; + } if (has_p) *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ { - char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; - char *colon = r + 1; char ch; - while((ch = *fptr++)) { if(reganch & 1) *p++ = ch; - else - *r-- = ch; reganch >>= 1; } - if(has_minus) { - *r = '-'; - p = colon; - } } *p++ = ':'; @@ -4466,6 +4462,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags) *p++ = '\n'; *p++ = ')'; *p = 0; + SvCUR_set(rx, p - SvPVX_const(rx)); } r->intflags = 0; @@ -5666,6 +5663,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse == '?') { /* (?...) */ bool is_logical = 0; const char * const seqstart = RExC_parse; + bool has_use_defaults = FALSE; RExC_parse++; paren = *RExC_parse++; @@ -6120,6 +6118,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse--; /* for vFAIL to print correctly */ vFAIL("Sequence (? incomplete"); break; + case '^': /* Use default flags with the exceptions that follow */ + has_use_defaults = TRUE; + STD_PMMOD_FLAGS_CLEAR(&RExC_flags); + goto parse_flags; default: --RExC_parse; parse_flags: /* (?i) */ @@ -6173,7 +6175,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } break; case '-': - if (flagsp == &negflags) { + /* A flag is a default iff it is following a minus, so + * if there is a minus, it means will be trying to + * re-specify a default which is an error */ + if (has_use_defaults || flagsp == &negflags) { RExC_parse++; vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); /*NOTREACHED*/ @@ -236,6 +236,10 @@ and check for NULL. case SINGLE_PAT_MOD: *(pmfl) |= RXf_PMf_SINGLELINE; break; \ case XTENDED_PAT_MOD: *(pmfl) |= RXf_PMf_EXTENDED; break +/* Note, includes locale */ +#define STD_PMMOD_FLAGS_CLEAR(pmfl) \ + *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_LOCALE) + /* chars and strings used as regex pattern modifiers * Singlular is a 'c'har, plural is a "string" * diff --git a/t/comp/parser.t b/t/comp/parser.t index 8fd94530cb..5c64d11fb2 100644 --- a/t/comp/parser.t +++ b/t/comp/parser.t @@ -125,7 +125,7 @@ is( $@, '', 'PL_lex_brackstack' ); is("${a}[", "A[", "interpolation, qq//"); my @b=("B"); is("@{b}{", "B{", "interpolation, qq//"); - is(qr/${a}{/, '(?-xism:A{)', "interpolation, qr//"); + is(qr/${a}{/, '(?^:A{)', "interpolation, qr//"); my $c = "A{"; $c =~ /${a}{/; is($&, 'A{', "interpolation, m//"); diff --git a/t/lib/warnings/regcomp b/t/lib/warnings/regcomp index 98280f61f4..2842882371 100644 --- a/t/lib/warnings/regcomp +++ b/t/lib/warnings/regcomp @@ -250,3 +250,8 @@ $a = qr/[\8\9]/; EXPECT Unrecognized escape \8 in character class passed through in regex; marked by <-- HERE in m/[\8 <-- HERE \9]/ at - line 3. Unrecognized escape \9 in character class passed through in regex; marked by <-- HERE in m/[\8\9 <-- HERE ]/ at - line 3. +######## +# regcomp.c [Perl_re_compile] +$a = qr/(?^-i:foo)/; +EXPECT +Sequence (?^-...) not recognized in regex; marked by <-- HERE in m/(?^- <-- HERE i:foo)/ at - line 2. diff --git a/t/re/pat.t b/t/re/pat.t index ba0efcdc98..3bc7f5d372 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -499,12 +499,12 @@ sub run_tests { } { - iseq qr/\b\v$/i, '(?i-xsm:\b\v$)', 'qr/\b\v$/i'; - iseq qr/\b\v$/s, '(?s-xim:\b\v$)', 'qr/\b\v$/s'; - iseq qr/\b\v$/m, '(?m-xis:\b\v$)', 'qr/\b\v$/m'; - iseq qr/\b\v$/x, '(?x-ism:\b\v$)', 'qr/\b\v$/x'; + iseq qr/\b\v$/i, '(?^i:\b\v$)', 'qr/\b\v$/i'; + iseq qr/\b\v$/s, '(?^s:\b\v$)', 'qr/\b\v$/s'; + iseq qr/\b\v$/m, '(?^m:\b\v$)', 'qr/\b\v$/m'; + iseq qr/\b\v$/x, '(?^x:\b\v$)', 'qr/\b\v$/x'; iseq qr/\b\v$/xism, '(?msix:\b\v$)', 'qr/\b\v$/xism'; - iseq qr/\b\v$/, '(?-xism:\b\v$)', 'qr/\b\v$/'; + iseq qr/\b\v$/, '(?^:\b\v$)', 'qr/\b\v$/'; } diff --git a/t/re/pat_re_eval.t b/t/re/pat_re_eval.t index fab828d17d..6fcbb9360b 100644 --- a/t/re/pat_re_eval.t +++ b/t/re/pat_re_eval.t @@ -77,7 +77,7 @@ sub run_tests { { our $a = bless qr /foo/ => 'Foo'; ok 'goodfood' =~ $a, "Reblessed qr // matches"; - iseq $a, '(?-xism:foo)', "Reblessed qr // stringifies"; + iseq $a, '(?^:foo)', "Reblessed qr // stringifies"; my $x = "\x{3fe}"; my $z = my $y = "\317\276"; # Byte representation of $x $a = qr /$x/; @@ -88,7 +88,7 @@ sub run_tests { "Postponed interpolation of qr // preserves UTF-8"; { local $BugId = '17776'; - iseq length qr /##/x, 12, "## in qr // doesn't corrupt memory"; + iseq length qr /##/x, 9, "## in qr // doesn't corrupt memory"; } { use re 'eval'; @@ -21,12 +21,11 @@ use re qw(is_regexp regexp_pattern is((regexp_pattern($qr))[0],'foo','regexp_pattern[0] (ref)'); is((regexp_pattern($qr))[1],'ip','regexp_pattern[1] (ref)'); - is(regexp_pattern($qr),'(?pi-xsm:foo)','scalar regexp_pattern (ref)'); + is(regexp_pattern($qr),'(?^pi:foo)','scalar regexp_pattern (ref)'); is((regexp_pattern($rx))[0],'foo','regexp_pattern[0] (bare REGEXP)'); is((regexp_pattern($rx))[1],'ip','regexp_pattern[1] (bare REGEXP)'); - is(regexp_pattern($rx),'(?pi-xsm:foo)', - 'scalar regexp_pattern (bare REGEXP)'); + is(regexp_pattern($rx),'(?^pi:foo)', 'scalar regexp_pattern (bare REGEXP)'); ok(!regexp_pattern(''),'!regexp_pattern("")'); } diff --git a/t/run/fresh_perl.t b/t/run/fresh_perl.t index 2019d9b520..3666f0922e 100644 --- a/t/run/fresh_perl.t +++ b/t/run/fresh_perl.t @@ -605,7 +605,7 @@ EXPECT # reversed again as a result of [perl #17763] die qr(x) EXPECT -(?-xism:x) +(?^:x) ######## # 20001210.003 mjd@plover.com format REMITOUT_TOP = |