summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2010-08-18 23:48:16 -0600
committerFlorian Ragwitz <rafl@debian.org>2010-09-20 08:13:30 +0200
commitfb85c0447bf1d343a9b4d4d7075184aeb4c9ae46 (patch)
tree47c8406e939af312e69568c7a9ced9ec7d3529ed
parent5c3fa2e7f75bb4370f758b363cec53992c7fd20a (diff)
downloadperl-fb85c0447bf1d343a9b4d4d7075184aeb4c9ae46.tar.gz
Add (?^...) regex construct
This adds (?^...) to signify to use the default regex modifiers for the cluster or embedded pattern-match modifier change. The major purpose of this is to simplify regex stringification, so that "^" is output in place of "-xism". As a result, the stringification will not change in the future when new regex modifiers are added, so tests, etc. that rely on a particular stringification will have to change now, but never again. Code that needs to work properly with both old- and new-style regexes can use something like the following: # Accept both old and new-style stringification my $modifiers = (qr/foobar/ =~ /\Q(?^/) ? '^' : '-xism'; This construct is Ben Morrow's idea.
-rw-r--r--dist/Data-Dumper/t/bless.t2
-rw-r--r--ext/Devel-Peek/t/Peek.t6
-rw-r--r--lib/Dumpvalue.t2
-rw-r--r--pod/perldelta.pod29
-rw-r--r--pod/perldiag.pod5
-rw-r--r--pod/perlre.pod48
-rw-r--r--regcomp.c29
-rw-r--r--regexp.h4
-rw-r--r--t/comp/parser.t2
-rw-r--r--t/lib/warnings/regcomp5
-rw-r--r--t/re/pat.t10
-rw-r--r--t/re/pat_re_eval.t4
-rw-r--r--t/re/re.t5
-rw-r--r--t/run/fresh_perl.t2
14 files changed, 117 insertions, 36 deletions
diff --git a/dist/Data-Dumper/t/bless.t b/dist/Data-Dumper/t/bless.t
index 1716d14eb0..8b9e0c3b4c 100644
--- a/dist/Data-Dumper/t/bless.t
+++ b/dist/Data-Dumper/t/bless.t
@@ -44,7 +44,7 @@ SKIP: {
my $t = bless( qr//, 'foo');
my $dt = Dumper($t);
my $o = <<'PERL';
-$VAR1 = bless( qr/(?-xism:)/, 'foo' );
+$VAR1 = bless( qr/(?^:)/, 'foo' );
PERL
is($dt, $o, "We can dump blessed qr//'s properly");
diff --git a/ext/Devel-Peek/t/Peek.t b/ext/Devel-Peek/t/Peek.t
index 0b9009a315..ef1e6ae340 100644
--- a/ext/Devel-Peek/t/Peek.t
+++ b/ext/Devel-Peek/t/Peek.t
@@ -329,8 +329,8 @@ do_test(15,
SV = REGEXP\\($ADDR\\) at $ADDR
REFCNT = 1
FLAGS = \\(OBJECT,POK,FAKE,pPOK\\)
- PV = $ADDR "\\(\\?-xism:tic\\)"
- CUR = 12
+ PV = $ADDR "\\(\\?\\^:tic\\)"
+ CUR = 8
LEN = 0
STASH = $ADDR\\t"Regexp"');
} else {
@@ -350,7 +350,7 @@ do_test(15,
MG_VIRTUAL = $ADDR
MG_TYPE = PERL_MAGIC_qr\(r\)
MG_OBJ = $ADDR
- PAT = "\(\?-xism:tic\)" # $] >= 5.009
+ PAT = "\(\?^:tic\)" # $] >= 5.009
REFCNT = 2 # $] >= 5.009
STASH = $ADDR\\t"Regexp"');
}
diff --git a/lib/Dumpvalue.t b/lib/Dumpvalue.t
index 8eb70a34b8..6570e38a2e 100644
--- a/lib/Dumpvalue.t
+++ b/lib/Dumpvalue.t
@@ -130,7 +130,7 @@ is( $out->read, '', 'unwrap ignored glob on first try');
$d->unwrap(*FOO);
is( $out->read, "*DUMPED_GLOB*\n", 'unwrap worked on glob');
$d->unwrap(qr/foo(.+)/);
-is( $out->read, "-> qr/(?-xism:foo(.+))/\n", 'unwrap worked on Regexp' );
+is( $out->read, "-> qr/(?^:foo(.+))/\n", 'unwrap worked on Regexp' );
$d->unwrap( sub {} );
like( $out->read, qr/^-> &CODE/, 'unwrap worked on sub ref' );
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 5597857c3e..7e2e2ee031 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -28,6 +28,18 @@ here, but most should go in the L</Performance Enhancements> section.
[ List each enhancement as a =head2 entry ]
+=head2 C<(?^...)> regex construct added to signify default modifiers
+
+A caret (also called a "cirumflex accent") C<"^"> immediately following
+a C<"(?"> in a regular expression now means that the subexpression is to
+not inherit the surrounding modifiers such as C</i>, but to revert to the
+Perl defaults. Any modifiers following the caret override the defaults.
+
+The stringification of regular expressions now uses this notation. The
+main purpose of this is to allow tests that rely on the stringification
+to not have to change when new modifiers are added. See
+L<perlre/Extended Patterns>.
+
=head1 Security
XXX Any security-related notices go here. In particular, any security
@@ -38,10 +50,21 @@ L</Selected Bug Fixes> section.
=head1 Incompatible Changes
-XXX For a release on a stable branch, this section aspires to be:
+=head2 Stringification of regexes has changed
+
+Default regular expression modifiers are now notated by using
+C<(?^...)>. Code relying on the old stringification will fail. The
+purpose of this is so that when new modifiers are added, such code will
+not have to change, as the stringification will automatically
+incorporate the new modifiers.
+
+Code that needs to work properly with both old- and new-style regexes
+can use something like the following:
+
+ # Accept both old and new-style stringification
+ my $modifiers = (qr/foobar/ =~ /\Q(?^/) ? '^' : '-xism';
- There are no changes intentionally incompatible with 5.XXX.XXX. If any
- exist, they are bugs and reports are welcome.
+And then use C<$modifiers> instead of C<-xism>.
[ List each incompatible change as a =head2 entry ]
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 7bd44980fd..f7693e6f24 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -4033,7 +4033,10 @@ where the problem was discovered. See L<perlre>.
(F) You used a regular expression extension that doesn't make sense. The
<-- HERE shows in the regular expression about where the problem was
-discovered. See L<perlre>.
+discovered. This happens when using the C<(?^...)> construct to tell
+Perl to use the default regular expression modifiers, and you
+redundantly specify a default modifier. For other causes, see
+L<perlre>.
=item Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/
diff --git a/pod/perlre.pod b/pod/perlre.pod
index de5b719772..6e68bcd1db 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -595,12 +595,20 @@ the comment as soon as it sees a C<)>, so there is no way to put a literal
C<)> in the comment.
=item C<(?pimsx-imsx)>
-X<(?)>
+
+=item C<(?^pimsx)>
+X<(?)> X<(?^)>
One or more embedded pattern-match modifiers, to be turned on (or
turned off, if preceded by C<->) for the remainder of the pattern or
-the remainder of the enclosing pattern group (if any). This is
-particularly useful for dynamic patterns, such as those read in from a
+the remainder of the enclosing pattern group (if any).
+
+Starting in Perl 5.14, a C<"^"> (caret or circumflex accent) immediately
+after the C<"?"> is a shorthand equivalent to C<-imsx> and compiling the
+regex under C<no locale>. Flags may follow the caret to override it.
+But a minus sign is not legal with it.
+
+This is particularly useful for dynamic patterns, such as those read in from a
configuration file, taken from an argument, or specified in a table
somewhere. Consider the case where some patterns want to be case
sensitive and some do not: The case insensitive ones merely need to
@@ -636,6 +644,9 @@ X<(?:)>
=item C<(?imsx-imsx:pattern)>
+=item C<(?^imsx:pattern)>
+X<(?^:)>
+
This is for clustering, not capturing; it groups subexpressions like
"()", but doesn't make backreferences as "()" does. So
@@ -657,6 +668,37 @@ is equivalent to the more verbose
/(?:(?s-i)more.*than).*million/i
+Starting in Perl 5.14, a C<"^"> (caret or circumflex accent) immediately
+after the C<"?"> is a shorthand equivalent to C<-imsx> and compiling the
+regex under C<no locale>. Any positive flags may follow the caret, so
+
+ (?^x:foo)
+
+is equivalent to
+
+ (?x-ims:foo)
+
+The caret tells Perl that this cluster doesn't inherit the flags of any
+surrounding pattern, but to go back to the system defaults (C<-imsx>),
+modified by any flags specified.
+
+The caret allows for simpler stringification of compiled regular
+expressions. These look like
+
+ (?^:pattern)
+
+with any non-default flags appearing between the caret and the colon.
+A test that looks at such stringification thus doesn't need to have the
+system default flags hard-coded in it, just the caret. If new flags are
+added to Perl, the meaning of the caret's expansion will change to include
+the default for those flags, so the test will still work, unchanged.
+
+Specifying a negative flag after the caret is an error, as the flag is
+redundant.
+
+Mnemonic for C<(?^...)>: A fresh beginning since the usual use of a caret is
+to match at the beginning.
+
=item C<(?|pattern)>
X<(?|)> X<Branch reset>
diff --git a/regcomp.c b/regcomp.c
index d4ce12adeb..2871e4a10c 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -4428,33 +4428,29 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
>> RXf_PMf_STD_PMMOD_SHIFT);
const char *fptr = STD_PAT_MODS; /*"msix"*/
char *p;
- const STRLEN wraplen = plen + has_minus + has_p + has_runon
+ /* Allocate for the worst case, which is all the std flags are turned
+ * on, but this means no caret. We never output a minus, as all those
+ * are defaults, so are covered by the caret */
+ const STRLEN wraplen = plen + has_p + has_runon
+ (sizeof(STD_PAT_MODS) - 1)
+ (sizeof("(?:)") - 1);
p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
- SvCUR_set(rx, wraplen);
SvPOK_on(rx);
SvFLAGS(rx) |= SvUTF8(pattern);
*p++='('; *p++='?';
+ if (has_minus) { /* If a default, cover it using the caret */
+ *p++='^';
+ }
if (has_p)
*p++ = KEEPCOPY_PAT_MOD; /*'p'*/
{
- char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
- char *colon = r + 1;
char ch;
-
while((ch = *fptr++)) {
if(reganch & 1)
*p++ = ch;
- else
- *r-- = ch;
reganch >>= 1;
}
- if(has_minus) {
- *r = '-';
- p = colon;
- }
}
*p++ = ':';
@@ -4466,6 +4462,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
*p++ = '\n';
*p++ = ')';
*p = 0;
+ SvCUR_set(rx, p - SvPVX_const(rx));
}
r->intflags = 0;
@@ -5666,6 +5663,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
+ bool has_use_defaults = FALSE;
RExC_parse++;
paren = *RExC_parse++;
@@ -6120,6 +6118,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
RExC_parse--; /* for vFAIL to print correctly */
vFAIL("Sequence (? incomplete");
break;
+ case '^': /* Use default flags with the exceptions that follow */
+ has_use_defaults = TRUE;
+ STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
+ goto parse_flags;
default:
--RExC_parse;
parse_flags: /* (?i) */
@@ -6173,7 +6175,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
}
break;
case '-':
- if (flagsp == &negflags) {
+ /* A flag is a default iff it is following a minus, so
+ * if there is a minus, it means will be trying to
+ * re-specify a default which is an error */
+ if (has_use_defaults || flagsp == &negflags) {
RExC_parse++;
vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
/*NOTREACHED*/
diff --git a/regexp.h b/regexp.h
index 298a417d26..198b51017c 100644
--- a/regexp.h
+++ b/regexp.h
@@ -236,6 +236,10 @@ and check for NULL.
case SINGLE_PAT_MOD: *(pmfl) |= RXf_PMf_SINGLELINE; break; \
case XTENDED_PAT_MOD: *(pmfl) |= RXf_PMf_EXTENDED; break
+/* Note, includes locale */
+#define STD_PMMOD_FLAGS_CLEAR(pmfl) \
+ *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_LOCALE)
+
/* chars and strings used as regex pattern modifiers
* Singlular is a 'c'har, plural is a "string"
*
diff --git a/t/comp/parser.t b/t/comp/parser.t
index 8fd94530cb..5c64d11fb2 100644
--- a/t/comp/parser.t
+++ b/t/comp/parser.t
@@ -125,7 +125,7 @@ is( $@, '', 'PL_lex_brackstack' );
is("${a}[", "A[", "interpolation, qq//");
my @b=("B");
is("@{b}{", "B{", "interpolation, qq//");
- is(qr/${a}{/, '(?-xism:A{)', "interpolation, qr//");
+ is(qr/${a}{/, '(?^:A{)', "interpolation, qr//");
my $c = "A{";
$c =~ /${a}{/;
is($&, 'A{', "interpolation, m//");
diff --git a/t/lib/warnings/regcomp b/t/lib/warnings/regcomp
index 98280f61f4..2842882371 100644
--- a/t/lib/warnings/regcomp
+++ b/t/lib/warnings/regcomp
@@ -250,3 +250,8 @@ $a = qr/[\8\9]/;
EXPECT
Unrecognized escape \8 in character class passed through in regex; marked by <-- HERE in m/[\8 <-- HERE \9]/ at - line 3.
Unrecognized escape \9 in character class passed through in regex; marked by <-- HERE in m/[\8\9 <-- HERE ]/ at - line 3.
+########
+# regcomp.c [Perl_re_compile]
+$a = qr/(?^-i:foo)/;
+EXPECT
+Sequence (?^-...) not recognized in regex; marked by <-- HERE in m/(?^- <-- HERE i:foo)/ at - line 2.
diff --git a/t/re/pat.t b/t/re/pat.t
index ba0efcdc98..3bc7f5d372 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -499,12 +499,12 @@ sub run_tests {
}
{
- iseq qr/\b\v$/i, '(?i-xsm:\b\v$)', 'qr/\b\v$/i';
- iseq qr/\b\v$/s, '(?s-xim:\b\v$)', 'qr/\b\v$/s';
- iseq qr/\b\v$/m, '(?m-xis:\b\v$)', 'qr/\b\v$/m';
- iseq qr/\b\v$/x, '(?x-ism:\b\v$)', 'qr/\b\v$/x';
+ iseq qr/\b\v$/i, '(?^i:\b\v$)', 'qr/\b\v$/i';
+ iseq qr/\b\v$/s, '(?^s:\b\v$)', 'qr/\b\v$/s';
+ iseq qr/\b\v$/m, '(?^m:\b\v$)', 'qr/\b\v$/m';
+ iseq qr/\b\v$/x, '(?^x:\b\v$)', 'qr/\b\v$/x';
iseq qr/\b\v$/xism, '(?msix:\b\v$)', 'qr/\b\v$/xism';
- iseq qr/\b\v$/, '(?-xism:\b\v$)', 'qr/\b\v$/';
+ iseq qr/\b\v$/, '(?^:\b\v$)', 'qr/\b\v$/';
}
diff --git a/t/re/pat_re_eval.t b/t/re/pat_re_eval.t
index fab828d17d..6fcbb9360b 100644
--- a/t/re/pat_re_eval.t
+++ b/t/re/pat_re_eval.t
@@ -77,7 +77,7 @@ sub run_tests {
{
our $a = bless qr /foo/ => 'Foo';
ok 'goodfood' =~ $a, "Reblessed qr // matches";
- iseq $a, '(?-xism:foo)', "Reblessed qr // stringifies";
+ iseq $a, '(?^:foo)', "Reblessed qr // stringifies";
my $x = "\x{3fe}";
my $z = my $y = "\317\276"; # Byte representation of $x
$a = qr /$x/;
@@ -88,7 +88,7 @@ sub run_tests {
"Postponed interpolation of qr // preserves UTF-8";
{
local $BugId = '17776';
- iseq length qr /##/x, 12, "## in qr // doesn't corrupt memory";
+ iseq length qr /##/x, 9, "## in qr // doesn't corrupt memory";
}
{
use re 'eval';
diff --git a/t/re/re.t b/t/re/re.t
index 249c6ddf22..10e2ee2b64 100644
--- a/t/re/re.t
+++ b/t/re/re.t
@@ -21,12 +21,11 @@ use re qw(is_regexp regexp_pattern
is((regexp_pattern($qr))[0],'foo','regexp_pattern[0] (ref)');
is((regexp_pattern($qr))[1],'ip','regexp_pattern[1] (ref)');
- is(regexp_pattern($qr),'(?pi-xsm:foo)','scalar regexp_pattern (ref)');
+ is(regexp_pattern($qr),'(?^pi:foo)','scalar regexp_pattern (ref)');
is((regexp_pattern($rx))[0],'foo','regexp_pattern[0] (bare REGEXP)');
is((regexp_pattern($rx))[1],'ip','regexp_pattern[1] (bare REGEXP)');
- is(regexp_pattern($rx),'(?pi-xsm:foo)',
- 'scalar regexp_pattern (bare REGEXP)');
+ is(regexp_pattern($rx),'(?^pi:foo)', 'scalar regexp_pattern (bare REGEXP)');
ok(!regexp_pattern(''),'!regexp_pattern("")');
}
diff --git a/t/run/fresh_perl.t b/t/run/fresh_perl.t
index 2019d9b520..3666f0922e 100644
--- a/t/run/fresh_perl.t
+++ b/t/run/fresh_perl.t
@@ -605,7 +605,7 @@ EXPECT
# reversed again as a result of [perl #17763]
die qr(x)
EXPECT
-(?-xism:x)
+(?^:x)
########
# 20001210.003 mjd@plover.com
format REMITOUT_TOP =