summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2015-09-23 13:44:31 -0600
committerKarl Williamson <khw@cpan.org>2015-10-11 10:48:32 -0600
commitd6b8921298d7e29c2c19955580f537390b4d8cef (patch)
treee0486ffa55f38f70a2f22b7aab8074e80895808d
parent730480ce9e3e0103efa8f551281a62b39f573b1a (diff)
downloadperl-d6b8921298d7e29c2c19955580f537390b4d8cef.tar.gz
Restrict white space inside [] inside (?[ ]) to \h
This experimental construct has turned on /x processing for any interior bracketed character classes, except that comments are not allowed. But bracketed character classes have traditionally each been confined to a single line, and I'm leery of the problems that could arise if we allow them to span multiple lines. Therefore, restrict the ignorable white space to just spaces and tabs before this feature becomes non-experimental. If there is cause, we can later relax the prohibition.
-rw-r--r--pod/perldelta.pod16
-rw-r--r--pod/perlrecharclass.pod5
-rw-r--r--regcomp.c48
-rw-r--r--t/re/regex_sets.t1
4 files changed, 38 insertions, 32 deletions
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 60410fc45b..2bc85d66a3 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -37,12 +37,6 @@ L</Selected Bug Fixes> section.
=head1 Incompatible Changes
-XXX For a release on a stable branch, this section aspires to be:
-
- There are no changes intentionally incompatible with 5.XXX.XXX
- If any exist, they are bugs, and we request that you submit a
- report. See L</Reporting Bugs> below.
-
=head2 Lexical $_ has been removed
C<my $_> was introduced in Perl v5.10, and subsequently caused much confusion
@@ -51,6 +45,16 @@ theory that it would either be removed or redesigned in a less confusing (but
backward-incompatible) way. Over the following years, no alternatives were
proposed. The feature has now been removed and will fail to compile.
+=head2 Only blanks and tabs are now allowed within C<[...]> within C<(?[...])>.
+
+The experimental Extended Bracketed Character Classes can contain
+regular bracketed character classes within them. These differ from
+regular ones in that white space is generally ignored, unless escaped by
+preceding it with a backslash. The white space that is ignored is now
+limited to just tab C<\t> and SPACE characters. Previously, it was any
+white space. See
+L<perlrecharclass/Extended Bracketed Character Classes>.
+
=head1 Deprecations
XXX Any deprecated features, syntax, modules etc. should be listed here.
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index f46de4c801..ef8048f530 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -1051,8 +1051,9 @@ C<\N{...}>, etc.)
This last example shows the use of this construct to specify an ordinary
bracketed character class without additional set operations. Note the
-white space within it; C<E<sol>x> is turned on even within bracketed
-character classes, except you can't have comments inside them. Hence,
+white space within it; a limited version of C<E<sol>x> is turned on even
+within bracketed character classes, with only the SPACE and TAB (C<\t>)
+characters allowed, and no comments. Hence,
(?[ [#] ])
diff --git a/regcomp.c b/regcomp.c
index d7c6e323bc..5117e5cbde 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -14253,6 +14253,23 @@ S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN c
#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
(SvCUR(listsv) != initial_listsv_len)
+/* There is a restricted set of white space characters that are legal when
+ * ignoring white space in a bracketed character class. This generates the
+ * code to skip them.
+ *
+ * 'do_skip' is the condition under which any skipping happens; when it is
+ * false the macro does nothing. 'p' is the parse pointer: it is advanced in
+ * place past each character for which isBLANK_A() is true, stopping once it
+ * reaches RExC_end. 'p' is expanded several times and is incremented, so it
+ * must be a modifiable lvalue whose evaluation has no side effects.
+ *
+ * There is a line below that uses the same white space criteria but is outside
+ * this macro. Both here and there must use the same definition */
+#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
+ STMT_START { \
+ if (do_skip) { \
+ while ( p < RExC_end \
+ && isBLANK_A(UCHARAT(p))) \
+ { \
+ p++; \
+ } \
+ } \
+ } STMT_END
+
STATIC regnode *
S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
const bool stop_at_1, /* Just parse the next thing, don't
@@ -14413,20 +14430,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
}
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_parse++;
invert = TRUE;
allow_multi_folds = FALSE;
MARK_NAUGHTY(1);
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
}
/* Check that they didn't say [:posix:] instead of [[:posix:]] */
@@ -14463,10 +14474,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
break;
}
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
if (UCHARAT(RExC_parse) == ']') {
break;
@@ -14519,7 +14527,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
* skipped, it means that that white space is wanted literally, and
* is already in 'value'. Otherwise, need to translate the escape
* into what it signifies. */
- if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
+ if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
case 'w': namedclass = ANYOF_WORDCHAR; break;
case 'W': namedclass = ANYOF_NWORDCHAR; break;
@@ -15068,10 +15076,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
}
} /* end of namedclass \blah */
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
/* If 'range' is set, 'value' is the ending of a range--check its
* validity. (If value isn't a single code point in the case of a
@@ -15112,12 +15117,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
&& *RExC_parse == '-')
{
char* next_char_ptr = RExC_parse + 1;
- if (skip_white) { /* Get the next real char after the '-' */
- next_char_ptr = regpatws(pRExC_state,
- RExC_parse + 1,
- FALSE); /* means don't recognize
- comments */
- }
+
+ /* Get the next real char after the '-' */
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
/* If the '-' is at the end of the class (just before the ']',
* it is a literal minus; otherwise it is a range */
diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t
index ee161b2eeb..0511117b1a 100644
--- a/t/re/regex_sets.t
+++ b/t/re/regex_sets.t
@@ -27,7 +27,6 @@ like("a", qr/(?[ [a] # This is a comment
like("a", qr/(?[ [a] # [[:notaclass:]]
])/, 'A comment isn\'t parsed');
unlike(uni_to_native("\x85"), qr/(?[ \t… ])/, 'NEL is white space');
-unlike(uni_to_native("\x85"), qr/(?[ [\t…] ])/, '... including within nested []');
like(uni_to_native("\x85"), qr/(?[ \t + \… ])/, 'can escape NEL to match');
like(uni_to_native("\x85"), qr/(?[ [\…] ])/, '... including within nested []');
like("\t", qr/(?[ \t + \… ])/, 'can do basic union');