summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc6
-rw-r--r--embed.h5
-rw-r--r--pod/perldelta.pod32
-rw-r--r--pod/perldiag.pod17
-rw-r--r--pod/perlre.pod15
-rw-r--r--proto.h20
-rw-r--r--regcharclass.h12
-rw-r--r--regcomp.c131
-rwxr-xr-xregen/regcharclass.pl2
-rw-r--r--t/re/pat.t7
-rw-r--r--t/re/reg_mesg.t2
11 files changed, 108 insertions, 141 deletions
diff --git a/embed.fnc b/embed.fnc
index 6fe5daa43f..478e7485cc 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2093,16 +2093,14 @@ Es |U32 |join_exact |NN RExC_state_t *pRExC_state \
|NN regnode *scan|NN UV *min_subtract \
|NN bool *unfolded_multi_char \
|U32 flags|NULLOK regnode *val|U32 depth
-EsRn |char * |regwhite |NN RExC_state_t *pRExC_state \
- |NN char *p
-EsRn |char * |regpatws |NN RExC_state_t *pRExC_state \
+EsR |char * |regpatws |NN RExC_state_t *pRExC_state \
|NN char *p|const bool recognize_comment
Ei |void |alloc_maybe_populate_EXACT|NN RExC_state_t *pRExC_state \
|NN regnode *node|NN I32 *flagp|STRLEN len \
|UV code_point|bool downgradable
Ei |U8 |compute_EXACTish|NN RExC_state_t *pRExC_state
Es |char * |nextchar |NN RExC_state_t *pRExC_state
-Es |bool |reg_skipcomment|NN RExC_state_t *pRExC_state
+Ei |char * |reg_skipcomment|NN RExC_state_t *pRExC_state|NN char * p
Es |void |scan_commit |NN const RExC_state_t *pRExC_state \
|NN struct scan_data_t *data \
|NN SSize_t *minlenp \
diff --git a/embed.h b/embed.h
index 68e00ea59a..6ef726602d 100644
--- a/embed.h
+++ b/embed.h
@@ -940,18 +940,17 @@
#define reg_node(a,b) S_reg_node(aTHX_ a,b)
#define reg_recode(a,b) S_reg_recode(aTHX_ a,b)
#define reg_scan_name(a,b) S_reg_scan_name(aTHX_ a,b)
-#define reg_skipcomment(a) S_reg_skipcomment(aTHX_ a)
+#define reg_skipcomment(a,b) S_reg_skipcomment(aTHX_ a,b)
#define reganode(a,b,c) S_reganode(aTHX_ a,b,c)
#define regatom(a,b,c) S_regatom(aTHX_ a,b,c)
#define regbranch(a,b,c,d) S_regbranch(aTHX_ a,b,c,d)
#define regclass(a,b,c,d,e,f,g) S_regclass(aTHX_ a,b,c,d,e,f,g)
#define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d)
-#define regpatws S_regpatws
+#define regpatws(a,b,c) S_regpatws(aTHX_ a,b,c)
#define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c)
#define regpposixcc(a,b,c) S_regpposixcc(aTHX_ a,b,c)
#define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d)
#define reguni(a,b,c) S_reguni(aTHX_ a,b,c)
-#define regwhite S_regwhite
#define scan_commit(a,b,c,d) S_scan_commit(aTHX_ a,b,c,d)
#define set_ANYOF_arg(a,b,c,d,e,f,g) S_set_ANYOF_arg(aTHX_ a,b,c,d,e,f,g)
#define ssc_add_range(a,b,c) S_ssc_add_range(aTHX_ a,b,c)
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index c110c9718a..359910b782 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -27,6 +27,20 @@ here, but most should go in the L</Performance Enhancements> section.
[ List each enhancement as a =head2 entry ]
+=head2 C<qr/foo/x> now ignores any Unicode pattern white space
+
+The C</x> regular expression modifier allows the pattern to contain
+white space and comments, both of which are ignored, for improved
+readability. Until now, not all the white space characters that Unicode
+designates for this purpose were handled. The additional ones now
+recognized are
+U+0085 NEXT LINE,
+U+200E LEFT-TO-RIGHT MARK,
+U+200F RIGHT-TO-LEFT MARK,
+U+2028 LINE SEPARATOR,
+and
+U+2029 PARAGRAPH SEPARATOR.
+
=head1 Security
XXX Any security-related notices go here. In particular, any security
@@ -52,6 +66,24 @@ now a fatal compilation error.
These had been deprecated since v5.18.
+=head2 5 additional characters are treated as white space under C</x> in
+regex patterns (unless escaped)
+
+The use of these characters with C</x> outside bracketed character
+classes and when not preceeded by a backslash has raised a deprecation
+warning since v5.18. Now they will be ignored. See L</qrE<sol>fooE<sol>x>
+for the list of the five characters.
+
+=head2 Comment lines within S<C<(?[ ])>> now are ended only by a C<\n>
+
+S<C<(?[ ])>> is an experimental feature, introduced in v5.18. It operates
+as if C</x> is always enabled. But there was a difference, comment
+lines (following a C<#> character) were terminated by anything matching
+C<\R> which includes all vertical whitespace, such as form feeds. For
+consistency, this is now changed to match what terminates comment lines
+outside S<C<(?[ ])>>, namely a C<\n> (even if escaped), which is the
+same as what terminates a heredoc string and formats.
+
=head1 Deprecations
XXX Any deprecated features, syntax, modules etc. should be listed here.
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 0f23480f2e..6c02d65e37 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1797,23 +1797,6 @@ single form when it must operate on them directly. Either you've passed
an invalid file specification to Perl, or you've found a case the
conversion routines don't handle. Drat.
-=item Escape literal pattern white space under /x
-
-(D deprecated) You compiled a regular expression pattern with C</x> to
-ignore white space, and you used, as a literal, one of the characters
-that Perl plans to eventually treat as white space. The character must
-be escaped somehow, or it will work differently on a future Perl that
-does treat it as white space. The easiest way is to insert a backslash
-immediately before it, or to enclose it with square brackets. This
-change is to bring Perl into conformance with Unicode recommendations.
-Here are the five characters that generate this warning:
-U+0085 NEXT LINE,
-U+200E LEFT-TO-RIGHT MARK,
-U+200F RIGHT-TO-LEFT MARK,
-U+2028 LINE SEPARATOR,
-and
-U+2029 PARAGRAPH SEPARATOR.
-
=item Eval-group in insecure regular expression
(F) Perl detected tainted data when trying to compile a regular
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 5fffed4a39..3f76210bc8 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -181,6 +181,21 @@ in C<\p{...}> there can be spaces that follow the Unicode rules, for which see
L<perluniprops/Properties accessible through \p{} and \P{}>.
X</x>
+The set of characters that are deemed whitespace are those that Unicode
+calls "Pattern White Space", namely:
+
+ U+0009 CHARACTER TABULATION
+ U+000A LINE FEED
+ U+000B LINE TABULATION
+ U+000C FORM FEED
+ U+000D CARRIAGE RETURN
+ U+0020 SPACE
+ U+0085 NEXT LINE
+ U+200E LEFT-TO-RIGHT MARK
+ U+200F RIGHT-TO-LEFT MARK
+ U+2028 LINE SEPARATOR
+ U+2029 PARAGRAPH SEPARATOR
+
=head3 Character set modifiers
C</d>, C</u>, C</a>, and C</l>, available starting in 5.14, are called
diff --git a/proto.h b/proto.h
index a8a0e5a97b..7a71622edf 100644
--- a/proto.h
+++ b/proto.h
@@ -6842,10 +6842,11 @@ STATIC SV * S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
#define PERL_ARGS_ASSERT_REG_SCAN_NAME \
assert(pRExC_state)
-STATIC bool S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
- __attribute__nonnull__(pTHX_1);
+PERL_STATIC_INLINE char * S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char * p)
+ __attribute__nonnull__(pTHX_1)
+ __attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT_REG_SKIPCOMMENT \
- assert(pRExC_state)
+ assert(pRExC_state); assert(p)
STATIC regnode* S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
__attribute__nonnull__(pTHX_1);
@@ -6876,10 +6877,10 @@ STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U
#define PERL_ARGS_ASSERT_REGINSERT \
assert(pRExC_state); assert(opnd)
-STATIC char * S_regpatws(RExC_state_t *pRExC_state, char *p, const bool recognize_comment)
+STATIC char * S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p, const bool recognize_comment)
__attribute__warn_unused_result__
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
+ __attribute__nonnull__(pTHX_1)
+ __attribute__nonnull__(pTHX_2);
#define PERL_ARGS_ASSERT_REGPATWS \
assert(pRExC_state); assert(p)
@@ -6907,13 +6908,6 @@ PERL_STATIC_INLINE STRLEN S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv,
#define PERL_ARGS_ASSERT_REGUNI \
assert(pRExC_state); assert(s)
-STATIC char * S_regwhite(RExC_state_t *pRExC_state, char *p)
- __attribute__warn_unused_result__
- __attribute__nonnull__(1)
- __attribute__nonnull__(2);
-#define PERL_ARGS_ASSERT_REGWHITE \
- assert(pRExC_state); assert(p)
-
STATIC void S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, struct scan_data_t *data, SSize_t *minlenp, int is_inf)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2)
diff --git a/regcharclass.h b/regcharclass.h
index 5e34ec0d91..5b04492b0f 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -736,18 +736,6 @@
: 0 )
/*** GENERATED CODE ***/
-#define is_PATWS_non_low_safe(s,e,is_utf8) \
-( ((e) > (s)) ? \
- ( (! is_utf8) ? \
- ( 0x85 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) \
- : (((e) - (s)) >= UTF8SKIP(s)) ? \
- ( ( 0xC2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \
- ( ( 0x85 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \
- : ( ( ( 0xE2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) && ( 0x80 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ) && ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == 0x8E || ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == 0xA8 ) ) ? 3 : 0 )\
- : 0 ) \
-: 0 )
-
-/*** GENERATED CODE ***/
#define is_PATWS_cp(cp) \
( ( 0x09 <= NATIVE_TO_UNI(cp) && NATIVE_TO_UNI(cp) <= 0x0D ) || ( 0x0D < NATIVE_TO_UNI(cp) &&\
( 0x20 == NATIVE_TO_UNI(cp) || ( 0x20 < NATIVE_TO_UNI(cp) && \
diff --git a/regcomp.c b/regcomp.c
index badff4af8c..4cd50ee2ff 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10676,7 +10676,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
* modifier. The other meaning does not, so use a temporary until we find
* out which we are being called with */
p = (RExC_flags & RXf_PMf_EXTENDED)
- ? regwhite( pRExC_state, RExC_parse )
+ ? regpatws(pRExC_state, RExC_parse,
+ TRUE) /* means recognize comments */
: RExC_parse;
/* Disambiguate between \N meaning a named character versus \N meaning
@@ -11640,7 +11641,8 @@ tryagain:
case '#':
if (RExC_flags & RXf_PMf_EXTENDED) {
- if ( reg_skipcomment( pRExC_state ) )
+ RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
+ if (RExC_parse < RExC_end)
goto tryagain;
}
/* FALLTHROUGH */
@@ -11721,7 +11723,8 @@ tryagain:
oldp = p;
if (RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite( pRExC_state, p );
+ p = regpatws(pRExC_state, p,
+ TRUE); /* means recognize comments */
switch ((U8)*p) {
case '^':
case '$':
@@ -11949,15 +11952,6 @@ tryagain:
break;
default: /* A literal character */
- if (! SIZE_ONLY
- && RExC_flags & RXf_PMf_EXTENDED
- && ckWARN_d(WARN_DEPRECATED)
- && is_PATWS_non_low_safe(p, RExC_end, UTF))
- {
- vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
- "Escape literal pattern white space under /x");
- }
-
normal_default:
if (UTF8_IS_START(*p) && UTF) {
STRLEN numlen;
@@ -11975,7 +11969,8 @@ tryagain:
*/
if ( RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite( pRExC_state, p );
+ p = regpatws(pRExC_state, p,
+ TRUE); /* means recognize comments */
/* If the next thing is a quantifier, it applies to this
* character only, which means that this character has to be in
@@ -12330,39 +12325,11 @@ tryagain:
}
STATIC char *
-S_regwhite( RExC_state_t *pRExC_state, char *p )
-{
- const char *e = RExC_end;
-
- PERL_ARGS_ASSERT_REGWHITE;
-
- while (p < e) {
- if (isSPACE(*p))
- ++p;
- else if (*p == '#') {
- bool ended = 0;
- do {
- if (*p++ == '\n') {
- ended = 1;
- break;
- }
- } while (p < e);
- if (!ended)
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
- }
- else
- break;
- }
- return p;
-}
-
-STATIC char *
-S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
+S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
{
/* Returns the next non-pattern-white space, non-comment character (the
* latter only if 'recognize_comment is true) in the string p, which is
- * ended by RExC_end. If there is no line break ending a comment,
- * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
+ * ended by RExC_end. See also reg_skipcomment */
const char *e = RExC_end;
PERL_ARGS_ASSERT_REGPATWS;
@@ -12373,16 +12340,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
p += len;
}
else if (recognize_comment && *p == '#') {
- bool ended = 0;
- do {
- p++;
- if (is_LNBREAK_safe(p, e, UTF)) {
- ended = 1;
- break;
- }
- } while (p < e);
- if (!ended)
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+ p = reg_skipcomment(pRExC_state, p);
}
else
break;
@@ -12710,7 +12668,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
while (RExC_parse < RExC_end) {
SV* current = NULL;
RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+ TRUE); /* means recognize comments */
switch (*RExC_parse) {
case '?':
if (RExC_parse[1] == '[') depth++, RExC_parse++;
@@ -12827,7 +12785,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
/* Skip white space */
RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+ TRUE /* means recognize comments */ );
if (RExC_parse >= RExC_end) {
Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
}
@@ -13299,7 +13257,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
@@ -13309,7 +13267,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
RExC_naughty++;
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
}
@@ -13347,7 +13305,7 @@ parseit:
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (UCHARAT(RExC_parse) == ']') {
@@ -13858,7 +13816,7 @@ parseit:
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (range) {
@@ -14955,35 +14913,34 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
/* reg_skipcomment()
- Absorbs an /x style # comments from the input stream.
- Returns true if there is more text remaining in the stream.
- Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
- terminates the pattern without including a newline.
+ Absorbs an /x style # comment from the input stream,
+ returning a pointer to the first character beyond the comment, or if the
+ comment terminates the pattern without anything following it, this returns
+ one past the final character of the pattern (in other words, RExC_end) and
+ sets the REG_RUN_ON_COMMENT_SEEN flag.
- Note its the callers responsibility to ensure that we are
+ Note it's the callers responsibility to ensure that we are
actually in /x mode
*/
-STATIC bool
-S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
+PERL_STATIC_INLINE char*
+S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char* p)
{
- bool ended = 0;
-
PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
- while (RExC_parse < RExC_end)
- if (*RExC_parse++ == '\n') {
- ended = 1;
- break;
+ assert(*p = '#');
+
+ while (p < RExC_end) {
+ if (*(++p) == '\n') {
+ return p+1;
}
- if (!ended) {
- /* we ran off the end of the pattern without ending
- the comment, so we have to add an \n when wrapping */
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
- return 0;
- } else
- return 1;
+ }
+
+ /* we ran off the end of the pattern without ending the comment, so we have
+ * to add an \n when wrapping */
+ RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+ return p;
}
/* nextchar()
@@ -15021,16 +14978,14 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
continue;
}
if (RExC_flags & RXf_PMf_EXTENDED) {
- if (isSPACE(*RExC_parse)) {
- RExC_parse++;
- continue;
- }
- else if (*RExC_parse == '#') {
- if ( reg_skipcomment( pRExC_state ) )
- continue;
- }
+ char * p = regpatws(pRExC_state, RExC_parse,
+ TRUE); /* means recognize comments */
+ if (p != RExC_parse) {
+ RExC_parse = p;
+ continue;
+ }
}
- return retval;
+ return retval;
}
}
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index 187023aa67..b837af4931 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -1652,5 +1652,5 @@ PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are
\p{_Perl_Problematic_Locale_Foldeds_Start}
PATWS: pattern white space
-=> generic generic_non_low cp : safe
+=> generic cp : safe
\p{PatWS}
diff --git a/t/re/pat.t b/t/re/pat.t
index c6e7f967b1..71cfeaaee4 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -20,7 +20,7 @@ BEGIN {
require './test.pl';
}
-plan tests => 732; # Update this when adding/deleting tests.
+plan tests => 733; # Update this when adding/deleting tests.
run_tests() unless caller;
@@ -1599,6 +1599,11 @@ EOP
ok("\x{101}" =~ /\x{101}/i, "A WITH MACRON l =~ l");
}
+ {
+ use utf8;
+ ok("abc" =~ /a…b…c/x, "NEL is white-space under /x");
+ }
+
} # End of sub run_tests
1;
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index 529708a2d9..857eba2ac6 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -416,8 +416,6 @@ my @experimental_regex_sets = (
);
my @deprecated = (
- "/(?x)latin1\\\x{85}\x{85}\\\x{85}/" => 'Escape literal pattern white space under /x {#} ' . "m/(?x)latin1\\\x{85}\x{85}{#}\\\x{85}/",
- 'use utf8; /(?x)utf8\……\…/' => 'Escape literal pattern white space under /x {#} ' . "m/(?x)utf8\\\N{NEXT LINE}\N{NEXT LINE}{#}\\\N{NEXT LINE}/",
);
while (my ($regex, $expect) = splice @death, 0, 2) {