diff options
-rw-r--r-- | embed.fnc | 6 | ||||
-rw-r--r-- | embed.h | 5 | ||||
-rw-r--r-- | pod/perldelta.pod | 32 | ||||
-rw-r--r-- | pod/perldiag.pod | 17 | ||||
-rw-r--r-- | pod/perlre.pod | 15 | ||||
-rw-r--r-- | proto.h | 20 | ||||
-rw-r--r-- | regcharclass.h | 12 | ||||
-rw-r--r-- | regcomp.c | 131 | ||||
-rwxr-xr-x | regen/regcharclass.pl | 2 | ||||
-rw-r--r-- | t/re/pat.t | 7 | ||||
-rw-r--r-- | t/re/reg_mesg.t | 2 |
11 files changed, 108 insertions, 141 deletions
@@ -2093,16 +2093,14 @@ Es |U32 |join_exact |NN RExC_state_t *pRExC_state \ |NN regnode *scan|NN UV *min_subtract \ |NN bool *unfolded_multi_char \ |U32 flags|NULLOK regnode *val|U32 depth -EsRn |char * |regwhite |NN RExC_state_t *pRExC_state \ - |NN char *p -EsRn |char * |regpatws |NN RExC_state_t *pRExC_state \ +EsR |char * |regpatws |NN RExC_state_t *pRExC_state \ |NN char *p|const bool recognize_comment Ei |void |alloc_maybe_populate_EXACT|NN RExC_state_t *pRExC_state \ |NN regnode *node|NN I32 *flagp|STRLEN len \ |UV code_point|bool downgradable Ei |U8 |compute_EXACTish|NN RExC_state_t *pRExC_state Es |char * |nextchar |NN RExC_state_t *pRExC_state -Es |bool |reg_skipcomment|NN RExC_state_t *pRExC_state +Ei |char * |reg_skipcomment|NN RExC_state_t *pRExC_state|NN char * p Es |void |scan_commit |NN const RExC_state_t *pRExC_state \ |NN struct scan_data_t *data \ |NN SSize_t *minlenp \ @@ -940,18 +940,17 @@ #define reg_node(a,b) S_reg_node(aTHX_ a,b) #define reg_recode(a,b) S_reg_recode(aTHX_ a,b) #define reg_scan_name(a,b) S_reg_scan_name(aTHX_ a,b) -#define reg_skipcomment(a) S_reg_skipcomment(aTHX_ a) +#define reg_skipcomment(a,b) S_reg_skipcomment(aTHX_ a,b) #define reganode(a,b,c) S_reganode(aTHX_ a,b,c) #define regatom(a,b,c) S_regatom(aTHX_ a,b,c) #define regbranch(a,b,c,d) S_regbranch(aTHX_ a,b,c,d) #define regclass(a,b,c,d,e,f,g) S_regclass(aTHX_ a,b,c,d,e,f,g) #define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d) -#define regpatws S_regpatws +#define regpatws(a,b,c) S_regpatws(aTHX_ a,b,c) #define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c) #define regpposixcc(a,b,c) S_regpposixcc(aTHX_ a,b,c) #define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d) #define reguni(a,b,c) S_reguni(aTHX_ a,b,c) -#define regwhite S_regwhite #define scan_commit(a,b,c,d) S_scan_commit(aTHX_ a,b,c,d) #define set_ANYOF_arg(a,b,c,d,e,f,g) S_set_ANYOF_arg(aTHX_ a,b,c,d,e,f,g) #define ssc_add_range(a,b,c) S_ssc_add_range(aTHX_ a,b,c) diff --git a/pod/perldelta.pod b/pod/perldelta.pod index c110c9718a..359910b782 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -27,6 +27,20 @@ here, but most should go in the L</Performance Enhancements> section. [ List each enhancement as a =head2 entry ] +=head2 C<qr/foo/x> now ignores any Unicode pattern white space + +The C</x> regular expression modifier allows the pattern to contain +white space and comments, both of which are ignored, for improved +readability. Until now, not all the white space characters that Unicode +designates for this purpose were handled. The additional ones now +recognized are +U+0085 NEXT LINE, +U+200E LEFT-TO-RIGHT MARK, +U+200F RIGHT-TO-LEFT MARK, +U+2028 LINE SEPARATOR, +and +U+2029 PARAGRAPH SEPARATOR. + =head1 Security XXX Any security-related notices go here. In particular, any security @@ -52,6 +66,24 @@ now a fatal compilation error. These had been deprecated since v5.18. +=head2 5 additional characters are treated as white space under C</x> in +regex patterns (unless escaped) + +The use of these characters with C</x> outside bracketed character +classes and when not preceeded by a backslash has raised a deprecation +warning since v5.18. Now they will be ignored. See L</qrE<sol>fooE<sol>x> +for the list of the five characters. + +=head2 Comment lines within S<C<(?[ ])>> now are ended only by a C<\n> + +S<C<(?[ ])>> is an experimental feature, introduced in v5.18. It operates +as if C</x> is always enabled. But there was a difference, comment +lines (following a C<#> character) were terminated by anything matching +C<\R> which includes all vertical whitespace, such as form feeds. For +consistency, this is now changed to match what terminates comment lines +outside S<C<(?[ ])>>, namely a C<\n> (even if escaped), which is the +same as what terminates a heredoc string and formats. + =head1 Deprecations XXX Any deprecated features, syntax, modules etc. should be listed here. diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 0f23480f2e..6c02d65e37 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1797,23 +1797,6 @@ single form when it must operate on them directly. Either you've passed an invalid file specification to Perl, or you've found a case the conversion routines don't handle. Drat. -=item Escape literal pattern white space under /x - -(D deprecated) You compiled a regular expression pattern with C</x> to -ignore white space, and you used, as a literal, one of the characters -that Perl plans to eventually treat as white space. The character must -be escaped somehow, or it will work differently on a future Perl that -does treat it as white space. The easiest way is to insert a backslash -immediately before it, or to enclose it with square brackets. This -change is to bring Perl into conformance with Unicode recommendations. -Here are the five characters that generate this warning: -U+0085 NEXT LINE, -U+200E LEFT-TO-RIGHT MARK, -U+200F RIGHT-TO-LEFT MARK, -U+2028 LINE SEPARATOR, -and -U+2029 PARAGRAPH SEPARATOR. - =item Eval-group in insecure regular expression (F) Perl detected tainted data when trying to compile a regular diff --git a/pod/perlre.pod b/pod/perlre.pod index 5fffed4a39..3f76210bc8 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -181,6 +181,21 @@ in C<\p{...}> there can be spaces that follow the Unicode rules, for which see L<perluniprops/Properties accessible through \p{} and \P{}>. X</x> +The set of characters that are deemed whitespace are those that Unicode +calls "Pattern White Space", namely: + + U+0009 CHARACTER TABULATION + U+000A LINE FEED + U+000B LINE TABULATION + U+000C FORM FEED + U+000D CARRIAGE RETURN + U+0020 SPACE + U+0085 NEXT LINE + U+200E LEFT-TO-RIGHT MARK + U+200F RIGHT-TO-LEFT MARK + U+2028 LINE SEPARATOR + U+2029 PARAGRAPH SEPARATOR + =head3 Character set modifiers C</d>, C</u>, C</a>, and C</l>, available starting in 5.14, are called @@ -6842,10 +6842,11 @@ STATIC SV * S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) #define PERL_ARGS_ASSERT_REG_SCAN_NAME \ assert(pRExC_state) -STATIC bool S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) - __attribute__nonnull__(pTHX_1); +PERL_STATIC_INLINE char * S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char * p) + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_2); #define PERL_ARGS_ASSERT_REG_SKIPCOMMENT \ - assert(pRExC_state) + assert(pRExC_state); assert(p) STATIC regnode* S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) __attribute__nonnull__(pTHX_1); @@ -6876,10 +6877,10 @@ STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U #define PERL_ARGS_ASSERT_REGINSERT \ assert(pRExC_state); assert(opnd) -STATIC char * S_regpatws(RExC_state_t *pRExC_state, char *p, const bool recognize_comment) +STATIC char * S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p, const bool recognize_comment) __attribute__warn_unused_result__ - __attribute__nonnull__(1) - __attribute__nonnull__(2); + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_2); #define PERL_ARGS_ASSERT_REGPATWS \ assert(pRExC_state); assert(p) @@ -6907,13 +6908,6 @@ PERL_STATIC_INLINE STRLEN S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, #define PERL_ARGS_ASSERT_REGUNI \ assert(pRExC_state); assert(s) -STATIC char * S_regwhite(RExC_state_t *pRExC_state, char *p) - __attribute__warn_unused_result__ - __attribute__nonnull__(1) - __attribute__nonnull__(2); -#define PERL_ARGS_ASSERT_REGWHITE \ - assert(pRExC_state); assert(p) - STATIC void S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, struct scan_data_t *data, SSize_t *minlenp, int is_inf) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2) diff --git a/regcharclass.h b/regcharclass.h index 5e34ec0d91..5b04492b0f 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -736,18 +736,6 @@ : 0 ) /*** GENERATED CODE ***/ -#define is_PATWS_non_low_safe(s,e,is_utf8) \ -( ((e) > (s)) ? \ - ( (! is_utf8) ? \ - ( 0x85 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) \ - : (((e) - (s)) >= UTF8SKIP(s)) ? \ - ( ( 0xC2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ - ( ( 0x85 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \ - : ( ( ( 0xE2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) && ( 0x80 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ) && ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == 0x8E || ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == 0xA8 ) ) ? 3 : 0 )\ - : 0 ) \ -: 0 ) - -/*** GENERATED CODE ***/ #define is_PATWS_cp(cp) \ ( ( 0x09 <= NATIVE_TO_UNI(cp) && NATIVE_TO_UNI(cp) <= 0x0D ) || ( 0x0D < NATIVE_TO_UNI(cp) &&\ ( 0x20 == NATIVE_TO_UNI(cp) || ( 0x20 < NATIVE_TO_UNI(cp) && \ @@ -10676,7 +10676,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, * modifier. The other meaning does not, so use a temporary until we find * out which we are being called with */ p = (RExC_flags & RXf_PMf_EXTENDED) - ? regwhite( pRExC_state, RExC_parse ) + ? regpatws(pRExC_state, RExC_parse, + TRUE) /* means recognize comments */ : RExC_parse; /* Disambiguate between \N meaning a named character versus \N meaning @@ -11640,7 +11641,8 @@ tryagain: case '#': if (RExC_flags & RXf_PMf_EXTENDED) { - if ( reg_skipcomment( pRExC_state ) ) + RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); + if (RExC_parse < RExC_end) goto tryagain; } /* FALLTHROUGH */ @@ -11721,7 +11723,8 @@ tryagain: oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite( pRExC_state, p ); + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ switch ((U8)*p) { case '^': case '$': @@ -11949,15 +11952,6 @@ tryagain: break; default: /* A literal character */ - if (! SIZE_ONLY - && RExC_flags & RXf_PMf_EXTENDED - && ckWARN_d(WARN_DEPRECATED) - && is_PATWS_non_low_safe(p, RExC_end, UTF)) - { - vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), - "Escape literal pattern white space under /x"); - } - normal_default: if (UTF8_IS_START(*p) && UTF) { STRLEN numlen; @@ -11975,7 +11969,8 @@ tryagain: */ if ( RExC_flags & RXf_PMf_EXTENDED) - p = regwhite( pRExC_state, p ); + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ /* If the next thing is a quantifier, it applies to this * character only, which means that this character has to be in @@ -12330,39 +12325,11 @@ tryagain: } STATIC char * -S_regwhite( RExC_state_t *pRExC_state, char *p ) -{ - const char *e = RExC_end; - - PERL_ARGS_ASSERT_REGWHITE; - - while (p < e) { - if (isSPACE(*p)) - ++p; - else if (*p == '#') { - bool ended = 0; - do { - if (*p++ == '\n') { - ended = 1; - break; - } - } while (p < e); - if (!ended) - RExC_seen |= REG_RUN_ON_COMMENT_SEEN; - } - else - break; - } - return p; -} - -STATIC char * -S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) +S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) { /* Returns the next non-pattern-white space, non-comment character (the * latter only if 'recognize_comment is true) in the string p, which is - * ended by RExC_end. If there is no line break ending a comment, - * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */ + * ended by RExC_end. See also reg_skipcomment */ const char *e = RExC_end; PERL_ARGS_ASSERT_REGPATWS; @@ -12373,16 +12340,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) p += len; } else if (recognize_comment && *p == '#') { - bool ended = 0; - do { - p++; - if (is_LNBREAK_safe(p, e, UTF)) { - ended = 1; - break; - } - } while (p < e); - if (!ended) - RExC_seen |= REG_RUN_ON_COMMENT_SEEN; + p = reg_skipcomment(pRExC_state, p); } else break; @@ -12710,7 +12668,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, while (RExC_parse < RExC_end) { SV* current = NULL; RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + TRUE); /* means recognize comments */ switch (*RExC_parse) { case '?': if (RExC_parse[1] == '[') depth++, RExC_parse++; @@ -12827,7 +12785,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, /* Skip white space */ RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + TRUE /* means recognize comments */ ); if (RExC_parse >= RExC_end) { Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); } @@ -13299,7 +13257,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ @@ -13309,7 +13267,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_naughty++; if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } } @@ -13347,7 +13305,7 @@ parseit: if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (UCHARAT(RExC_parse) == ']') { @@ -13858,7 +13816,7 @@ parseit: if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (range) { @@ -14955,35 +14913,34 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, /* reg_skipcomment() - Absorbs an /x style # comments from the input stream. - Returns true if there is more text remaining in the stream. - Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment - terminates the pattern without including a newline. + Absorbs an /x style # comment from the input stream, + returning a pointer to the first character beyond the comment, or if the + comment terminates the pattern without anything following it, this returns + one past the final character of the pattern (in other words, RExC_end) and + sets the REG_RUN_ON_COMMENT_SEEN flag. - Note its the callers responsibility to ensure that we are + Note it's the callers responsibility to ensure that we are actually in /x mode */ -STATIC bool -S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) +PERL_STATIC_INLINE char* +S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char* p) { - bool ended = 0; - PERL_ARGS_ASSERT_REG_SKIPCOMMENT; - while (RExC_parse < RExC_end) - if (*RExC_parse++ == '\n') { - ended = 1; - break; + assert(*p = '#'); + + while (p < RExC_end) { + if (*(++p) == '\n') { + return p+1; } - if (!ended) { - /* we ran off the end of the pattern without ending - the comment, so we have to add an \n when wrapping */ - RExC_seen |= REG_RUN_ON_COMMENT_SEEN; - return 0; - } else - return 1; + } + + /* we ran off the end of the pattern without ending the comment, so we have + * to add an \n when wrapping */ + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; + return p; } /* nextchar() @@ -15021,16 +14978,14 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) continue; } if (RExC_flags & RXf_PMf_EXTENDED) { - if (isSPACE(*RExC_parse)) { - RExC_parse++; - continue; - } - else if (*RExC_parse == '#') { - if ( reg_skipcomment( pRExC_state ) ) - continue; - } + char * p = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + if (p != RExC_parse) { + RExC_parse = p; + continue; + } } - return retval; + return retval; } } diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 187023aa67..b837af4931 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1652,5 +1652,5 @@ PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are \p{_Perl_Problematic_Locale_Foldeds_Start} PATWS: pattern white space -=> generic generic_non_low cp : safe +=> generic cp : safe \p{PatWS} diff --git a/t/re/pat.t b/t/re/pat.t index c6e7f967b1..71cfeaaee4 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -20,7 +20,7 @@ BEGIN { require './test.pl'; } -plan tests => 732; # Update this when adding/deleting tests. +plan tests => 733; # Update this when adding/deleting tests. run_tests() unless caller; @@ -1599,6 +1599,11 @@ EOP ok("\x{101}" =~ /\x{101}/i, "A WITH MACRON l =~ l"); } + { + use utf8; + ok("abc" =~ /a
b
c/x, "NEL is white-space under /x"); + } + } # End of sub run_tests 1; diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index 529708a2d9..857eba2ac6 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -416,8 +416,6 @@ my @experimental_regex_sets = ( ); my @deprecated = ( - "/(?x)latin1\\\x{85}\x{85}\\\x{85}/" => 'Escape literal pattern white space under /x {#} ' . "m/(?x)latin1\\\x{85}\x{85}{#}\\\x{85}/", - 'use utf8; /(?x)utf8\
\
/' => 'Escape literal pattern white space under /x {#} ' . "m/(?x)utf8\\\N{NEXT LINE}\N{NEXT LINE}{#}\\\N{NEXT LINE}/", ); while (my ($regex, $expect) = splice @death, 0, 2) { |