11 files changed, 108 insertions, 141 deletions
diff --git a/embed.fnc b/embed.fnc
index 6fe5daa43f..478e7485cc 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2093,16 +2093,14 @@ Es	|U32	|join_exact	|NN RExC_state_t *pRExC_state \
 				|NN regnode *scan|NN UV *min_subtract  \
 				|NN bool *unfolded_multi_char          \
 				|U32 flags|NULLOK regnode *val|U32 depth
-EsRn	|char *	|regwhite	|NN RExC_state_t *pRExC_state \
-				|NN char *p
-EsRn	|char *	|regpatws	|NN RExC_state_t *pRExC_state \
+EsR	|char *	|regpatws	|NN RExC_state_t *pRExC_state \
 				|NN char *p|const bool recognize_comment
 Ei	|void   |alloc_maybe_populate_EXACT|NN RExC_state_t *pRExC_state \
 				|NN regnode *node|NN I32 *flagp|STRLEN len \
 				|UV code_point|bool downgradable
 Ei	|U8   |compute_EXACTish|NN RExC_state_t *pRExC_state
 Es	|char *	|nextchar	|NN RExC_state_t *pRExC_state
-Es	|bool	|reg_skipcomment|NN RExC_state_t *pRExC_state
+Ei	|char *	|reg_skipcomment|NN RExC_state_t *pRExC_state|NN char * p
 Es	|void	|scan_commit	|NN const RExC_state_t *pRExC_state \
 				|NN struct scan_data_t *data        \
 				|NN SSize_t *minlenp		    \
diff --git a/embed.h b/embed.h
index 68e00ea59a..6ef726602d 100644
--- a/embed.h
+++ b/embed.h
@@ -940,18 +940,17 @@
 #define reg_node(a,b)		S_reg_node(aTHX_ a,b)
 #define reg_recode(a,b)		S_reg_recode(aTHX_ a,b)
 #define reg_scan_name(a,b)	S_reg_scan_name(aTHX_ a,b)
-#define reg_skipcomment(a)	S_reg_skipcomment(aTHX_ a)
+#define reg_skipcomment(a,b)	S_reg_skipcomment(aTHX_ a,b)
 #define reganode(a,b,c)		S_reganode(aTHX_ a,b,c)
 #define regatom(a,b,c)		S_regatom(aTHX_ a,b,c)
 #define regbranch(a,b,c,d)	S_regbranch(aTHX_ a,b,c,d)
 #define regclass(a,b,c,d,e,f,g)	S_regclass(aTHX_ a,b,c,d,e,f,g)
 #define reginsert(a,b,c,d)	S_reginsert(aTHX_ a,b,c,d)
-#define regpatws		S_regpatws
+#define regpatws(a,b,c)		S_regpatws(aTHX_ a,b,c)
 #define regpiece(a,b,c)		S_regpiece(aTHX_ a,b,c)
 #define regpposixcc(a,b,c)	S_regpposixcc(aTHX_ a,b,c)
 #define regtail(a,b,c,d)	S_regtail(aTHX_ a,b,c,d)
 #define reguni(a,b,c)		S_reguni(aTHX_ a,b,c)
-#define regwhite		S_regwhite
 #define scan_commit(a,b,c,d)	S_scan_commit(aTHX_ a,b,c,d)
 #define set_ANYOF_arg(a,b,c,d,e,f,g)	S_set_ANYOF_arg(aTHX_ a,b,c,d,e,f,g)
 #define ssc_add_range(a,b,c)	S_ssc_add_range(aTHX_ a,b,c)
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index c110c9718a..359910b782 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -27,6 +27,20 @@ here, but most should go in the L</Performance Enhancements> section.
 
 [ List each enhancement as a =head2 entry ]
 
+=head2 C<qr/foo/x> now ignores any Unicode pattern white space
+
+The C</x> regular expression modifier allows the pattern to contain
+white space and comments, both of which are ignored, for improved
+readability.  Until now, not all the white space characters that Unicode
+designates for this purpose were handled.  The additional ones now
+recognized are
+U+0085 NEXT LINE,
+U+200E LEFT-TO-RIGHT MARK,
+U+200F RIGHT-TO-LEFT MARK,
+U+2028 LINE SEPARATOR,
+and
+U+2029 PARAGRAPH SEPARATOR.
+
 =head1 Security
 
 XXX Any security-related notices go here.  In particular, any security
@@ -52,6 +66,24 @@ now a fatal compilation error.
 
 These had been deprecated since v5.18.
 
+=head2 5 additional characters are treated as white space under C</x> in
+regex patterns (unless escaped)
+
+The use of these characters with C</x> outside bracketed character
+classes and when not preceeded by a backslash has raised a deprecation
+warning since v5.18.  Now they will be ignored.  See L</qrE<sol>fooE<sol>x>
+for the list of the five characters.
+
+=head2 Comment lines within S<C<(?[ ])>> now are ended only by a C<\n>
+
+S<C<(?[ ])>>  is an experimental feature, introduced in v5.18.  It operates
+as if C</x> is always enabled.  But there was a difference, comment
+lines (following a C<#> character) were terminated by anything matching
+C<\R> which includes all vertical whitespace, such as form feeds.  For
+consistency, this is now changed to match what terminates comment lines
+outside S<C<(?[ ])>>, namely a C<\n> (even if escaped), which is the
+same as what terminates a heredoc string and formats.
+
 =head1 Deprecations
 
 XXX Any deprecated features, syntax, modules etc. should be listed here.
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 0f23480f2e..6c02d65e37 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1797,23 +1797,6 @@ single form when it must operate on them directly.  Either you've passed
 an invalid file specification to Perl, or you've found a case the
 conversion routines don't handle.  Drat.
 
-=item Escape literal pattern white space under /x
-
-(D deprecated) You compiled a regular expression pattern with C</x> to
-ignore white space, and you used, as a literal, one of the characters
-that Perl plans to eventually treat as white space.  The character must
-be escaped somehow, or it will work differently on a future Perl that
-does treat it as white space.  The easiest way is to insert a backslash
-immediately before it, or to enclose it with square brackets.  This
-change is to bring Perl into conformance with Unicode recommendations.
-Here are the five characters that generate this warning:
-U+0085 NEXT LINE,
-U+200E LEFT-TO-RIGHT MARK,
-U+200F RIGHT-TO-LEFT MARK,
-U+2028 LINE SEPARATOR,
-and
-U+2029 PARAGRAPH SEPARATOR.
-
 =item Eval-group in insecure regular expression
 
 (F) Perl detected tainted data when trying to compile a regular
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 5fffed4a39..3f76210bc8 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -181,6 +181,21 @@ in C<\p{...}> there can be spaces that follow the Unicode rules, for which see
 L<perluniprops/Properties accessible through \p{} and \P{}>.
 X</x>
 
+The set of characters that are deemed whitespace are those that Unicode
+calls "Pattern White Space", namely:
+
+ U+0009 CHARACTER TABULATION
+ U+000A LINE FEED
+ U+000B LINE TABULATION
+ U+000C FORM FEED
+ U+000D CARRIAGE RETURN
+ U+0020 SPACE
+ U+0085 NEXT LINE
+ U+200E LEFT-TO-RIGHT MARK
+ U+200F RIGHT-TO-LEFT MARK
+ U+2028 LINE SEPARATOR
+ U+2029 PARAGRAPH SEPARATOR
+
 =head3 Character set modifiers
 
 C</d>, C</u>, C</a>, and C</l>, available starting in 5.14, are called
diff --git a/proto.h b/proto.h
index a8a0e5a97b..7a71622edf 100644
--- a/proto.h
+++ b/proto.h
@@ -6842,10 +6842,11 @@ STATIC SV *	S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
 #define PERL_ARGS_ASSERT_REG_SCAN_NAME	\
 	assert(pRExC_state)
 
-STATIC bool	S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
-			__attribute__nonnull__(pTHX_1);
+PERL_STATIC_INLINE char *	S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char * p)
+			__attribute__nonnull__(pTHX_1)
+			__attribute__nonnull__(pTHX_2);
 #define PERL_ARGS_ASSERT_REG_SKIPCOMMENT	\
-	assert(pRExC_state)
+	assert(pRExC_state); assert(p)
 
 STATIC regnode*	S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
 			__attribute__nonnull__(pTHX_1);
@@ -6876,10 +6877,10 @@ STATIC void	S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U
 #define PERL_ARGS_ASSERT_REGINSERT	\
 	assert(pRExC_state); assert(opnd)
 
-STATIC char *	S_regpatws(RExC_state_t *pRExC_state, char *p, const bool recognize_comment)
+STATIC char *	S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p, const bool recognize_comment)
 			__attribute__warn_unused_result__
-			__attribute__nonnull__(1)
-			__attribute__nonnull__(2);
+			__attribute__nonnull__(pTHX_1)
+			__attribute__nonnull__(pTHX_2);
 #define PERL_ARGS_ASSERT_REGPATWS	\
 	assert(pRExC_state); assert(p)
 
@@ -6907,13 +6908,6 @@ PERL_STATIC_INLINE STRLEN	S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv,
 #define PERL_ARGS_ASSERT_REGUNI	\
 	assert(pRExC_state); assert(s)
 
-STATIC char *	S_regwhite(RExC_state_t *pRExC_state, char *p)
-			__attribute__warn_unused_result__
-			__attribute__nonnull__(1)
-			__attribute__nonnull__(2);
-#define PERL_ARGS_ASSERT_REGWHITE	\
-	assert(pRExC_state); assert(p)
-
 STATIC void	S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, struct scan_data_t *data, SSize_t *minlenp, int is_inf)
 			__attribute__nonnull__(pTHX_1)
 			__attribute__nonnull__(pTHX_2)
diff --git a/regcharclass.h b/regcharclass.h
index 5e34ec0d91..5b04492b0f 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -736,18 +736,6 @@
 : 0 )
 
 /*** GENERATED CODE ***/
-#define is_PATWS_non_low_safe(s,e,is_utf8)                                  \
-( ((e) > (s)) ?                                                             \
-    ( (! is_utf8) ?                                                         \
-	( 0x85 == NATIVE_TO_LATIN1(((U8*)s)[0]) )                           \
-    : (((e) - (s)) >= UTF8SKIP(s)) ?                                        \
-	( ( 0xC2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ?                       \
-	    ( ( 0x85 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 )           \
-	: ( ( ( 0xE2 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) && ( 0x80 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ) && ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == 0x8E || ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFE ) == 0xA8 ) ) ? 3 : 0 )\
-    : 0 )                                                                   \
-: 0 )
-
-/*** GENERATED CODE ***/
 #define is_PATWS_cp(cp)                                                     \
 ( ( 0x09 <= NATIVE_TO_UNI(cp) && NATIVE_TO_UNI(cp) <= 0x0D ) || ( 0x0D < NATIVE_TO_UNI(cp) &&\
 ( 0x20 == NATIVE_TO_UNI(cp) || ( 0x20 < NATIVE_TO_UNI(cp) &&                \
diff --git a/regcomp.c b/regcomp.c
index badff4af8c..4cd50ee2ff 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10676,7 +10676,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
      * modifier.  The other meaning does not, so use a temporary until we find
      * out which we are being called with */
     p = (RExC_flags & RXf_PMf_EXTENDED)
-	? regwhite( pRExC_state, RExC_parse )
+	? regpatws(pRExC_state, RExC_parse,
+                                TRUE) /* means recognize comments */
 	: RExC_parse;
 
     /* Disambiguate between \N meaning a named character versus \N meaning
@@ -11640,7 +11641,8 @@ tryagain:
 
     case '#':
 	if (RExC_flags & RXf_PMf_EXTENDED) {
-	    if ( reg_skipcomment( pRExC_state ) )
+	    RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
+	    if (RExC_parse < RExC_end)
 		goto tryagain;
 	}
 	/* FALLTHROUGH */
@@ -11721,7 +11723,8 @@ tryagain:
 		oldp = p;
 
 		if (RExC_flags & RXf_PMf_EXTENDED)
-		    p = regwhite( pRExC_state, p );
+                    p = regpatws(pRExC_state, p,
+                                          TRUE); /* means recognize comments */
 		switch ((U8)*p) {
 		case '^':
 		case '$':
@@ -11949,15 +11952,6 @@ tryagain:
 		    break;
 		default:    /* A literal character */
 
-                    if (! SIZE_ONLY
-                        && RExC_flags & RXf_PMf_EXTENDED
-                        && ckWARN_d(WARN_DEPRECATED)
-                        && is_PATWS_non_low_safe(p, RExC_end, UTF))
-                    {
-                        vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
-                                "Escape literal pattern white space under /x");
-                    }
-
 		  normal_default:
 		    if (UTF8_IS_START(*p) && UTF) {
 			STRLEN numlen;
@@ -11975,7 +11969,8 @@ tryagain:
 		 */
 
 		if ( RExC_flags & RXf_PMf_EXTENDED)
-		    p = regwhite( pRExC_state, p );
+                    p = regpatws(pRExC_state, p,
+                                          TRUE); /* means recognize comments */
 
                 /* If the next thing is a quantifier, it applies to this
                  * character only, which means that this character has to be in
@@ -12330,39 +12325,11 @@ tryagain:
 }
 
 STATIC char *
-S_regwhite( RExC_state_t *pRExC_state, char *p )
-{
-    const char *e = RExC_end;
-
-    PERL_ARGS_ASSERT_REGWHITE;
-
-    while (p < e) {
-	if (isSPACE(*p))
-	    ++p;
-	else if (*p == '#') {
-            bool ended = 0;
-	    do {
-		if (*p++ == '\n') {
-		    ended = 1;
-		    break;
-		}
-	    } while (p < e);
-	    if (!ended)
-                RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
-	}
-	else
-	    break;
-    }
-    return p;
-}
-
-STATIC char *
-S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
+S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
 {
     /* Returns the next non-pattern-white space, non-comment character (the
      * latter only if 'recognize_comment is true) in the string p, which is
-     * ended by RExC_end.  If there is no line break ending a comment,
-     * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
+     * ended by RExC_end.  See also reg_skipcomment */
     const char *e = RExC_end;
 
     PERL_ARGS_ASSERT_REGPATWS;
@@ -12373,16 +12340,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
 	    p += len;
         }
 	else if (recognize_comment && *p == '#') {
-            bool ended = 0;
-	    do {
-                p++;
-                if (is_LNBREAK_safe(p, e, UTF)) {
-		    ended = 1;
-		    break;
-		}
-	    } while (p < e);
-	    if (!ended)
-                RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+            p = reg_skipcomment(pRExC_state, p);
 	}
 	else
 	    break;
@@ -12710,7 +12668,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
         while (RExC_parse < RExC_end) {
             SV* current = NULL;
             RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                TRUE); /* means recognize comments */
+                                          TRUE); /* means recognize comments */
             switch (*RExC_parse) {
                 case '?':
                     if (RExC_parse[1] == '[') depth++, RExC_parse++;
@@ -12827,7 +12785,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
 
         /* Skip white space */
         RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                TRUE); /* means recognize comments */
+                                         TRUE /* means recognize comments */ );
         if (RExC_parse >= RExC_end) {
             Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
         }
@@ -13299,7 +13257,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
     if (skip_white) {
         RExC_parse = regpatws(pRExC_state, RExC_parse,
-                              FALSE /* means don't recognize comments */);
+                              FALSE /* means don't recognize comments */ );
     }
 
     if (UCHARAT(RExC_parse) == '^') {	/* Complement of range. */
@@ -13309,7 +13267,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
         RExC_naughty++;
         if (skip_white) {
             RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                  FALSE /* means don't recognize comments */);
+                                  FALSE /* means don't recognize comments */ );
         }
     }
 
@@ -13347,7 +13305,7 @@ parseit:
 
         if (skip_white) {
             RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                  FALSE /* means don't recognize comments */);
+                                  FALSE /* means don't recognize comments */ );
         }
 
         if  (UCHARAT(RExC_parse) == ']') {
@@ -13858,7 +13816,7 @@ parseit:
 
         if (skip_white) {
             RExC_parse = regpatws(pRExC_state, RExC_parse,
-                                FALSE /* means don't recognize comments */);
+                                FALSE /* means don't recognize comments */ );
         }
 
 	if (range) {
@@ -14955,35 +14913,34 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
 
 /* reg_skipcomment()
 
-   Absorbs an /x style # comments from the input stream.
-   Returns true if there is more text remaining in the stream.
-   Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
-   terminates the pattern without including a newline.
+   Absorbs an /x style # comment from the input stream,
+   returning a pointer to the first character beyond the comment, or if the
+   comment terminates the pattern without anything following it, this returns
+   one past the final character of the pattern (in other words, RExC_end) and
+   sets the REG_RUN_ON_COMMENT_SEEN flag.
 
-   Note its the callers responsibility to ensure that we are
+   Note it's the callers responsibility to ensure that we are
    actually in /x mode
 
 */
 
-STATIC bool
-S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
+PERL_STATIC_INLINE char*
+S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char* p)
 {
-    bool ended = 0;
-
     PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
 
-    while (RExC_parse < RExC_end)
-        if (*RExC_parse++ == '\n') {
-            ended = 1;
-            break;
+    assert(*p = '#');
+
+    while (p < RExC_end) {
+        if (*(++p) == '\n') {
+            return p+1;
         }
-    if (!ended) {
-        /* we ran off the end of the pattern without ending
-           the comment, so we have to add an \n when wrapping */
-        RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
-        return 0;
-    } else
-        return 1;
+    }
+
+    /* we ran off the end of the pattern without ending the comment, so we have
+     * to add an \n when wrapping */
+    RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+    return p;
 }
 
 /* nextchar()
@@ -15021,16 +14978,14 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
 	    continue;
 	}
 	if (RExC_flags & RXf_PMf_EXTENDED) {
-	    if (isSPACE(*RExC_parse)) {
-		RExC_parse++;
-		continue;
-	    }
-	    else if (*RExC_parse == '#') {
-	        if ( reg_skipcomment( pRExC_state ) )
-	            continue;
-	    }
+            char * p = regpatws(pRExC_state, RExC_parse,
+                                          TRUE); /* means recognize comments */
+            if (p != RExC_parse) {
+                RExC_parse = p;
+                continue;
+            }
 	}
-	return retval;
+        return retval;
     }
 }
 
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index 187023aa67..b837af4931 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -1652,5 +1652,5 @@ PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are
 \p{_Perl_Problematic_Locale_Foldeds_Start}
 
 PATWS: pattern white space
-=> generic generic_non_low cp : safe
+=> generic cp : safe
 \p{PatWS}
diff --git a/t/re/pat.t b/t/re/pat.t
index c6e7f967b1..71cfeaaee4 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -20,7 +20,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 732;  # Update this when adding/deleting tests.
+plan tests => 733;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1599,6 +1599,11 @@ EOP
         ok("\x{101}" =~ /\x{101}/i, "A WITH MACRON l =~ l");
     }
 
+    {
+        use utf8;
+        ok("abc" =~ /abc/x, "NEL is white-space under /x");
+    }
+
 } # End of sub run_tests
 
 1;
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index 529708a2d9..857eba2ac6 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -416,8 +416,6 @@ my @experimental_regex_sets = (
 );
 
 my @deprecated = (
-    "/(?x)latin1\\\x{85}\x{85}\\\x{85}/" => 'Escape literal pattern white space under /x {#} ' . "m/(?x)latin1\\\x{85}\x{85}{#}\\\x{85}/",
-    'use utf8; /(?x)utf8\\/' => 'Escape literal pattern white space under /x {#} ' . "m/(?x)utf8\\\N{NEXT LINE}\N{NEXT LINE}{#}\\\N{NEXT LINE}/",
 );
 
 while (my ($regex, $expect) = splice @death, 0, 2) {