Restructure grok_bslash_[ox]

This commit causes these functions to allow a caller to request any messages generated to be returned to the caller, instead of always being handled within these functions. The messages are somewhat changed from previously to be clearer. I did not find any code in CPAN that relied on the previous message text. Like the previous commit for grok_bslash_c, here are two reasons to do this, repeated here. 1) In pattern compilation this brings these messages into conformity with the other ones that get generated in pattern compilation, where there is a particular syntax, including marking the exact position in the parse where the problem occurred. 2) These could generate truncated messages due to the (mostly) single-pass nature of pattern compilation that is now in effect. It keeps track of where during a parse a message has been output, and won't output it again if a second parsing pass turns out to be necessary. Prior to this commit, it had to assume that a message from one of these functions did get output, and this caused some out-of-bounds reads when a subparse (using a constructed pattern) was executed. The possibility of those went away in commit 5d894ca5213, which guarantees it won't try to read outside bounds, but that may still mean it is outputting text from the wrong parse, giving meaningless results. This commit should stop that possibility.
author: Karl Williamson <khw@cpan.org> 2020-01-16 13:17:32 -0700
committer: Karl Williamson <khw@cpan.org> 2020-01-23 15:46:56 -0700
commit: 8d1e72f0f234299a86ddd5ce728d7cff6b44a547 (patch)
tree: 477d458f1df8f1dd50551ced45ac7de81fd7368b
parent: 73351a7160e044aa54e64f7da1c78c3401a64c7b (diff)
download: perl-8d1e72f0f234299a86ddd5ce728d7cff6b44a547.tar.gz
12 files changed, 447 insertions, 182 deletions
diff --git a/dquote.c b/dquote.c
index d6e442e950..4c688b694f 100644
--- a/dquote.c
+++ b/dquote.c
@@ -79,51 +79,211 @@ Perl_grok_bslash_c(pTHX_ const char   source,
     return TRUE;
 }
 
+const char *
+Perl_form_alien_digit_msg(pTHX_
+        const U8 which,           /* 8 or 16 */
+        const STRLEN valids_len,  /* length of input before first bad char */
+        const char * const first_bad, /* Ptr to that bad char */
+        const char * const send,      /* End of input string */
+        const bool UTF,               /* Is it in UTF-8? */
+        const bool braced)            /* Is it enclosed in {} */
+{
+    /* Generate a mortal SV containing an appropriate warning message about
+     * alien characters found in an octal or hex constant given by the inputs,
+     * and return a pointer to that SV's string.  The message looks like:
+     *
+     * Non-hex character '?' terminates \x early.  Resolved as "\x{...}"
+     *
+     */
+
+    /* The usual worst case scenario: 2 chars to display per byte, plus \x{}
+     * (leading zeros could take up more space, and the scalar will
+     * automatically grow if necessary).  Space for NUL is added by the newSV()
+     * function */
+    SV * display_char = newSV(2 * UTF8_MAXBYTES + 4);
+    SV * message_sv = sv_newmortal();
+    char symbol;
+
+    PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG;
+    assert(which == 8 || which == 16);
+
+    /* Calculate the display form of the character */
+    if (    UVCHR_IS_INVARIANT(*first_bad)
+        || (UTF && isUTF8_CHAR((U8 *) first_bad, (U8 *) send)))
+    {
+        pv_uni_display(display_char, (U8 *) first_bad, UTF8SKIP(first_bad),
+                                                    (STRLEN) -1, UNI_DISPLAY_QQ);
+    }
+    else {  /* Is not UTF-8, or is illegal UTF-8.  Show just the one byte */
+
+        /* It also isn't a UTF-8 invariant character, so no display shortcuts
+         * are available.  Use \\x{...} */
+	Perl_sv_setpvf(aTHX_ display_char, "\\x{%02x}", *first_bad);
+    }
+
+    /* Ready to start building the message */
+    sv_setpvs(message_sv, "Non-");
+    if (which == 8) {
+        sv_catpvs(message_sv, "octal");
+        if (braced) {
+            symbol = 'o';
+        }
+        else {
+            symbol = '0';   /* \008, for example */
+        }
+    }
+    else {
+        sv_catpvs(message_sv, "hex");
+        symbol = 'x';
+    }
+    sv_catpvs(message_sv, " character ");
+
+    if (isPRINT(*first_bad)) {
+        sv_catpvs(message_sv, "'");
+    }
+    sv_catsv(message_sv, display_char);
+    if (isPRINT(*first_bad)) {
+        sv_catpvs(message_sv, "'");
+    }
+    Perl_sv_catpvf(aTHX_ message_sv, " terminates \\%c early.  Resolved as "
+                                     "\"\\%c", symbol, symbol);
+    if (braced) {
+        sv_catpvs(message_sv, "{");
+    }
+
+    /* Octal constants have an extra leading 0, but \0 already includes that */
+    if (symbol == 'o' && valids_len < 3) {
+        sv_catpvs(message_sv, "0");
+    }
+    if (valids_len == 0) {  /* No legal digits at all */
+        sv_catpvs(message_sv, "00");
+    }
+    else if (valids_len == 1) { /* Just one is legal */
+        sv_catpvs(message_sv, "0");
+    }
+    sv_catpvn(message_sv, first_bad - valids_len, valids_len);
+
+    if (braced) {
+        sv_catpvs(message_sv, "}");
+    }
+    else {
+        sv_catsv(message_sv, display_char);
+    }
+    sv_catpvs(message_sv, "\"");
+
+    SvREFCNT_dec_NN(display_char);
+
+    return SvPVX_const(message_sv);
+}
+
+const char *
+Perl_form_cp_too_large_msg(pTHX_
+        const U8 which,        /* 8 or 16 */
+        const char * string,   /* NULL, or the text that is supposed to
+                                  represent a code point */
+        const Size_t len,      /* length of 'string' if not NULL; else 0 */
+        const UV cp)           /* 0 if 'string' not NULL; else the too-large
+                                  code point */
+{
+    /* Generate a mortal SV containing an appropriate warning message about
+     * code points that are too large for this system, given by the inputs,
+     * and return a pointer to that SV's string.  Either the text of the string
+     * to be converted to a code point is input, or a code point itself.  The
+     * former is needed to accurately represent something that overflows.
+     *
+     * The message looks like:
+     *
+     * Use of code point %s is not allowed; the permissible max is %s
+     *
+     */
+
+    SV * message_sv = sv_newmortal();
+    const char * format;
+    const char * prefix;
+
+    PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG;
+    assert(which == 8 || which == 16);
+
+    /* One but not both must be non-zero */
+    assert((string != NULL) ^ (cp != 0));
+    assert((string == NULL) || len);
+
+    if (which == 8) {
+        format = "%" UVof;
+        prefix = "0";
+    }
+    else {
+        format = "%" UVXf;
+        prefix = "0x";
+    }
+
+    Perl_sv_setpvf(aTHX_ message_sv, "Use of code point %s", prefix);
+    if (string) {
+        Perl_sv_catpvf(aTHX_ message_sv, "%.*s", (int) len, string);
+    }
+    else {
+        Perl_sv_catpvf(aTHX_ message_sv, format, cp);
+    }
+    Perl_sv_catpvf(aTHX_ message_sv, " is not allowed; the permissible max is %s", prefix);
+    Perl_sv_catpvf(aTHX_ message_sv, format, MAX_LEGAL_CP);
+
+    return SvPVX_const(message_sv);
+}
+
 bool
 Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
                       const char** message,
-                      const bool output_warning, const bool strict,
+                      U32 *      packed_warn,
+                      const bool strict,
+                      const bool allow_UV_MAX,
                       const bool UTF)
 {
 
 /*  Documentation to be supplied when interface nailed down finally
- *  This returns FALSE if there is an error which the caller need not recover
- *  from; otherwise TRUE.  In either case the caller should look at *len [???].
- *  It guarantees that the returned codepoint, *uv, when expressed as
- *  utf8 bytes, would fit within the skipped "\o{...}" bytes.
- *  On input:
+ *  This returns FALSE if there is an error the caller should probably die
+ *  from; otherwise TRUE.
  *	s   is the address of a pointer to a string.  **s is 'o', and the
  *	    previous character was a backslash.  At exit, *s will be advanced
  *	    to the byte just after those absorbed by this function.  Hence the
- *	    caller can continue parsing from there.  In the case of an error,
- *	    this routine has generally positioned *s to point just to the right
- *	    of the first bad spot, so that a message that has a "<--" to mark
- *	    the spot will be correctly positioned.
+ *	    caller can continue parsing from there.  In the case of an error
+ *	    when this function returns FALSE, continuing to parse is not an
+ *	    option, this routine has generally positioned *s to point just to
+ *	    the right of the first bad spot, so that a message that has a "<--"
+ *	    to mark the spot will be correctly positioned.
  *	send - 1  gives a limit in *s that this function is not permitted to
  *	    look beyond.  That is, the function may look at bytes only in the
  *	    range *s..send-1
  *	uv  points to a UV that will hold the output value, valid only if the
- *	    return from the function is TRUE
- *      message is a pointer that will be set to an internal buffer giving an
- *	    error message upon failure (the return is FALSE).  Untouched if
- *	    function succeeds
- *	output_warning says whether to output any warning messages, or suppress
- *	    them
+ *	    return from the function is TRUE; may be changed from the input
+ *	    value even when FALSE is returned.
+ *      message  A pointer to any warning or error message will be stored into
+ *          this pointer; NULL if none.
+ *      packed_warn if NULL on input asks that this routine display any warning
+ *          messages.  Otherwise, if the function found a warning, the packed
+ *          warning categories will be stored into *packed_warn (and the
+ *          corresponding message text into *message); 0 if none.
  *	strict is true if this should fail instead of warn if there are
  *	    non-octal digits within the braces
+ *      allow_UV_MAX is true if this shouldn't fail if the input code point is
+ *          UV_MAX, which is normally illegal, reserved for internal use.
  *	UTF is true iff the string *s is encoded in UTF-8.
  */
     char* e;
     STRLEN numbers_len;
     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
-		| PERL_SCAN_DISALLOW_PREFIX
-                | PERL_SCAN_SILENT_NON_PORTABLE
-	        | PERL_SCAN_SILENT_ILLDIGIT;
+              | PERL_SCAN_DISALLOW_PREFIX
+              | PERL_SCAN_SILENT_NON_PORTABLE
+              | PERL_SCAN_SILENT_ILLDIGIT
+              | PERL_SCAN_SILENT_OVERFLOW;
 
     PERL_ARGS_ASSERT_GROK_BSLASH_O;
 
     assert(*(*s - 1) == '\\');
     assert(* *s       == 'o');
+
+    *message = NULL;
+    if (packed_warn) *packed_warn = 0;
+
     (*s)++;
 
     if (send <= *s || **s != '{') {
@@ -145,29 +305,40 @@ Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
                   sequence if not a digit) */
     numbers_len = e - *s;
     if (numbers_len == 0) {
-        (*s)++;    /* Move past the } */
+        (*s)++;    /* Move past the '}' */
 	*message = "Empty \\o{}";
 	return FALSE;
     }
 
     *uv = grok_oct(*s, &numbers_len, &flags, NULL);
+    if (UNLIKELY(   (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
+                 || (! allow_UV_MAX && *uv == UV_MAX)))
+    {
+        *message = form_cp_too_large_msg(8, *s, numbers_len, 0);
+        *s = e + 1;
+        return FALSE;
+    }
+
     /* Note that if has non-octal, will ignore everything starting with that up
      * to the '}' */
-
     if (numbers_len != (STRLEN) (e - *s)) {
+        *s += numbers_len;
         if (strict) {
-            *s += numbers_len;
             *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
             *message = "Non-octal character";
             return FALSE;
         }
-        else if (output_warning) {
-            Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT),
-            /* diag_listed_as: Non-octal character '%c'.  Resolved as "%s" */
-                        "Non-octal character '%c'.  Resolved as \"\\o{%.*s}\"",
-                        *(*s + numbers_len),
-                        (int) numbers_len,
-                        *s);
+
+        if (ckWARN(WARN_DIGIT)) {
+            const char * failure = form_alien_digit_msg(8, numbers_len, *s, send,
+                                                                      UTF, TRUE);
+            if (packed_warn) {
+                *message = failure;
+                *packed_warn = packWARN(WARN_DIGIT);
+            }
+            else {
+                Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
+            }
         }
     }
 
@@ -178,14 +349,16 @@ Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
 }
 
 bool
-Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
+Perl_grok_bslash_x(pTHX_ char ** s, const char * const send, UV *uv,
                       const char** message,
-                      const bool output_warning, const bool strict,
+                      U32 *      packed_warn,
+                      const bool strict,
+                      const bool allow_UV_MAX,
                       const bool UTF)
 {
 
 /*  Documentation to be supplied when interface nailed down finally
- *  This returns FALSE if there is an error which the caller need not recover
+ *  This returns FALSE if there is an error the caller should probably die
  *  from; otherwise TRUE.
  *  It guarantees that the returned codepoint, *uv, when expressed as
  *  utf8 bytes, would fit within the skipped "\x{...}" bytes.
@@ -202,29 +375,38 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
  *	    look beyond.  That is, the function may look at bytes only in the
  *	    range *s..send-1
  *	uv  points to a UV that will hold the output value, valid only if the
- *	    return from the function is TRUE
- *      message is a pointer that will be set to an internal buffer giving an
- *	    error message upon failure (the return is FALSE).  Untouched if
- *	    function succeeds
- *	output_warning says whether to output any warning messages, or suppress
- *	    them
+ *	    return from the function is TRUE; may be changed from the input
+ *	    value even when FALSE is returned.
+ *      message  A pointer to any warning or error message will be stored into
+ *          this pointer; NULL if none.
+ *      packed_warn if NULL on input asks that this routine display any warning
+ *          messages.  Otherwise, if the function found a warning, the packed
+ *          warning categories will be stored into *packed_warn (and the
+ *          corresponding message text into *message); 0 if none.
  *	strict is true if anything out of the ordinary should cause this to
  *	    fail instead of warn or be silent.  For example, it requires
  *	    exactly 2 digits following the \x (when there are no braces).
  *	    3 digits could be a mistake, so is forbidden in this mode.
+ *      allow_UV_MAX is true if this shouldn't fail if the input code point is
+ *          UV_MAX, which is normally illegal, reserved for internal use.
  *	UTF is true iff the string *s is encoded in UTF-8.
  */
     char* e;
     STRLEN numbers_len;
     I32 flags = PERL_SCAN_DISALLOW_PREFIX
-              | PERL_SCAN_SILENT_NON_PORTABLE;
-
+              | PERL_SCAN_SILENT_ILLDIGIT
+              | PERL_SCAN_NOTIFY_ILLDIGIT
+              | PERL_SCAN_SILENT_NON_PORTABLE
+              | PERL_SCAN_SILENT_OVERFLOW;
 
     PERL_ARGS_ASSERT_GROK_BSLASH_X;
 
     assert(*(*s - 1) == '\\');
     assert(* *s      == 'x');
 
+    *message = NULL;
+    if (packed_warn) *packed_warn = 0;
+
     (*s)++;
 
     if (send <= *s) {
@@ -239,24 +421,34 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
         return TRUE;
     }
 
-    if (strict || ! output_warning) {
-        flags |= PERL_SCAN_SILENT_ILLDIGIT;
-    }
-
     if (**s != '{') {
-        STRLEN len = (strict) ? 3 : 2;
-
-	*uv = grok_hex(*s, &len, &flags, NULL);
-	*s += len;
-        if (strict && len != 2) {
-            if (len < 2) {
-                *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
-                *message = "Non-hex character";
-            }
-            else {
+        numbers_len = (strict) ? 3 : 2;
+
+	*uv = grok_hex(*s, &numbers_len, &flags, NULL);
+	*s += numbers_len;
+
+        if (numbers_len != 2 && (strict || (flags & PERL_SCAN_NOTIFY_ILLDIGIT))) {
+            if (numbers_len == 3) { /* numbers_len 3 only happens with strict */
                 *message = "Use \\x{...} for more than two hex characters";
+                return FALSE;
+            }
+            else if (strict) {
+                    *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
+                    *message = "Non-hex character";
+                    return FALSE;
+            }
+            else if (ckWARN(WARN_DIGIT)) {
+                const char * failure = form_alien_digit_msg(16, numbers_len, *s,
+                                                              send, UTF, FALSE);
+
+                if (! packed_warn) {
+                    Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
+                }
+                else {
+                    *message = failure;
+                    *packed_warn = packWARN(WARN_DIGIT);
+                }
             }
-            return FALSE;
         }
 	return TRUE;
     }
@@ -264,7 +456,7 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
     e = (char *) memchr(*s, '}', send - *s);
     if (!e) {
         (*s)++;  /* Move past the '{' */
-        while (isXDIGIT(**s)) { /* Position beyond the legal digits */
+        while (*s < send && isXDIGIT(**s)) { /* Position beyond legal digits */
             (*s)++;
         }
         /* XXX The corresponding message above for \o is just '\\o{'; other
@@ -291,14 +483,33 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
     flags |= PERL_SCAN_ALLOW_UNDERSCORES;
 
     *uv = grok_hex(*s, &numbers_len, &flags, NULL);
-    /* Note that if has non-hex, will ignore everything starting with that up
-     * to the '}' */
+    if (UNLIKELY(   (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
+                 || (! allow_UV_MAX && *uv == UV_MAX)))
+    {
+        *message = form_cp_too_large_msg(16, *s, numbers_len, 0);
+        *s = e + 1;
+        return FALSE;
+    }
 
-    if (strict && numbers_len != (STRLEN) (e - *s)) {
+    if (numbers_len != (STRLEN) (e - *s)) {
         *s += numbers_len;
-        *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
-        *message = "Non-hex character";
-        return FALSE;
+        if (strict) {
+            *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
+            *message = "Non-hex character";
+            return FALSE;
+        }
+
+        if (ckWARN(WARN_DIGIT)) {
+            const char * failure = form_alien_digit_msg(16, numbers_len, *s,
+                                                                send, UTF, TRUE);
+            if (! packed_warn) {
+                Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
+            }
+            else {
+                *message = failure;
+                *packed_warn = packWARN(WARN_DIGIT);
+            }
+        }
     }
 
     /* Return past the '}' */
diff --git a/embed.fnc b/embed.fnc
index 012a47932c..db38c9e155 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1136,28 +1136,42 @@ Ap	|void	|vload_module|U32 flags|NN SV* name|NULLOK SV* ver|NULLOK va_list* args
 p	|OP*	|localize	|NN OP *o|I32 lex
 ApdR	|I32	|looks_like_number|NN SV *const sv
 #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_DQUOTE_C)
-EpRX	|bool	|grok_bslash_x	|NN char** s		 \
-				|NN const char* const send	 \
-				|NN UV* uv			 \
-				|NN const char** message         \
-				|const bool output_warning       \
-				|const bool strict               \
+EpRX	|bool	|grok_bslash_x	|NN char** s			\
+				|NN const char* const send	\
+				|NN UV* uv			\
+				|NN const char** message	\
+				|NULLOK U32 * packed_warn	\
+				|const bool strict		\
+				|const bool allow_UV_MAX	\
 				|const bool utf8
 EpRX	|bool	|grok_bslash_c	|const char source		\
 				|NN U8 * result			\
 				|NN const char** message	\
 				|NULLOK U32 * packed_warn
-EpRX	|bool	|grok_bslash_o	|NN char** s		 \
-				|NN const char* const send	 \
-				|NN UV* uv			 \
-				|NN const char** message         \
-				|const bool output_warning       \
-				|const bool strict               \
+EpRX	|bool	|grok_bslash_o	|NN char** s			\
+				|NN const char* const send	\
+				|NN UV* uv			\
+				|NN const char** message        \
+				|NULLOK U32 * packed_warn	\
+				|const bool strict              \
+				|const bool allow_UV_MAX	\
 				|const bool utf8
+EpRX	|const char *|form_alien_digit_msg|const U8 which	\
+				|const STRLEN valids_len	\
+				|NN const char * const first_bad\
+				|NN const char * const send	\
+				|const bool UTF			\
+				|const bool braced
 EiR	|char*|form_short_octal_warning|NN const char * const s  \
 				|const STRLEN len
 EiRT	|I32	|regcurly	|NN const char *s
 #endif
+#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_DQUOTE_C)
+EpRX	|const char *|form_cp_too_large_msg|const U8 which	\
+				|NULLOK const char * string	\
+				|const Size_t len		\
+				|const UV cp
+#endif
 AMpd	|UV	|grok_hex	|NN const char* start|NN STRLEN* len_p|NN I32* flags|NULLOK NV *result
 Apd	|int	|grok_infnan	|NN const char** sp|NN const char *send
 Apd	|int	|grok_number	|NN const char *pv|STRLEN len|NULLOK UV *valuep
diff --git a/embed.h b/embed.h
index cd167db841..426ff1ee7a 100644
--- a/embed.h
+++ b/embed.h
@@ -1108,10 +1108,12 @@
 #define invlist_clone(a,b)	Perl_invlist_clone(aTHX_ a,b)
 #  endif
 #  if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_DQUOTE_C)
+#define form_alien_digit_msg(a,b,c,d,e,f)	Perl_form_alien_digit_msg(aTHX_ a,b,c,d,e,f)
+#define form_cp_too_large_msg(a,b,c,d)	Perl_form_cp_too_large_msg(aTHX_ a,b,c,d)
 #define form_short_octal_warning(a,b)	S_form_short_octal_warning(aTHX_ a,b)
 #define grok_bslash_c(a,b,c,d)	Perl_grok_bslash_c(aTHX_ a,b,c,d)
-#define grok_bslash_o(a,b,c,d,e,f,g)	Perl_grok_bslash_o(aTHX_ a,b,c,d,e,f,g)
-#define grok_bslash_x(a,b,c,d,e,f,g)	Perl_grok_bslash_x(aTHX_ a,b,c,d,e,f,g)
+#define grok_bslash_o(a,b,c,d,e,f,g,h)	Perl_grok_bslash_o(aTHX_ a,b,c,d,e,f,g,h)
+#define grok_bslash_x(a,b,c,d,e,f,g,h)	Perl_grok_bslash_x(aTHX_ a,b,c,d,e,f,g,h)
 #define regcurly		S_regcurly
 #  endif
 #  if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C)
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 3379edf6ee..54ddebee64 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -218,6 +218,14 @@ it occured.
 
 =item *
 
+L<Non-hex character '%c' terminates \x early.  Resolved as "%s"|perldiag/"Non-hex character '%c' terminates \x early.  Resolved as "%s"">
+
+This replaces a warning that was much less specific, and which gave
+false information.  This new warning parallels the similar
+already-existing one raised for C<\o{}>.
+
+=item *
+
 L<message|perldiag/"message">
 
 =back
@@ -260,6 +268,15 @@ now has extra text added at the end, when raised during regular
 expression pattern compilation, marking where precisely in the pattern
 it occured.
 
+=item *
+
+L<Non-octal character '%c' terminates \o early.  Resolved as "%s"|perldiag/"Non-octal character '%c' terminates \o early.  Resolved as "%s"">
+
+now includes the phrase "terminates \o early", and has extra text added
+at the end, when raised during regular expression pattern compilation,
+marking where precisely in the pattern it occured.  In some instances
+the text of the resolution has been clarified.
+
 =back
 
 =head1 Utility Changes
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 3830003695..11750fb2cb 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -4152,6 +4152,15 @@ a hex one was expected, like
  (?[ [ \xDG ] ])
  (?[ [ \x{DEKA} ] ])
 
+=item Non-hex character '%c' terminates \x early.  Resolved as "%s"
+
+(W digit) In parsing a hexadecimal numeric constant, a character was
+unexpectedly encountered that isn't hexadecimal.  The resulting value
+is as indicated.
+
+Note that, within braces, every character starting with the first
+non-hexadecimal up to the ending brace is ignored.
+
 =item Non-octal character in regex; marked by S<<-- HERE> in m/%s/
 
 (F) In a regular expression, there was a non-octal character where
@@ -4159,12 +4168,15 @@ an octal one was expected, like
 
  (?[ [ \o{1278} ] ])
 
-=item Non-octal character '%c'.  Resolved as "%s"
+=item Non-octal character '%c' terminates \o early.  Resolved as "%s"
 
 (W digit) In parsing an octal numeric constant, a character was
 unexpectedly encountered that isn't octal.  The resulting value
 is as indicated.
 
+Note that, within braces, every character starting with the first
+non-octal up to the ending brace is ignored.
+
 =item "no" not allowed in expression
 
 (F) The "no" keyword is recognized and executed at compile time, and
diff --git a/proto.h b/proto.h
index d5fb2caf54..e1f8864226 100644
--- a/proto.h
+++ b/proto.h
@@ -5905,6 +5905,15 @@ PERL_CALLCONV SV*	Perl_invlist_clone(pTHX_ SV* const invlist, SV* newlist);
 	assert(invlist)
 #endif
 #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_DQUOTE_C)
+PERL_CALLCONV const char *	Perl_form_alien_digit_msg(pTHX_ const U8 which, const STRLEN valids_len, const char * const first_bad, const char * const send, const bool UTF, const bool braced)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG	\
+	assert(first_bad); assert(send)
+
+PERL_CALLCONV const char *	Perl_form_cp_too_large_msg(pTHX_ const U8 which, const char * string, const Size_t len, const UV cp)
+			__attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG
+
 #ifndef PERL_NO_INLINE_FUNCTIONS
 PERL_STATIC_INLINE char*	S_form_short_octal_warning(pTHX_ const char * const s, const STRLEN len)
 			__attribute__warn_unused_result__;
@@ -5917,12 +5926,12 @@ PERL_CALLCONV bool	Perl_grok_bslash_c(pTHX_ const char source, U8 * result, cons
 #define PERL_ARGS_ASSERT_GROK_BSLASH_C	\
 	assert(result); assert(message)
 
-PERL_CALLCONV bool	Perl_grok_bslash_o(pTHX_ char** s, const char* const send, UV* uv, const char** message, const bool output_warning, const bool strict, const bool utf8)
+PERL_CALLCONV bool	Perl_grok_bslash_o(pTHX_ char** s, const char* const send, UV* uv, const char** message, U32 * packed_warn, const bool strict, const bool allow_UV_MAX, const bool utf8)
 			__attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_GROK_BSLASH_O	\
 	assert(s); assert(send); assert(uv); assert(message)
 
-PERL_CALLCONV bool	Perl_grok_bslash_x(pTHX_ char** s, const char* const send, UV* uv, const char** message, const bool output_warning, const bool strict, const bool utf8)
+PERL_CALLCONV bool	Perl_grok_bslash_x(pTHX_ char** s, const char* const send, UV* uv, const char** message, U32 * packed_warn, const bool strict, const bool allow_UV_MAX, const bool utf8)
 			__attribute__warn_unused_result__;
 #define PERL_ARGS_ASSERT_GROK_BSLASH_X	\
 	assert(s); assert(send); assert(uv); assert(message)
diff --git a/regcomp.c b/regcomp.c
index 93bacdfdb6..85d7555b8f 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -14087,56 +14087,51 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 			p++;
 			break;
 		    case 'o':
-			{
-			    UV result;
-			    const char* error_msg;
-
-			    bool valid = grok_bslash_o(&p,
-                                                       RExC_end,
-						       &result,
-						       &error_msg,
-						       TO_OUTPUT_WARNINGS(p),
-                                                       (bool) RExC_strict,
-                                                       UTF);
-			    if (! valid) {
-				RExC_parse = p;	/* going to die anyway; point
-						   to exact spot of failure */
-				vFAIL(error_msg);
-			    }
-                            UPDATE_WARNINGS_LOC(p - 1);
-                            ender = result;
-			    break;
-			}
+                        if (! grok_bslash_o(&p,
+                                            RExC_end,
+                                            &ender,
+                                            &message,
+                                            &packed_warn,
+                                            (bool) RExC_strict,
+                                            FALSE, /* No illegal cp's */
+                                            UTF))
+                        {
+                            RExC_parse = p; /* going to die anyway; point to
+                                               exact spot of failure */
+                            vFAIL(message);
+                        }
+
+                        if (message && TO_OUTPUT_WARNINGS(p)) {
+                            warn_non_literal_string(p, packed_warn, message);
+                        }
+                        break;
 		    case 'x':
-			{
-                            UV result = UV_MAX; /* initialize to erroneous
-                                                   value */
-			    const char* error_msg;
-
-			    bool valid = grok_bslash_x(&p,
-                                                       RExC_end,
-						       &result,
-						       &error_msg,
-                                                       TO_OUTPUT_WARNINGS(p),
-                                                       (bool) RExC_strict,
-                                                       UTF);
-			    if (! valid) {
-				RExC_parse = p;	/* going to die anyway; point
-						   to exact spot of failure */
-				vFAIL(error_msg);
-			    }
-                            UPDATE_WARNINGS_LOC(p - 1);
-                            ender = result;
+                        if (! grok_bslash_x(&p,
+                                            RExC_end,
+                                            &ender,
+                                            &message,
+                                            &packed_warn,
+                                            (bool) RExC_strict,
+                                            FALSE, /* No illegal cp's */
+                                            UTF))
+                        {
+                            RExC_parse = p;	/* going to die anyway; point
+                                                   to exact spot of failure */
+                            vFAIL(message);
+                        }
+
+                        if (message && TO_OUTPUT_WARNINGS(p)) {
+                            warn_non_literal_string(p, packed_warn, message);
+                        }
 
 #ifdef EBCDIC
-                            if (ender < 0x100) {
-                                if (RExC_recode_x_to_native) {
-                                    ender = LATIN1_TO_NATIVE(ender);
-                                }
-			    }
+                        if (ender < 0x100) {
+                            if (RExC_recode_x_to_native) {
+                                ender = LATIN1_TO_NATIVE(ender);
+                            }
+                        }
 #endif
-			    break;
-			}
+                        break;
 		    case 'c':
                         p++;
                         if (! grok_bslash_c(*p, &grok_c_char,
@@ -16960,6 +16955,7 @@ S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
     PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
 
     if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
+        CLEAR_POSIX_WARNINGS();
         return;
     }
 
@@ -17643,38 +17639,42 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 	    case 'a':	value = '\a';                   break;
 	    case 'o':
 		RExC_parse--;	/* function expects to be pointed at the 'o' */
-		{
-		    const char* error_msg;
-		    bool valid = grok_bslash_o(&RExC_parse,
-                                               RExC_end,
-					       &value,
-					       &error_msg,
-                                               TO_OUTPUT_WARNINGS(RExC_parse),
-                                               strict,
-                                               UTF);
-		    if (! valid) {
-			vFAIL(error_msg);
-		    }
-                    UPDATE_WARNINGS_LOC(RExC_parse - 1);
-		}
+                if (! grok_bslash_o(&RExC_parse,
+                                            RExC_end,
+                                            &value,
+                                            &message,
+                                            &packed_warn,
+                                            strict,
+                                            range, /* MAX_UV allowed for range
+                                                      upper limit */
+                                            UTF))
+                {
+                    vFAIL(message);
+                }
+                else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+                    warn_non_literal_string(RExC_parse, packed_warn, message);
+                }
+
                 non_portable_endpoint++;
 		break;
 	    case 'x':
 		RExC_parse--;	/* function expects to be pointed at the 'x' */
-		{
-		    const char* error_msg;
-		    bool valid = grok_bslash_x(&RExC_parse,
-                                               RExC_end,
-					       &value,
-					       &error_msg,
-					       TO_OUTPUT_WARNINGS(RExC_parse),
-                                               strict,
-                                               UTF);
-                    if (! valid) {
-			vFAIL(error_msg);
-		    }
-                    UPDATE_WARNINGS_LOC(RExC_parse - 1);
-		}
+                if (!  grok_bslash_x(&RExC_parse,
+                                            RExC_end,
+                                            &value,
+                                            &message,
+                                            &packed_warn,
+                                            strict,
+                                            range, /* MAX_UV allowed for range
+                                                      upper limit */
+                                            UTF))
+                {
+                    vFAIL(message);
+                }
+                else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+                    warn_non_literal_string(RExC_parse, packed_warn, message);
+                }
+
                 non_portable_endpoint++;
 		break;
 	    case 'c':
@@ -17984,6 +17984,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
 	/* non-Latin1 code point implies unicode semantics. */
 	if (value > 255) {
+            if (value > MAX_LEGAL_CP && (   value != UV_MAX
+                                         || prevvalue > MAX_LEGAL_CP))
+            {
+                vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
+            }
             REQUIRE_UNI_RULES(flagp, 0);
             if (  ! silence_non_portable
                 &&  UNICODE_IS_PERL_EXTENDED(value)
diff --git a/t/lib/warnings/regcomp b/t/lib/warnings/regcomp
index b10680b2a5..50b85fdd45 100644
--- a/t/lib/warnings/regcomp
+++ b/t/lib/warnings/regcomp
@@ -18,8 +18,8 @@ no warnings 'digit' ;
 my $a = qr/\o{1238456}\x{100}/;
 my $a = qr/[\o{6548321}]\x{100}/;
 EXPECT
-Non-octal character '8'.  Resolved as "\o{123}" at - line 3.
-Non-octal character '8'.  Resolved as "\o{654}" at - line 4.
+Non-octal character '8' terminates \o early.  Resolved as "\o{123}" in regex; marked by <-- HERE in m/\o{1238456} <-- HERE \x{100}/ at - line 3.
+Non-octal character '8' terminates \o early.  Resolved as "\o{654}" in regex; marked by <-- HERE in m/[\o{6548321} <-- HERE ]\x{100}/ at - line 4.
 ########
 # regcomp.c
 BEGIN {
diff --git a/t/lib/warnings/toke b/t/lib/warnings/toke
index e875874707..e22f51ebaa 100644
--- a/t/lib/warnings/toke
+++ b/t/lib/warnings/toke
@@ -1378,7 +1378,7 @@ my $a = "\o{1238456}";
 no warnings 'digit' ;
 my $a = "\o{1238456}";
 EXPECT
-Non-octal character '8'.  Resolved as "\o{123}" at - line 3.
+Non-octal character '8' terminates \o early.  Resolved as "\o{123}" at - line 3.
 ########
 # toke.c
 use warnings;
@@ -1527,9 +1527,9 @@ print "aa" =~ m{^a{1,2}$}, "G\n";
 print "aq" =~ m[^a\[a-z\]$], "H\n";
 print "aq" =~ m(^a\(q\)$), "I\n";
 EXPECT
-Illegal hexadecimal digit '\' ignored at - line 5.
-Illegal hexadecimal digit '\' ignored at - line 7.
-Illegal hexadecimal digit '\' ignored at - line 9.
+Non-hex character '\\' terminates \x early.  Resolved as "\x00\\" in regex; marked by <-- HERE in m/^a\x <-- HERE \{61\}$/ at - line 5.
+Non-hex character '\\' terminates \x early.  Resolved as "\x00\\" in regex; marked by <-- HERE in m/^a\\\x <-- HERE \{6F\}$/ at - line 7.
+Non-hex character '\\' terminates \x early.  Resolved as "\x00\\" in regex; marked by <-- HERE in m/^a\\\\\x <-- HERE \{6F\}$/ at - line 9.
 A
 B
 1C
diff --git a/t/re/anyof.t b/t/re/anyof.t
index 9b9c030a18..6434f937ee 100644
--- a/t/re/anyof.t
+++ b/t/re/anyof.t
@@ -485,7 +485,6 @@ my @tests = (
     '[\x{101}-{INFTY}]' => 'ANYOFH[0101-INFTY]',
     '[\x{101}-{HIGHEST_CP}]' => 'ANYOFH[0101-HIGHEST_CP]',
     '[\x{102}\x{104}]' => 'ANYOFHb[0102 0104]',
-    '[\x{102}-\x{104}{INFTY}]' => 'ANYOFH[0102-0104 INFTY-INFTY]',
     '[\x{102}-\x{104}{HIGHEST_CP}]' => 'ANYOFH[0102-0104 HIGHEST_CP]',
     '[\x{102}-\x{104}\x{101}]' => 'ANYOFRb[0101-0104]',
     '[\x{102}-\x{104}\x{101}-{INFTY}]' => 'ANYOFH[0101-INFTY]',
@@ -505,7 +504,6 @@ my @tests = (
     '[\x{102}-\x{104}\x{106}]' => 'ANYOFHb[0102-0104 0106]',
     '[\x{102}-\x{104}\x{106}-{INFTY}]' => 'ANYOFH[0102-0104 0106-INFTY]',
     '[\x{102}-\x{104}\x{106}-{HIGHEST_CP}]' => 'ANYOFH[0102-0104 0106-HIGHEST_CP]',
-    '[\x{102}-\x{104}\x{108}-\x{10A}{INFTY}]' => 'ANYOFH[0102-0104 0108-010A INFTY-INFTY]',
     '[\x{102}-\x{104}\x{108}-\x{10A}{HIGHEST_CP}]' => 'ANYOFH[0102-0104 0108-010A HIGHEST_CP]',
     '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}]' => 'ANYOFHb[0101-0104 0108-010A]',
     '[\x{102}-\x{104}\x{108}-\x{10A}\x{101}-{INFTY}]' => 'ANYOFH[0101-INFTY]',
@@ -603,14 +601,12 @@ my @tests = (
     '[\x{106}-{INFTY}\x{107}-{INFTY}]' => 'ANYOFH[0106-INFTY]',
     '[\x{106}-{INFTY}\x{107}-{HIGHEST_CP}]' => 'ANYOFH[0106-INFTY]',
     '[\x{106}-{INFTY}\x{107}-\x{107}]' => 'ANYOFH[0106-INFTY]',
-    '[\x{10C}-{INFTY}{INFTY}]' => 'ANYOFH[010C-INFTY]',
     '[\x{10C}-{INFTY}{HIGHEST_CP}]' => 'ANYOFH[010C-INFTY]',
     '[\x{10C}-{INFTY}\x{00}-{HIGHEST_CP}]' => 'SANY',
     '[\x{10C}-{INFTY}\x{00}-{INFTY}]' => 'SANY',
     '[\x{10C}-{INFTY}\x{101}-{INFTY}]' => 'ANYOFH[0101-INFTY]',
     '[\x{10C}-{INFTY}\x{101}-{HIGHEST_CP}]' => 'ANYOFH[0101-INFTY]',
     '[\x{10C}-{INFTY}\x{102}\x{104}]' => 'ANYOFH[0102 0104 010C-INFTY]',
-    '[\x{10C}-{INFTY}\x{102}-\x{104}{INFTY}]' => 'ANYOFH[0102-0104 010C-INFTY]',
     '[\x{10C}-{INFTY}\x{102}-\x{104}{HIGHEST_CP}]' => 'ANYOFH[0102-0104 010C-INFTY]',
     '[\x{10C}-{INFTY}\x{102}-\x{104}\x{100}]' => 'ANYOFH[0100 0102-0104 010C-INFTY]',
     '[\x{10C}-{INFTY}\x{102}-\x{104}\x{101}]' => 'ANYOFH[0101-0104 010C-INFTY]',
@@ -631,7 +627,6 @@ my @tests = (
     '[\x{10C}-{INFTY}\x{102}-\x{104}\x{106}]' => 'ANYOFH[0102-0104 0106 010C-INFTY]',
     '[\x{10C}-{INFTY}\x{102}-\x{104}\x{106}-{INFTY}]' => 'ANYOFH[0102-0104 0106-INFTY]',
     '[\x{10C}-{INFTY}\x{102}-\x{104}\x{106}-{HIGHEST_CP}]' => 'ANYOFH[0102-0104 0106-INFTY]',
-    '[\x{10C}-{INFTY}\x{102}-\x{104}\x{108}-\x{10A}{INFTY}]' => 'ANYOFH[0102-0104 0108-010A 010C-INFTY]',
     '[\x{10C}-{INFTY}\x{102}-\x{104}\x{108}-\x{10A}{HIGHEST_CP}]' => 'ANYOFH[0102-0104 0108-010A 010C-INFTY]',
     '[\x{10C}-{INFTY}\x{102}-\x{104}\x{108}-\x{10A}\x{101}]' => 'ANYOFH[0101-0104 0108-010A 010C-INFTY]',
     '[\x{10C}-{INFTY}\x{102}-\x{104}\x{108}-\x{10A}\x{101}-{INFTY}]' => 'ANYOFH[0101-INFTY]',
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index c7d51d9ad0..2793e9d95c 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -332,25 +332,23 @@ my @death_only_under_strict = (
     'm/[\xABC]/' => "",
                  => 'Use \x{...} for more than two hex characters {#} m/[\xABC{#}]/',
 
-    # XXX This is a confusing error message.  The G isn't ignored; it just
-    # terminates the \x.  Also some messages below are missing the <-- HERE,
-    # aren't all category 'regexp'.  (Hence we have to turn off 'digit'
-    # messages as well below)
-    'm/\xAG/' => 'Illegal hexadecimal digit \'G\' ignored',
+    # some messages below aren't all category 'regexp'.  (Hence we have to
+    # turn off 'digit' messages as well below)
+    'm/\xAG/' => 'Non-hex character \'G\' terminates \x early.  Resolved as "\x0AG" {#} m/\xA{#}G/',
               => 'Non-hex character {#} m/\xAG{#}/',
-    'm/[\xAG]/' => 'Illegal hexadecimal digit \'G\' ignored',
+    'm/[\xAG]/' => 'Non-hex character \'G\' terminates \x early.  Resolved as "\x0AG" {#} m/[\xA{#}G]/',
                 => 'Non-hex character {#} m/[\xAG{#}]/',
-    'm/\o{789}/' => 'Non-octal character \'8\'.  Resolved as "\o{7}"',
+    'm/\o{789}/' => 'Non-octal character \'8\' terminates \o early.  Resolved as "\o{007}" {#} m/\o{789}{#}/',
                  => 'Non-octal character {#} m/\o{78{#}9}/',
-    'm/[\o{789}]/' => 'Non-octal character \'8\'.  Resolved as "\o{7}"',
+    'm/[\o{789}]/' => 'Non-octal character \'8\' terminates \o early.  Resolved as "\o{007}" {#} m/[\o{789}{#}]/',
                    => 'Non-octal character {#} m/[\o{78{#}9}]/',
     'm/\x{}/' => "",
               => 'Empty \x{} {#} m/\x{}{#}/',
     'm/[\x{}]/' => "",
                 => 'Empty \x{} {#} m/[\x{}{#}]/',
-    'm/\x{ABCDEFG}/' => 'Illegal hexadecimal digit \'G\' ignored',
+    'm/\x{ABCDEFG}/' => 'Non-hex character \'G\' terminates \x early.  Resolved as "\x{ABCDEF}" {#} m/\x{ABCDEFG}{#}/',
                      => 'Non-hex character {#} m/\x{ABCDEFG{#}}/',
-    'm/[\x{ABCDEFG}]/' => 'Illegal hexadecimal digit \'G\' ignored',
+    'm/[\x{ABCDEFG}]/' => 'Non-hex character \'G\' terminates \x early.  Resolved as "\x{ABCDEF}" {#} m/[\x{ABCDEFG}{#}]/',
                        => 'Non-hex character {#} m/[\x{ABCDEFG{#}}]/',
     "m'[\\y]\\x{100}'" => 'Unrecognized escape \y in character class passed through {#} m/[\y{#}]\x{100}/',
                        => 'Unrecognized escape \y in character class {#} m/[\y{#}]\x{100}/',
diff --git a/toke.c b/toke.c
index 41e69305b6..d54e79e4f9 100644
--- a/toke.c
+++ b/toke.c
@@ -3552,12 +3552,13 @@ S_scan_const(pTHX_ char *start)
 		{
 		    const char* error;
 
-		    bool valid = grok_bslash_o(&s, send,
+		    if (! grok_bslash_o(&s, send,
                                                &uv, &error,
-                                               TRUE, /* Output warning */
+                                               NULL,
                                                FALSE, /* Not strict */
-                                               UTF);
-		    if (! valid) {
+                                               FALSE, /* No illegal cp's */
+                                               UTF))
+                    {
 			yyerror(error);
 			uv = 0; /* drop through to ensure range ends are set */
 		    }
@@ -3569,12 +3570,13 @@ S_scan_const(pTHX_ char *start)
 		{
 		    const char* error;
 
-		    bool valid = grok_bslash_x(&s, send,
+		    if (! grok_bslash_x(&s, send,
                                                &uv, &error,
-                                               TRUE, /* Output warning */
+                                               NULL,
                                                FALSE, /* Not strict */
-                                               UTF);
-		    if (! valid) {
+                                               FALSE, /* No illegal cp's */
+                                               UTF))
+                    {
 			yyerror(error);
 			uv = 0; /* drop through to ensure range ends are set */
 		    }
author	Karl Williamson <khw@cpan.org>	2020-01-16 13:17:32 -0700
committer	Karl Williamson <khw@cpan.org>	2020-01-23 15:46:56 -0700
commit	8d1e72f0f234299a86ddd5ce728d7cff6b44a547 (patch)
tree	477d458f1df8f1dd50551ced45ac7de81fd7368b
parent	73351a7160e044aa54e64f7da1c78c3401a64c7b (diff)
download	perl-8d1e72f0f234299a86ddd5ce728d7cff6b44a547.tar.gz