Restructure grok_bslash_[ox]

This commit causes these functions to allow a caller to request any messages generated to be returned to the caller, instead of always being handled within these functions. The messages are somewhat changed from previously to be clearer. I did not find any code in CPAN that relied on the previous message text. Like the previous commit for grok_bslash_c, here are two reasons to do this, repeated here. 1) In pattern compilation this brings these messages into conformity with the other ones that get generated in pattern compilation, where there is a particular syntax, including marking the exact position in the parse where the problem occurred. 2) These could generate truncated messages due to the (mostly) single-pass nature of pattern compilation that is now in effect. It keeps track of where during a parse a message has been output, and won't output it again if a second parsing pass turns out to be necessary. Prior to this commit, it had to assume that a message from one of these functions did get output, and this caused some out-of-bounds reads when a subparse (using a constructed pattern) was executed. The possibility of those went away in commit 5d894ca5213, which guarantees it won't try to read outside bounds, but that may still mean it is outputting text from the wrong parse, giving meaningless results. This commit should stop that possibility.
author: Karl Williamson <khw@cpan.org> 2020-01-16 13:17:32 -0700
committer: Karl Williamson <khw@cpan.org> 2020-01-23 15:46:56 -0700
commit: 8d1e72f0f234299a86ddd5ce728d7cff6b44a547 (patch)
tree: 477d458f1df8f1dd50551ced45ac7de81fd7368b /dquote.c
parent: 73351a7160e044aa54e64f7da1c78c3401a64c7b (diff)
download: perl-8d1e72f0f234299a86ddd5ce728d7cff6b44a547.tar.gz
1 files changed, 273 insertions, 62 deletions
diff --git a/dquote.c b/dquote.c
index d6e442e950..4c688b694f 100644
--- a/dquote.c
+++ b/dquote.c
@@ -79,51 +79,211 @@ Perl_grok_bslash_c(pTHX_ const char   source,
     return TRUE;
 }
 
+const char *
+Perl_form_alien_digit_msg(pTHX_
+        const U8 which,           /* 8 or 16 */
+        const STRLEN valids_len,  /* length of input before first bad char */
+        const char * const first_bad, /* Ptr to that bad char */
+        const char * const send,      /* End of input string */
+        const bool UTF,               /* Is it in UTF-8? */
+        const bool braced)            /* Is it enclosed in {} */
+{
+    /* Generate a mortal SV containing an appropriate warning message about
+     * alien characters found in an octal or hex constant given by the inputs,
+     * and return a pointer to that SV's string.  The message looks like:
+     *
+     * Non-hex character '?' terminates \x early.  Resolved as "\x{...}"
+     *
+     */
+
+    /* The usual worst case scenario: 2 chars to display per byte, plus \x{}
+     * (leading zeros could take up more space, and the scalar will
+     * automatically grow if necessary).  Space for NUL is added by the newSV()
+     * function */
+    SV * display_char = newSV(2 * UTF8_MAXBYTES + 4);
+    SV * message_sv = sv_newmortal();
+    char symbol;
+
+    PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG;
+    assert(which == 8 || which == 16);
+
+    /* Calculate the display form of the character */
+    if (    UVCHR_IS_INVARIANT(*first_bad)
+        || (UTF && isUTF8_CHAR((U8 *) first_bad, (U8 *) send)))
+    {
+        pv_uni_display(display_char, (U8 *) first_bad, UTF8SKIP(first_bad),
+                                                    (STRLEN) -1, UNI_DISPLAY_QQ);
+    }
+    else {  /* Is not UTF-8, or is illegal UTF-8.  Show just the one byte */
+
+        /* It also isn't a UTF-8 invariant character, so no display shortcuts
+         * are available.  Use \\x{...} */
+	Perl_sv_setpvf(aTHX_ display_char, "\\x{%02x}", *first_bad);
+    }
+
+    /* Ready to start building the message */
+    sv_setpvs(message_sv, "Non-");
+    if (which == 8) {
+        sv_catpvs(message_sv, "octal");
+        if (braced) {
+            symbol = 'o';
+        }
+        else {
+            symbol = '0';   /* \008, for example */
+        }
+    }
+    else {
+        sv_catpvs(message_sv, "hex");
+        symbol = 'x';
+    }
+    sv_catpvs(message_sv, " character ");
+
+    if (isPRINT(*first_bad)) {
+        sv_catpvs(message_sv, "'");
+    }
+    sv_catsv(message_sv, display_char);
+    if (isPRINT(*first_bad)) {
+        sv_catpvs(message_sv, "'");
+    }
+    Perl_sv_catpvf(aTHX_ message_sv, " terminates \\%c early.  Resolved as "
+                                     "\"\\%c", symbol, symbol);
+    if (braced) {
+        sv_catpvs(message_sv, "{");
+    }
+
+    /* Octal constants have an extra leading 0, but \0 already includes that */
+    if (symbol == 'o' && valids_len < 3) {
+        sv_catpvs(message_sv, "0");
+    }
+    if (valids_len == 0) {  /* No legal digits at all */
+        sv_catpvs(message_sv, "00");
+    }
+    else if (valids_len == 1) { /* Just one is legal */
+        sv_catpvs(message_sv, "0");
+    }
+    sv_catpvn(message_sv, first_bad - valids_len, valids_len);
+
+    if (braced) {
+        sv_catpvs(message_sv, "}");
+    }
+    else {
+        sv_catsv(message_sv, display_char);
+    }
+    sv_catpvs(message_sv, "\"");
+
+    SvREFCNT_dec_NN(display_char);
+
+    return SvPVX_const(message_sv);
+}
+
+const char *
+Perl_form_cp_too_large_msg(pTHX_
+        const U8 which,        /* 8 or 16 */
+        const char * string,   /* NULL, or the text that is supposed to
+                                  represent a code point */
+        const Size_t len,      /* length of 'string' if not NULL; else 0 */
+        const UV cp)           /* 0 if 'string' not NULL; else the too-large
+                                  code point */
+{
+    /* Generate a mortal SV containing an appropriate warning message about
+     * code points that are too large for this system, given by the inputs,
+     * and return a pointer to that SV's string.  Either the text of the string
+     * to be converted to a code point is input, or a code point itself.  The
+     * former is needed to accurately represent something that overflows.
+     *
+     * The message looks like:
+     *
+     * Use of code point %s is not allowed; the permissible max is %s
+     *
+     */
+
+    SV * message_sv = sv_newmortal();
+    const char * format;
+    const char * prefix;
+
+    PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG;
+    assert(which == 8 || which == 16);
+
+    /* One but not both must be non-zero */
+    assert((string != NULL) ^ (cp != 0));
+    assert((string == NULL) || len);
+
+    if (which == 8) {
+        format = "%" UVof;
+        prefix = "0";
+    }
+    else {
+        format = "%" UVXf;
+        prefix = "0x";
+    }
+
+    Perl_sv_setpvf(aTHX_ message_sv, "Use of code point %s", prefix);
+    if (string) {
+        Perl_sv_catpvf(aTHX_ message_sv, "%.*s", (int) len, string);
+    }
+    else {
+        Perl_sv_catpvf(aTHX_ message_sv, format, cp);
+    }
+    Perl_sv_catpvf(aTHX_ message_sv, " is not allowed; the permissible max is %s", prefix);
+    Perl_sv_catpvf(aTHX_ message_sv, format, MAX_LEGAL_CP);
+
+    return SvPVX_const(message_sv);
+}
+
 bool
 Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
                       const char** message,
-                      const bool output_warning, const bool strict,
+                      U32 *      packed_warn,
+                      const bool strict,
+                      const bool allow_UV_MAX,
                       const bool UTF)
 {
 
 /*  Documentation to be supplied when interface nailed down finally
- *  This returns FALSE if there is an error which the caller need not recover
- *  from; otherwise TRUE.  In either case the caller should look at *len [???].
- *  It guarantees that the returned codepoint, *uv, when expressed as
- *  utf8 bytes, would fit within the skipped "\o{...}" bytes.
- *  On input:
+ *  This returns FALSE if there is an error the caller should probably die
+ *  from; otherwise TRUE.
  *	s   is the address of a pointer to a string.  **s is 'o', and the
  *	    previous character was a backslash.  At exit, *s will be advanced
  *	    to the byte just after those absorbed by this function.  Hence the
- *	    caller can continue parsing from there.  In the case of an error,
- *	    this routine has generally positioned *s to point just to the right
- *	    of the first bad spot, so that a message that has a "<--" to mark
- *	    the spot will be correctly positioned.
+ *	    caller can continue parsing from there.  In the case of an error
+ *	    when this function returns FALSE, continuing to parse is not an
+ *	    option, this routine has generally positioned *s to point just to
+ *	    the right of the first bad spot, so that a message that has a "<--"
+ *	    to mark the spot will be correctly positioned.
  *	send - 1  gives a limit in *s that this function is not permitted to
  *	    look beyond.  That is, the function may look at bytes only in the
  *	    range *s..send-1
  *	uv  points to a UV that will hold the output value, valid only if the
- *	    return from the function is TRUE
- *      message is a pointer that will be set to an internal buffer giving an
- *	    error message upon failure (the return is FALSE).  Untouched if
- *	    function succeeds
- *	output_warning says whether to output any warning messages, or suppress
- *	    them
+ *	    return from the function is TRUE; may be changed from the input
+ *	    value even when FALSE is returned.
+ *      message  A pointer to any warning or error message will be stored into
+ *          this pointer; NULL if none.
+ *      packed_warn if NULL on input asks that this routine display any warning
+ *          messages.  Otherwise, if the function found a warning, the packed
+ *          warning categories will be stored into *packed_warn (and the
+ *          corresponding message text into *message); 0 if none.
  *	strict is true if this should fail instead of warn if there are
  *	    non-octal digits within the braces
+ *      allow_UV_MAX is true if this shouldn't fail if the input code point is
+ *          UV_MAX, which is normally illegal, reserved for internal use.
  *	UTF is true iff the string *s is encoded in UTF-8.
  */
     char* e;
     STRLEN numbers_len;
     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
-		| PERL_SCAN_DISALLOW_PREFIX
-                | PERL_SCAN_SILENT_NON_PORTABLE
-	        | PERL_SCAN_SILENT_ILLDIGIT;
+              | PERL_SCAN_DISALLOW_PREFIX
+              | PERL_SCAN_SILENT_NON_PORTABLE
+              | PERL_SCAN_SILENT_ILLDIGIT
+              | PERL_SCAN_SILENT_OVERFLOW;
 
     PERL_ARGS_ASSERT_GROK_BSLASH_O;
 
     assert(*(*s - 1) == '\\');
     assert(* *s       == 'o');
+
+    *message = NULL;
+    if (packed_warn) *packed_warn = 0;
+
     (*s)++;
 
     if (send <= *s || **s != '{') {
@@ -145,29 +305,40 @@ Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
                   sequence if not a digit) */
     numbers_len = e - *s;
     if (numbers_len == 0) {
-        (*s)++;    /* Move past the } */
+        (*s)++;    /* Move past the '}' */
 	*message = "Empty \\o{}";
 	return FALSE;
     }
 
     *uv = grok_oct(*s, &numbers_len, &flags, NULL);
+    if (UNLIKELY(   (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
+                 || (! allow_UV_MAX && *uv == UV_MAX)))
+    {
+        *message = form_cp_too_large_msg(8, *s, numbers_len, 0);
+        *s = e + 1;
+        return FALSE;
+    }
+
     /* Note that if has non-octal, will ignore everything starting with that up
      * to the '}' */
-
     if (numbers_len != (STRLEN) (e - *s)) {
+        *s += numbers_len;
         if (strict) {
-            *s += numbers_len;
             *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
             *message = "Non-octal character";
             return FALSE;
         }
-        else if (output_warning) {
-            Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT),
-            /* diag_listed_as: Non-octal character '%c'.  Resolved as "%s" */
-                        "Non-octal character '%c'.  Resolved as \"\\o{%.*s}\"",
-                        *(*s + numbers_len),
-                        (int) numbers_len,
-                        *s);
+
+        if (ckWARN(WARN_DIGIT)) {
+            const char * failure = form_alien_digit_msg(8, numbers_len, *s, send,
+                                                                      UTF, TRUE);
+            if (packed_warn) {
+                *message = failure;
+                *packed_warn = packWARN(WARN_DIGIT);
+            }
+            else {
+                Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
+            }
         }
     }
 
@@ -178,14 +349,16 @@ Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
 }
 
 bool
-Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
+Perl_grok_bslash_x(pTHX_ char ** s, const char * const send, UV *uv,
                       const char** message,
-                      const bool output_warning, const bool strict,
+                      U32 *      packed_warn,
+                      const bool strict,
+                      const bool allow_UV_MAX,
                       const bool UTF)
 {
 
 /*  Documentation to be supplied when interface nailed down finally
- *  This returns FALSE if there is an error which the caller need not recover
+ *  This returns FALSE if there is an error the caller should probably die
  *  from; otherwise TRUE.
  *  It guarantees that the returned codepoint, *uv, when expressed as
  *  utf8 bytes, would fit within the skipped "\x{...}" bytes.
@@ -202,29 +375,38 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
  *	    look beyond.  That is, the function may look at bytes only in the
  *	    range *s..send-1
  *	uv  points to a UV that will hold the output value, valid only if the
- *	    return from the function is TRUE
- *      message is a pointer that will be set to an internal buffer giving an
- *	    error message upon failure (the return is FALSE).  Untouched if
- *	    function succeeds
- *	output_warning says whether to output any warning messages, or suppress
- *	    them
+ *	    return from the function is TRUE; may be changed from the input
+ *	    value even when FALSE is returned.
+ *      message  A pointer to any warning or error message will be stored into
+ *          this pointer; NULL if none.
+ *      packed_warn if NULL on input asks that this routine display any warning
+ *          messages.  Otherwise, if the function found a warning, the packed
+ *          warning categories will be stored into *packed_warn (and the
+ *          corresponding message text into *message); 0 if none.
  *	strict is true if anything out of the ordinary should cause this to
  *	    fail instead of warn or be silent.  For example, it requires
  *	    exactly 2 digits following the \x (when there are no braces).
  *	    3 digits could be a mistake, so is forbidden in this mode.
+ *      allow_UV_MAX is true if this shouldn't fail if the input code point is
+ *          UV_MAX, which is normally illegal, reserved for internal use.
  *	UTF is true iff the string *s is encoded in UTF-8.
  */
     char* e;
     STRLEN numbers_len;
     I32 flags = PERL_SCAN_DISALLOW_PREFIX
-              | PERL_SCAN_SILENT_NON_PORTABLE;
-
+              | PERL_SCAN_SILENT_ILLDIGIT
+              | PERL_SCAN_NOTIFY_ILLDIGIT
+              | PERL_SCAN_SILENT_NON_PORTABLE
+              | PERL_SCAN_SILENT_OVERFLOW;
 
     PERL_ARGS_ASSERT_GROK_BSLASH_X;
 
     assert(*(*s - 1) == '\\');
     assert(* *s      == 'x');
 
+    *message = NULL;
+    if (packed_warn) *packed_warn = 0;
+
     (*s)++;
 
     if (send <= *s) {
@@ -239,24 +421,34 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
         return TRUE;
     }
 
-    if (strict || ! output_warning) {
-        flags |= PERL_SCAN_SILENT_ILLDIGIT;
-    }
-
     if (**s != '{') {
-        STRLEN len = (strict) ? 3 : 2;
-
-	*uv = grok_hex(*s, &len, &flags, NULL);
-	*s += len;
-        if (strict && len != 2) {
-            if (len < 2) {
-                *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
-                *message = "Non-hex character";
-            }
-            else {
+        numbers_len = (strict) ? 3 : 2;
+
+	*uv = grok_hex(*s, &numbers_len, &flags, NULL);
+	*s += numbers_len;
+
+        if (numbers_len != 2 && (strict || (flags & PERL_SCAN_NOTIFY_ILLDIGIT))) {
+            if (numbers_len == 3) { /* numbers_len 3 only happens with strict */
                 *message = "Use \\x{...} for more than two hex characters";
+                return FALSE;
+            }
+            else if (strict) {
+                    *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
+                    *message = "Non-hex character";
+                    return FALSE;
+            }
+            else if (ckWARN(WARN_DIGIT)) {
+                const char * failure = form_alien_digit_msg(16, numbers_len, *s,
+                                                              send, UTF, FALSE);
+
+                if (! packed_warn) {
+                    Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
+                }
+                else {
+                    *message = failure;
+                    *packed_warn = packWARN(WARN_DIGIT);
+                }
             }
-            return FALSE;
         }
 	return TRUE;
     }
@@ -264,7 +456,7 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
     e = (char *) memchr(*s, '}', send - *s);
     if (!e) {
         (*s)++;  /* Move past the '{' */
-        while (isXDIGIT(**s)) { /* Position beyond the legal digits */
+        while (*s < send && isXDIGIT(**s)) { /* Position beyond legal digits */
             (*s)++;
         }
         /* XXX The corresponding message above for \o is just '\\o{'; other
@@ -291,14 +483,33 @@ Perl_grok_bslash_x(pTHX_ char **s, const char * const send, UV *uv,
     flags |= PERL_SCAN_ALLOW_UNDERSCORES;
 
     *uv = grok_hex(*s, &numbers_len, &flags, NULL);
-    /* Note that if has non-hex, will ignore everything starting with that up
-     * to the '}' */
+    if (UNLIKELY(   (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
+                 || (! allow_UV_MAX && *uv == UV_MAX)))
+    {
+        *message = form_cp_too_large_msg(16, *s, numbers_len, 0);
+        *s = e + 1;
+        return FALSE;
+    }
 
-    if (strict && numbers_len != (STRLEN) (e - *s)) {
+    if (numbers_len != (STRLEN) (e - *s)) {
         *s += numbers_len;
-        *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
-        *message = "Non-hex character";
-        return FALSE;
+        if (strict) {
+            *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
+            *message = "Non-hex character";
+            return FALSE;
+        }
+
+        if (ckWARN(WARN_DIGIT)) {
+            const char * failure = form_alien_digit_msg(16, numbers_len, *s,
+                                                                send, UTF, TRUE);
+            if (! packed_warn) {
+                Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
+            }
+            else {
+                *message = failure;
+                *packed_warn = packWARN(WARN_DIGIT);
+            }
+        }
     }
 
     /* Return past the '}' */
author	Karl Williamson <khw@cpan.org>	2020-01-16 13:17:32 -0700
committer	Karl Williamson <khw@cpan.org>	2020-01-23 15:46:56 -0700
commit	8d1e72f0f234299a86ddd5ce728d7cff6b44a547 (patch)
tree	477d458f1df8f1dd50551ced45ac7de81fd7368b /dquote.c
parent	73351a7160e044aa54e64f7da1c78c3401a64c7b (diff)
download	perl-8d1e72f0f234299a86ddd5ce728d7cff6b44a547.tar.gz