summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2020-01-15 10:48:05 -0700
committerKarl Williamson <khw@cpan.org>2020-01-23 15:46:56 -0700
commit73351a7160e044aa54e64f7da1c78c3401a64c7b (patch)
tree7496d9bec007d36ff53fe40e5a07e8c2527b2498
parent58ad81600e45597437bafd2a7e22e6182b164104 (diff)
downloadperl-73351a7160e044aa54e64f7da1c78c3401a64c7b.tar.gz
Restructure grok_bslash_c
This commit causes this function to allow a caller to request any messages generated to be returned to the caller, instead of always being handled within this function. Like the previous commit for grok_bslash_c, here are two reasons to do this, repeated here. 1) In pattern compilation this brings these messages into conformity with the other ones that get generated in pattern compilation, where there is a particular syntax, including marking the exact position in the parse where the problem occurred. 2) The messages could be truncated due to the (mostly) single-pass nature of pattern compilation that is now in effect. It keeps track of where during a parse a message has been output, and won't output it again if a second parsing pass turns out to be necessary. Prior to this commit, it had to assume that a message from one of these functions did get output, and this caused some out-of-bounds reads when a subparse (using a constructed pattern) was executed. The possibility of those went away in commit 5d894ca5213, which guarantees it won't try to read outside bounds, but that may still mean it is outputting text from the wrong parse, giving meaningless results. This commit should stop that possibility.
-rw-r--r--dquote.c66
-rw-r--r--embed.fnc5
-rw-r--r--embed.h2
-rw-r--r--pod/perldelta.pod30
-rw-r--r--proto.h5
-rw-r--r--regcomp.c47
-rw-r--r--t/lib/warnings/regcomp4
-rw-r--r--t/lib/warnings/toke6
-rw-r--r--t/re/reg_mesg.t4
-rw-r--r--toke.c9
10 files changed, 139 insertions, 39 deletions
diff --git a/dquote.c b/dquote.c
index 3a2ba46c26..d6e442e950 100644
--- a/dquote.c
+++ b/dquote.c
@@ -11,48 +11,72 @@
#include "dquote_inline.h"
/* XXX Add documentation after final interface and behavior is decided */
-/* May want to show context for error, so would pass S_grok_bslash_c(pTHX_ const char* current, const char* start, const bool output_warning)
- U8 source = *current;
-*/
-char
-Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning)
+bool
+Perl_grok_bslash_c(pTHX_ const char source,
+ U8 * result,
+ const char** message,
+ U32 * packed_warn)
{
-
- U8 result;
+ PERL_ARGS_ASSERT_GROK_BSLASH_C;
+
+ /* This returns TRUE if the \c? sequence is valid; FALSE otherwise. If it
+ * is valid, the sequence evaluates to a single character, which will be
+ * stored into *result.
+ *
+ * source is the character immediately after a '\c' sequence.
+ * result points to a char variable into which this function will store
+ * what the sequence evaluates to, if valid; unchanged otherwise.
+ * message A pointer to any warning or error message will be stored into
+ * this pointer; NULL if none.
+ * packed_warn if NULL on input asks that this routine display any warning
+ * messages. Otherwise, if the function found a warning, the
+ * packed warning categories will be stored into *packed_warn (and
+ * the corresponding message text into *message); 0 if none.
+ */
+
+ *message = NULL;
+ if (packed_warn) *packed_warn = 0;
if (! isPRINT_A(source)) {
- Perl_croak(aTHX_ "%s",
- "Character following \"\\c\" must be printable ASCII");
+ *message = "Character following \"\\c\" must be printable ASCII";
+ return FALSE;
}
- else if (source == '{') {
+
+ if (source == '{') {
const char control = toCTRL('{');
if (isPRINT_A(control)) {
/* diag_listed_as: Use "%s" instead of "%s" */
- Perl_croak(aTHX_ "Use \"%c\" instead of \"\\c{\"", control);
+ *message = Perl_form(aTHX_ "Use \"%c\" instead of \"\\c{\"", control);
}
else {
- Perl_croak(aTHX_ "Sequence \"\\c{\" invalid");
+ *message = "Sequence \"\\c{\" invalid";
}
+ return FALSE;
}
- result = toCTRL(source);
- if (output_warning && isPRINT_A(result)) {
+ *result = toCTRL(source);
+ if (isPRINT_A(*result) && ckWARN(WARN_SYNTAX)) {
U8 clearer[3];
U8 i = 0;
- if (! isWORDCHAR(result)) {
+ char format[] = "\"\\c%c\" is more clearly written simply as \"%s\"";
+
+ if (! isWORDCHAR(*result)) {
clearer[i++] = '\\';
}
- clearer[i++] = result;
+ clearer[i++] = *result;
clearer[i++] = '\0';
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "\"\\c%c\" is more clearly written simply as \"%s\"",
- source,
- clearer);
+ if (packed_warn) {
+ *message = Perl_form(aTHX_ format, source, clearer);
+ *packed_warn = packWARN(WARN_SYNTAX);
+ }
+ else {
+ Perl_warner(aTHX_ packWARN(WARN_SYNTAX), format, source, clearer);
+ }
}
- return result;
+ return TRUE;
}
bool
diff --git a/embed.fnc b/embed.fnc
index 014e0f2f42..012a47932c 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1143,7 +1143,10 @@ EpRX |bool |grok_bslash_x |NN char** s \
|const bool output_warning \
|const bool strict \
|const bool utf8
-EpRX |char |grok_bslash_c |const char source|const bool output_warning
+EpRX |bool |grok_bslash_c |const char source \
+ |NN U8 * result \
+ |NN const char** message \
+ |NULLOK U32 * packed_warn
EpRX |bool |grok_bslash_o |NN char** s \
|NN const char* const send \
|NN UV* uv \
diff --git a/embed.h b/embed.h
index 6fbbed72a3..cd167db841 100644
--- a/embed.h
+++ b/embed.h
@@ -1109,7 +1109,7 @@
# endif
# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_DQUOTE_C)
#define form_short_octal_warning(a,b) S_form_short_octal_warning(aTHX_ a,b)
-#define grok_bslash_c(a,b) Perl_grok_bslash_c(aTHX_ a,b)
+#define grok_bslash_c(a,b,c,d) Perl_grok_bslash_c(aTHX_ a,b,c,d)
#define grok_bslash_o(a,b,c,d,e,f,g) Perl_grok_bslash_o(aTHX_ a,b,c,d,e,f,g)
#define grok_bslash_x(a,b,c,d,e,f,g) Perl_grok_bslash_x(aTHX_ a,b,c,d,e,f,g)
#define regcurly S_regcurly
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 0901ccc3c4..3379edf6ee 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -230,7 +230,35 @@ XXX Changes (i.e. rewording) of diagnostic messages go here
=item *
-XXX Describe change here
+L<Character following "\c" must be printable ASCII|perldiag/"Character following "\c" must be printable ASCII">
+
+now has extra text added at the end, when raised during regular
+expression pattern compilation, marking where precisely in the pattern
+it occured.
+
+=item *
+
+L<Use "%s" instead of "%s"|perldiag/"Use "%s" instead of "%s"">
+
+now has extra text added at the end, when raised during regular
+expression pattern compilation, marking where precisely in the pattern
+it occured.
+
+=item *
+
+L<Sequence "\c{" invalid|perldiag/"Sequence "\c{" invalid">
+
+now has extra text added at the end, when raised during regular
+expression pattern compilation, marking where precisely in the pattern
+it occured.
+
+=item *
+
+L<"\c%c" is more clearly written simply as "%s"|perldiag/""\c%c" is more clearly written simply as "%s"">
+
+now has extra text added at the end, when raised during regular
+expression pattern compilation, marking where precisely in the pattern
+it occured.
=back
diff --git a/proto.h b/proto.h
index 1f71619e19..d5fb2caf54 100644
--- a/proto.h
+++ b/proto.h
@@ -5912,9 +5912,10 @@ PERL_STATIC_INLINE char* S_form_short_octal_warning(pTHX_ const char * const s,
assert(s)
#endif
-PERL_CALLCONV char Perl_grok_bslash_c(pTHX_ const char source, const bool output_warning)
+PERL_CALLCONV bool Perl_grok_bslash_c(pTHX_ const char source, U8 * result, const char** message, U32 * packed_warn)
__attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_GROK_BSLASH_C
+#define PERL_ARGS_ASSERT_GROK_BSLASH_C \
+ assert(result); assert(message)
PERL_CALLCONV bool Perl_grok_bslash_o(pTHX_ char** s, const char* const send, UV* uv, const char** message, const bool output_warning, const bool strict, const bool utf8)
__attribute__warn_unused_result__;
diff --git a/regcomp.c b/regcomp.c
index 39875a1958..93bacdfdb6 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13968,6 +13968,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
|| ! is_PATWS_safe((p), RExC_end, UTF));
switch ((U8)*p) {
+ const char* message;
+ U32 packed_warn;
+ U8 grok_c_char;
+
case '^':
case '$':
case '.':
@@ -14134,10 +14138,24 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
break;
}
case 'c':
- p++;
- ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
- UPDATE_WARNINGS_LOC(p);
p++;
+ if (! grok_bslash_c(*p, &grok_c_char,
+ &message, &packed_warn))
+ {
+ /* going to die anyway; point to exact spot of
+ * failure */
+ RExC_parse = p + ((UTF)
+ ? UTF8_SAFE_SKIP(p, RExC_end)
+ : 1);
+ vFAIL(message);
+ }
+
+ ender = grok_c_char;
+ p++;
+ if (message && TO_OUTPUT_WARNINGS(p)) {
+ warn_non_literal_string(p, packed_warn, message);
+ }
+
break;
case '8': case '9': /* must be a backreference */
--p;
@@ -17372,6 +17390,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
* is already in 'value'. Otherwise, need to translate the escape
* into what it signifies. */
if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
+ const char * message;
+ U32 packed_warn;
+ U8 grok_c_char;
case 'w': namedclass = ANYOF_WORDCHAR; break;
case 'W': namedclass = ANYOF_NWORDCHAR; break;
@@ -17657,9 +17678,23 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
non_portable_endpoint++;
break;
case 'c':
- value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
- UPDATE_WARNINGS_LOC(RExC_parse);
- RExC_parse++;
+ if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
+ &packed_warn))
+ {
+ /* going to die anyway; point to exact spot of
+ * failure */
+ RExC_parse += (UTF)
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
+ vFAIL(message);
+ }
+
+ value = grok_c_char;
+ RExC_parse++;
+ if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
+ warn_non_literal_string(RExC_parse, packed_warn, message);
+ }
+
non_portable_endpoint++;
break;
case '0': case '1': case '2': case '3': case '4':
diff --git a/t/lib/warnings/regcomp b/t/lib/warnings/regcomp
index 6dcb7897bc..b10680b2a5 100644
--- a/t/lib/warnings/regcomp
+++ b/t/lib/warnings/regcomp
@@ -35,8 +35,8 @@ no warnings 'syntax';
$a = qr/\c,/;
$a = qr/[\c,]/;
EXPECT
-"\c," is more clearly written simply as "l" at - line 9.
-"\c," is more clearly written simply as "l" at - line 10.
+"\c," is more clearly written simply as "l" in regex; marked by <-- HERE in m/\c, <-- HERE / at - line 9.
+"\c," is more clearly written simply as "l" in regex; marked by <-- HERE in m/[\c, <-- HERE ]/ at - line 10.
########
# This is because currently a different error is output under
# use re 'strict', so can't go in reg_mesg.t
diff --git a/t/lib/warnings/toke b/t/lib/warnings/toke
index e66558a5b5..e875874707 100644
--- a/t/lib/warnings/toke
+++ b/t/lib/warnings/toke
@@ -1422,7 +1422,8 @@ use warnings;
my $a = "\c{ack}";
EXPECT
OPTION fatal
-Use ";" instead of "\c{" at - line 9.
+Use ";" instead of "\c{" at - line 9, within string
+Execution of - aborted due to compilation errors.
########
# toke.c
BEGIN {
@@ -1441,7 +1442,8 @@ Sequence "\c{" invalid at - line 9.
my $a = "\câ";
EXPECT
OPTION fatal
-Character following "\c" must be printable ASCII at - line 2.
+Character following "\c" must be printable ASCII at - line 2, within string
+Execution of - aborted due to compilation errors.
########
# toke.c
use warnings 'syntax' ;
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index 1ef912b3a9..c7d51d9ad0 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -283,7 +283,6 @@ my @death =
"m/(?('/" => "Sequence (?('... not terminated {#} m/(?('{#}/",
'm/\g{/' => 'Sequence \g{... not terminated {#} m/\g{{#}/',
'm/\k</' => 'Sequence \k<... not terminated {#} m/\k<{#}/',
- 'm/\cß/' => "Character following \"\\c\" must be printable ASCII",
'/((?# This is a comment in the middle of a token)?:foo)/' => 'In \'(?...)\', the \'(\' and \'?\' must be adjacent {#} m/((?# This is a comment in the middle of a token)?{#}:foo)/',
'/((?# This is a comment in the middle of a token)*FAIL)/' => 'In \'(*VERB...)\', the \'(\' and \'*\' must be adjacent {#} m/((?# This is a comment in the middle of a token)*{#}FAIL)/',
'/((?# This is a comment in the middle of a token)*script_run:foo)/' => 'In \'(*...)\', the \'(\' and \'*\' must be adjacent {#} m/((?# This is a comment in the middle of a token)*{#}script_run:foo)/',
@@ -491,7 +490,8 @@ my @death_utf8 = mark_as_utf8(
'/(?[ \t + \e # ネ This was supposed to be a comment ])/' =>
"Syntax error in (?[...]) {#} m/(?[ \\t + \\e # ネ This was supposed to be a comment ]){#}/",
'm/(*ネ)ネ/' => q<Unknown '(*...)' construct 'ネ' {#} m/(*ネ){#}ネ/>,
- '/\cネ/' => "Character following \"\\c\" must be printable ASCII",
+ '/\cネ/' => "Character following \"\\c\" must be printable ASCII {#} m/\\cネ{#}/",
+ '/[\cネ]/' => "Character following \"\\c\" must be printable ASCII {#} m/[\\cネ{#}]/",
'/\b{ネ}/' => "'ネ' is an unknown bound type {#} m/\\b{ネ{#}}/",
'/\B{ネ}/' => "'ネ' is an unknown bound type {#} m/\\B{ネ{#}}/",
);
diff --git a/toke.c b/toke.c
index 0638b98919..41e69305b6 100644
--- a/toke.c
+++ b/toke.c
@@ -3997,7 +3997,14 @@ S_scan_const(pTHX_ char *start)
case 'c':
s++;
if (s < send) {
- *d++ = grok_bslash_c(*s, 1);
+ const char * message;
+
+ if (! grok_bslash_c(*s, (U8 *) d, &message, NULL)) {
+ yyerror(message);
+ yyquit(); /* Have always immediately croaked on
+ errors in this */
+ }
+ d++;
}
else {
yyerror("Missing control char name in \\c");