From 7cf8d05d1e856f3bd3a392b3ccea008f1c1eb743 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 28 Sep 2016 15:05:17 -0600 Subject: Add details to UTF-8 malformation error messages I've long been unsatisfied with the information contained in the error/warning messages raised when some input is malformed UTF-8, but have been reluctant to change the text in case someone is relying on it. One reason that someone might be parsing the messages is that there has been no convenient way to otherwise pin down what the exact malformation might be. A few commits from now will add a facility to get the type of malformation unambiguously. This will be a better mechanism to use for those rare modules that need to know what the exact malformation is. So, I will fix and issue pull requests for any module broken by this commit. The messages are changed by now dumping (in \xXY format) the bytes that make up the malformed character, and extra details are added in most cases. Messages about overlongs now display the code point they evaluate to and what the shortest UTF-8 sequence for generating that code point is. Messages about overflowing now just display that it overflows, since the entire byte sequence is now dumped. The previous message displayed just the byte which was being processed where overflow was detected, but that information is not at all meaningful. 
--- embed.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'embed.h') diff --git a/embed.h b/embed.h index 6045ec75d2..d9c270920e 100644 --- a/embed.h +++ b/embed.h @@ -1821,6 +1821,7 @@ #define isa_lookup(a,b,c,d) S_isa_lookup(aTHX_ a,b,c,d) # endif # if defined(PERL_IN_UTF8_C) +#define _byte_dump_string(a,b) S__byte_dump_string(aTHX_ a,b) #define _to_utf8_case(a,b,c,d,e,f,g) S__to_utf8_case(aTHX_ a,b,c,d,e,f,g) #define check_locale_boundary_crossing(a,b,c,d) S_check_locale_boundary_crossing(aTHX_ a,b,c,d) #define is_utf8_common(a,b,c,d) S_is_utf8_common(aTHX_ a,b,c,d) @@ -1828,7 +1829,7 @@ #define swash_scan_list_line(a,b,c,d,e,f,g) S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g) #define swatch_get(a,b,c) S_swatch_get(aTHX_ a,b,c) #define to_lower_latin1 S_to_lower_latin1 -#define unexpected_non_continuation_text(a,b) S_unexpected_non_continuation_text(aTHX_ a,b) +#define unexpected_non_continuation_text(a,b,c,d) S_unexpected_non_continuation_text(aTHX_ a,b,c,d) # endif # if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) #define _to_upper_title_latin1(a,b,c,d) Perl__to_upper_title_latin1(aTHX_ a,b,c,d) -- cgit v1.2.1