diff options
author | Karl Williamson <khw@cpan.org> | 2016-09-28 15:05:17 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-10-13 11:18:11 -0600 |
commit | 7cf8d05d1e856f3bd3a392b3ccea008f1c1eb743 (patch) | |
tree | 3387705d72ad140eb4ab6412aa70adb94afc6bb5 /embed.h | |
parent | 806547a7dc29226b6a06672e1d42fb136e766510 (diff) | |
download | perl-7cf8d05d1e856f3bd3a392b3ccea008f1c1eb743.tar.gz |
Add details to UTF-8 malformation error messages
I've long been unsatisfied with the information contained in the
error/warning messages raised when some input is malformed UTF-8, but
have been reluctant to change the text in case someone is relying on
it. One reason that someone might be parsing the messages is that there
has been no convenient way to otherwise pin down what the exact
malformation might be. A few commits from now will add a facility
to get the type of malformation unambiguously. This will be a better
mechanism to use for those rare modules that need to know what the
exact malformation is.
So, I will fix and issue pull requests for any module broken by this
commit.
The messages are changed by now dumping (in \xXY format) the bytes that
make up the malformed character, and extra details are added in most
cases.
Messages about overlongs now display the code point they evaluate to and
what the shortest UTF-8 sequence for generating that code point is.
Messages about overflowing now just display that it overflows, since the
entire byte sequence is now dumped. The previous message displayed just
the byte which was being processed where overflow was detected, but that
information is not at all meaningful.
Diffstat (limited to 'embed.h')
-rw-r--r-- | embed.h | 3 |
1 files changed, 2 insertions, 1 deletions
@@ -1821,6 +1821,7 @@ #define isa_lookup(a,b,c,d) S_isa_lookup(aTHX_ a,b,c,d) # endif # if defined(PERL_IN_UTF8_C) +#define _byte_dump_string(a,b) S__byte_dump_string(aTHX_ a,b) #define _to_utf8_case(a,b,c,d,e,f,g) S__to_utf8_case(aTHX_ a,b,c,d,e,f,g) #define check_locale_boundary_crossing(a,b,c,d) S_check_locale_boundary_crossing(aTHX_ a,b,c,d) #define is_utf8_common(a,b,c,d) S_is_utf8_common(aTHX_ a,b,c,d) @@ -1828,7 +1829,7 @@ #define swash_scan_list_line(a,b,c,d,e,f,g) S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g) #define swatch_get(a,b,c) S_swatch_get(aTHX_ a,b,c) #define to_lower_latin1 S_to_lower_latin1 -#define unexpected_non_continuation_text(a,b) S_unexpected_non_continuation_text(aTHX_ a,b) +#define unexpected_non_continuation_text(a,b,c,d) S_unexpected_non_continuation_text(aTHX_ a,b,c,d) # endif # if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) #define _to_upper_title_latin1(a,b,c,d) Perl__to_upper_title_latin1(aTHX_ a,b,c,d) |