summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-12-23 10:03:16 -0700
committerKarl Williamson <public@khwilliamson.com>2012-12-23 10:53:36 -0700
commit281235491e0eef7be051126b2c99109c4f5332be (patch)
tree4a554c8dedb3aeae2eea6730b635aaf2a200e5ba /utf8.c
parenta37c82b6b1246893f697942026f005f7e60ec399 (diff)
downloadperl-281235491e0eef7be051126b2c99109c4f5332be.tar.gz
Deprecate calling isFOO_utf8() with malformed UTF-8
handy.h has character classification macros, such as isALPHA_utf8(), that determine whether a UTF-8 encoded character is of a given type FOO. Code that calls these should have first made sure that the parameter is legal UTF-8. Prior to this patch, false was silently returned for all illegal UTF-8. Now, in most instances, a deprecation warning is raised. This is to catch bugs, and to prepare for the eventual elimination of this check, which fails to catch read-off-end-of-buffer malformations anyway. (One idea would be to leave the check in for DEBUGGING builds.) The cases in which this commit raises no deprecation warning are the classes where the character does not have to be converted to a code point for its inclusion to be determined. For example, if malformed UTF-8 is checked to see if it is ASCII, we only need to check that it is one of the 128 ASCII characters. If it isn't, we don't bother to see if it is malformed or not. There are other cases as well, such as with isSPACE(), where we check whether the UTF-8 is one of a small, fixed set of characters, without checking for malformedness. This commit causes a number of apparent bugs to be shown by the Perl test suite. These do not cause actual failures.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c16
1 files changed, 13 insertions, 3 deletions
diff --git a/utf8.c b/utf8.c
index 930b148419..ec4e62799b 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2028,16 +2028,26 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
PERL_ARGS_ASSERT_IS_UTF8_COMMON;
/* The API should have included a length for the UTF-8 character in <p>,
- * but it doesn't. We therefor assume that p has been validated at least
+ * but it doesn't. We therefore assume that p has been validated at least
* as far as there being enough bytes available in it to accommodate the
* character without reading beyond the end, and pass that number on to the
* validating routine */
- if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
- return FALSE;
+ if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) {
+ if (ckWARN_d(WARN_UTF8)) {
+ Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
+ "It is deprecated to pass malformed UTF-8 to character classification macros, for \"%s\"", swashname);
+ if (ckWARN(WARN_UTF8)) { /* This will output details as to the
+ nature of the malformation */
+ utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
+ }
+ }
+ return FALSE;
+ }
if (!*swash) {
U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
*swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags);
}
+
return swash_fetch(*swash, p, TRUE) != 0;
}