summary | refs | log | tree | commit | diff
path: root/utf8.c
diff options
context:
space:
mode:
author Karl Williamson <public@khwilliamson.com> 2012-03-19 15:38:06 -0600
committer Karl Williamson <public@khwilliamson.com> 2012-03-19 18:23:44 -0600
commit 4b88fb76efce8c436e63b907c9842345d4fa77c7 (patch)
tree 67d8be3146bf0c32e93bd8209c141ed72c5a0ae2 /utf8.c
parent 27d6c58a7e12243bef66c58b38e7d1415d9ca07e (diff)
download perl-4b88fb76efce8c436e63b907c9842345d4fa77c7.tar.gz
Use the new utf8 to code point functions
These functions should be used in preference to the old ones, which can read beyond the end of the input string.
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 46
1 files changed, 25 insertions, 21 deletions
diff --git a/utf8.c b/utf8.c
index c9bc63a001..85bf2f00c8 100644
--- a/utf8.c
+++ b/utf8.c
@@ -839,7 +839,11 @@ Returns the native code point of the first character in the string C<s>
which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
-If C<s> does not point to a well-formed UTF-8 character, zero is
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer.
+Use L</utf8_to_uvchr_buf> instead.
+
+If C<s> points to one of the detected malformations, zero is
returned and C<retlen> is set, if possible, to -1.
=cut
@@ -850,8 +854,7 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
{
PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
- return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
- ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ return valid_utf8_to_uvchr(s, retlen);
}
/*
@@ -902,10 +905,11 @@ Returns the Unicode code point of the first character in the string C<s>
which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
-This function should only be used when the returned UV is considered
-an index into the Unicode semantic tables (e.g. swashes).
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer.
+Use L</utf8_to_uvuni_buf> instead.
-If C<s> does not point to a well-formed UTF-8 character, zero is
+If C<s> points to one of the detected malformations, zero is
returned and C<retlen> is set, if possible, to -1.
=cut
@@ -916,9 +920,7 @@ Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
{
PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
- /* Call the low level routine asking for checks */
- return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
- ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ return valid_utf8_to_uvuni(s, retlen);
}
/*
@@ -1128,7 +1130,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
d = s = save;
while (s < send) {
STRLEN ulen;
- *d++ = (U8)utf8_to_uvchr(s, &ulen);
+ *d++ = (U8)utf8_to_uvchr_buf(s, send, &ulen);
s += ulen;
}
*d = '\0';
@@ -2154,7 +2156,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
dVAR;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
STRLEN len = 0;
- const UV uv0 = utf8_to_uvchr(p, NULL);
+ const UV uv0 = valid_utf8_to_uvchr(p, NULL);
/* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
* are necessary in EBCDIC, they are redundant no-ops
* in ASCII-ish platforms, and hopefully optimized away. */
@@ -2302,7 +2304,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
bad_crossing:
/* Failed, have to return the original */
- original = utf8_to_uvchr(p, lenp);
+ original = valid_utf8_to_uvchr(p, lenp);
Copy(p, ustrp, *lenp, char);
return original;
}
@@ -3508,7 +3510,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
"unexpectedly is not a string, flags=%lu",
(unsigned long)SvFLAGS(sv_to));
}
- /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", utf8_to_uvchr((U8*) char_from, 0), utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
+ /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
/* Each key in the inverse list is a mapped-to value, and the key's
* hash value is a list of the strings (each in utf8) that map to
@@ -3575,7 +3577,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
}
- /* For debugging: UV u = utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
+ /* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
for (j = 0; j <= av_len(from_list); j++) {
entryp = av_fetch(from_list, j, FALSE);
if (entryp == NULL) {
@@ -3583,9 +3585,11 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
}
/* When i==j this adds itself to the list */
- av_push(i_list, newSVuv(utf8_to_uvchr(
- (U8*) SvPVX(*entryp), 0)));
- /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
+ av_push(i_list, newSVuv(utf8_to_uvchr_buf(
+ (U8*) SvPVX(*entryp),
+ (U8*) SvPVX(*entryp) + SvCUR(*entryp),
+ 0)));
+ /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
}
}
}
@@ -3931,7 +3935,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
STRLEN char_len;
if (UTF8_IS_SUPER(s)) {
if (ckWARN_d(WARN_NON_UNICODE)) {
- UV uv = utf8_to_uvchr(s, &char_len);
+ UV uv = utf8_to_uvchr_buf(s, e, &char_len);
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
ok = FALSE;
@@ -3939,7 +3943,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
}
else if (UTF8_IS_SURROGATE(s)) {
if (ckWARN_d(WARN_SURROGATE)) {
- UV uv = utf8_to_uvchr(s, &char_len);
+ UV uv = utf8_to_uvchr_buf(s, e, &char_len);
Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
"Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
ok = FALSE;
@@ -3949,7 +3953,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
&& (ckWARN_d(WARN_NONCHAR)))
{
- UV uv = utf8_to_uvchr(s, &char_len);
+ UV uv = utf8_to_uvchr_buf(s, e, &char_len);
Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
"Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
ok = FALSE;
@@ -3999,7 +4003,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f
truncated++;
break;
}
- u = utf8_to_uvchr((U8*)s, 0);
+ u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
if (u < 256) {
const unsigned char c = (unsigned char)u & 0xFF;
if (flags & UNI_DISPLAY_BACKSLASH) {