utf8.c: Add utf8_to_uvchr_buf() and utf8_to_uvuni_buf()

The existing functions (utf8_to_uvchr and utf8_to_uvuni) have a deficiency: given malformed input, they can read beyond the end of the input string. This commit adds two new functions that behave as the old ones did, but each takes an extra parameter, send, which points one past the end of the string, so that no read beyond it is done.
author: Karl Williamson <public@khwilliamson.com> 2012-03-19 15:03:01 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-03-19 18:23:44 -0600
commit: ec5f19d09949aac9034bb62ade44ffba8d4d2bb1 (patch)
tree: aee587b46e00f843fa393dcc6427dcbf632c1bd9 /utf8.c
parent: d0460f306d2b79d09a9e5694f9f72c50a2481b83 (diff)
download: perl-ec5f19d09949aac9034bb62ade44ffba8d4d2bb1.tar.gz
1 files changed, 54 insertions, 1 deletions
diff --git a/utf8.c b/utf8.c
index 0aede4c3a6..1faa96d9fb 100644
--- a/utf8.c
+++ b/utf8.c
@@ -563,7 +563,7 @@ All other code points corresponding to Unicode characters, including private
 use and those yet to be assigned, are never considered malformed and never
 warn.
 
-Most code should use L</utf8_to_uvchr>() rather than call this directly.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
 
 =cut
 */
@@ -795,6 +795,31 @@ malformed:
 }
 
 /*
+=for apidoc utf8_to_uvchr_buf
+
+Returns the native code point of the first character in the string C<s> which
+is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
+C<retlen> will be set to the length, in bytes, of that character.
+
+If C<s> does not point to a well-formed UTF-8 character, zero is
+returned and C<retlen> is set, if possible, to -1.
+
+=cut
+*/
+
+
+UV
+Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
+
+    assert(s < send);  /* the range [s, send) must hold at least one byte */
+
+    return utf8n_to_uvchr(s, send - s, retlen,  /* send - s: bytes available from s up to send */
+			  ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);  /* flags: 0 when ckWARN_d(WARN_UTF8) is true, else UTF8_ALLOW_ANY */
+}
+
+/*
 =for apidoc utf8_to_uvchr
 
 Returns the native code point of the first character in the string C<s>
@@ -817,6 +842,34 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 }
 
 /*
+=for apidoc utf8_to_uvuni_buf
+
+Returns the Unicode code point of the first character in the string C<s> which
+is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
+C<retlen> will be set to the length, in bytes, of that character.
+
+This function should only be used when the returned UV is considered
+an index into the Unicode semantic tables (e.g. swashes).
+
+If C<s> does not point to a well-formed UTF-8 character, zero is
+returned and C<retlen> is set, if possible, to -1.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
+
+    assert(send > s);  /* the range [s, send) must hold at least one byte */
+
+    /* Call the low level routine asking for checks; send - s is the number
+     * of bytes between s and send, so the string's upper limit is passed on */
+    return Perl_utf8n_to_uvuni(aTHX_ s, send -s, retlen,
+			       ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);  /* flags: 0 when ckWARN_d(WARN_UTF8) is true, else UTF8_ALLOW_ANY */
+}
+
+/*
 =for apidoc utf8_to_uvuni
 
 Returns the Unicode code point of the first character in the string C<s>
author	Karl Williamson <public@khwilliamson.com>	2012-03-19 15:03:01 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-03-19 18:23:44 -0600
commit	ec5f19d09949aac9034bb62ade44ffba8d4d2bb1 (patch)
tree	aee587b46e00f843fa393dcc6427dcbf632c1bd9 /utf8.c
parent	d0460f306d2b79d09a9e5694f9f72c50a2481b83 (diff)
download	perl-ec5f19d09949aac9034bb62ade44ffba8d4d2bb1.tar.gz