Move some obsolete UTF-8 handling fcns to mathoms

Two of the functions are internal to the core; the third has long been deprecated.
author: Karl Williamson <khw@cpan.org> 2020-02-15 11:15:09 -0700
committer: Karl Williamson <khw@cpan.org> 2020-02-19 18:22:53 -0700
commit: 86a5062a9c511e105120e7902a13b2167b830528 (patch)
tree: c8e99b63ca6029c4f77b2d829e7c0f8f8327fb8a /mathoms.c
parent: 4208b378031bca5deb79d0bdb058a142d219134a (diff)
download: perl-86a5062a9c511e105120e7902a13b2167b830528.tar.gz
1 files changed, 96 insertions, 0 deletions
diff --git a/mathoms.c b/mathoms.c
index b0122f95ca..a0728f6913 100644
--- a/mathoms.c
+++ b/mathoms.c
@@ -1321,6 +1321,102 @@ Perl_sv_2pvbyte(pTHX_ SV *sv, STRLEN *const lp)
     return sv_2pvbyte(sv, lp);
 }
 
+U8 *
+Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+{
+    PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
+
+    return uvoffuni_to_utf8_flags(d, uv, 0);
+}
+
+/*
+=for apidoc utf8n_to_uvuni
+
+Instead use L<perlapi/utf8_to_uvchr_buf>, or rarely, L<perlapi/utf8n_to_uvchr>.
+
+This function was useful for code that wanted to handle both EBCDIC and
+ASCII platforms with Unicode properties, but starting in Perl v5.20, the
+distinctions between the platforms have mostly been made invisible to most
+code, so this function is quite unlikely to be what you want.  If you do need
+this precise functionality, use instead
+C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|perlapi/utf8_to_uvchr_buf>>
+or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|perlapi/utf8n_to_uvchr>>.
+
+=cut
+*/
+
+UV
+Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
+{
+    PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
+
+    return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
+}
+
+/*
+=for apidoc uvuni_to_utf8_flags
+
+Instead you almost certainly want to use L<perlapi/uvchr_to_utf8> or
+L<perlapi/uvchr_to_utf8_flags>.
+
+This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
+which itself, while not deprecated, should be used only in isolated
+circumstances.  These functions were useful for code that wanted to handle
+both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
+v5.20, the distinctions between the platforms have mostly been made invisible
+to most code, so this function is quite unlikely to be what you want.
+
+=cut
+*/
+
+U8 *
+Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+{
+    PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
+
+    return uvoffuni_to_utf8_flags(d, uv, flags);
+}
+
+/*
+=for apidoc utf8_to_uvchr
+
+Returns the native code point of the first character in the string C<s>
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer, which
+is why this function is deprecated.  Use L</utf8_to_uvchr_buf> instead.
+
+If C<s> points to one of the detected malformations, and UTF8 warnings are
+enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
+C<NULL>) to -1.  If those warnings are off, the computed value if well-defined (or
+the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
+is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
+next possible position in C<s> that could begin a non-malformed character.
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
+
+    /* This function is unsafe if malformed UTF-8 input is given it, which is
+     * why the function is deprecated.  If the first byte of the input
+     * indicates that there are more bytes remaining in the sequence that forms
+     * the character than there are in the input buffer, it can read past the
+     * end.  But we can make it safe if the input string happens to be
+     * NUL-terminated, as many strings in Perl are, by refusing to read past a
+     * NUL, which is what UTF8_CHK_SKIP() does.  A NUL indicates the start of
+     * the next character anyway.  If the input isn't NUL-terminated, the
+     * function remains unsafe, as it always has been. */
+
+    return utf8_to_uvchr_buf(s, s + UTF8_CHK_SKIP(s), retlen);
+}
+
 GCC_DIAG_RESTORE
 
 #endif /* NO_MATHOMS */
author	Karl Williamson <khw@cpan.org>	2020-02-15 11:15:09 -0700
committer	Karl Williamson <khw@cpan.org>	2020-02-19 18:22:53 -0700
commit	86a5062a9c511e105120e7902a13b2167b830528 (patch)
tree	c8e99b63ca6029c4f77b2d829e7c0f8f8327fb8a /mathoms.c
parent	4208b378031bca5deb79d0bdb058a142d219134a (diff)
download	perl-86a5062a9c511e105120e7902a13b2167b830528.tar.gz