summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2022-06-28 09:33:22 -0600
committerKarl Williamson <khw@cpan.org>2022-06-28 10:10:20 -0600
commit31798a685edf1feafb2f251ac5096a193518cf2e (patch)
treeedc4f929ac8e1e188b6ad03f72ca837334f79274
parentb179236d59d4d0be43d9307d30b9e97609c9f96d (diff)
downloadperl-31798a685edf1feafb2f251ac5096a193518cf2e.tar.gz
Partially revert remove deprecated functions"
This partially reverts commit 7008caa915ad99e650acf2aea40612b5e48b7ba2. Another portion was reverted by f847c0b367a25dc6028136ddc7085a7301da7a82. It turns out that two more of the removed functions are still used in cpan in several places. Revert their removal for now until a better alternative than any existing is in place and backported in ppport.h.
-rw-r--r--embed.fnc2
-rw-r--r--embed.h6
-rw-r--r--mathoms.c75
-rw-r--r--proto.h14
4 files changed, 97 insertions, 0 deletions
diff --git a/embed.fnc b/embed.fnc
index b2a6a2cbd8..d098b490ee 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2593,6 +2593,8 @@ CxTdp |U8* |bytes_from_utf8_loc|NN const U8 *s \
|NN bool *is_utf8p \
|NULLOK const U8 ** first_unconverted
Apxd |U8* |bytes_to_utf8 |NN const U8 *s|NN STRLEN *lenp
+ApdDb |UV |utf8_to_uvchr |NN const U8 *s|NULLOK STRLEN *retlen
+CbpdD |UV |utf8_to_uvuni |NN const U8 *s|NULLOK STRLEN *retlen
AMpd |UV |utf8_to_uvchr_buf |NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retlen
Cip |UV |utf8_to_uvchr_buf_helper|NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retlen
p |bool |check_utf8_print |NN const U8 *s|const STRLEN len
diff --git a/embed.h b/embed.h
index b381d5ca93..7e97e477a5 100644
--- a/embed.h
+++ b/embed.h
@@ -703,7 +703,13 @@
#define utf8_hop_safe Perl_utf8_hop_safe
#define utf8_length(a,b) Perl_utf8_length(aTHX_ a,b)
#define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)
+#ifndef NO_MATHOMS
+#define utf8_to_uvchr(a,b) Perl_utf8_to_uvchr(aTHX_ a,b)
+#endif
#define utf8_to_uvchr_buf_helper(a,b,c) Perl_utf8_to_uvchr_buf_helper(aTHX_ a,b,c)
+#ifndef NO_MATHOMS
+#define utf8_to_uvuni(a,b) Perl_utf8_to_uvuni(aTHX_ a,b)
+#endif
#define utf8n_to_uvchr_msgs Perl_utf8n_to_uvchr_msgs
#ifndef NO_MATHOMS
#define utf8n_to_uvuni(a,b,c,d) Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
diff --git a/mathoms.c b/mathoms.c
index 0449622020..5f5155b614 100644
--- a/mathoms.c
+++ b/mathoms.c
@@ -821,6 +821,40 @@ Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
return isUTF8_CHAR(buf, buf_end);
}
+/*
+=for apidoc_section $unicode
+=for apidoc utf8_to_uvuni
+
+Returns the Unicode code point of the first character in the string C<s>
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer, which
+is one reason why this function is deprecated. The other is that only in
+extremely limited circumstances should the Unicode versus native code point be
+of any interest to you.
+
+If C<s> points to one of the detected malformations, and UTF8 warnings are
+enabled, zero is returned and C<*retlen> is set (if C<retlen> doesn't point to
+NULL) to -1. If those warnings are off, the computed value if well-defined (or
+the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
+is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
+next possible position in C<s> that could begin a non-malformed character.
+See L<perlapi/utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
+{
+ PERL_UNUSED_CONTEXT;
+ PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
+
+ return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
+}
+
/* return ptr to little string in big string, NULL if not found */
/* The original version of this routine was donated by Corey Satten. */
@@ -895,6 +929,47 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
}
+/*
+=for apidoc_section $unicode
+=for apidoc utf8_to_uvchr
+
+Returns the native code point of the first character in the string C<s>
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer, which
+is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead.
+
+If C<s> points to one of the detected malformations, and UTF8 warnings are
+enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
+C<NULL>) to -1. If those warnings are off, the computed value if well-defined (or
+the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
+is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
+next possible position in C<s> that could begin a non-malformed character.
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
+{
+ PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
+
+ /* This function is unsafe if malformed UTF-8 input is given it, which is
+ * why the function is deprecated. If the first byte of the input
+ * indicates that there are more bytes remaining in the sequence that forms
+ * the character than there are in the input buffer, it can read past the
+ * end. But we can make it safe if the input string happens to be
+ * NUL-terminated, as many strings in Perl are, by refusing to read past a
+ * NUL, which is what UTF8_CHK_SKIP() does. A NUL indicates the start of
+ * the next character anyway. If the input isn't NUL-terminated, the
+ * function remains unsafe, as it always has been. */
+
+ return utf8_to_uvchr_buf(s, s + UTF8_CHK_SKIP(s), retlen);
+}
+
GCC_DIAG_RESTORE
#endif /* NO_MATHOMS */
diff --git a/proto.h b/proto.h
index f9c63dc7a9..992e2be53a 100644
--- a/proto.h
+++ b/proto.h
@@ -4401,6 +4401,13 @@ PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp);
PERL_CALLCONV U8* Perl_utf8_to_utf16_base(pTHX_ U8* s, U8 *d, Size_t bytelen, Size_t *newlen, const bool high, const bool low);
#define PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE \
assert(s); assert(d); assert(newlen)
+#ifndef NO_MATHOMS
+PERL_CALLCONV UV Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
+ __attribute__deprecated__;
+#define PERL_ARGS_ASSERT_UTF8_TO_UVCHR \
+ assert(s)
+#endif
+
PERL_CALLCONV UV Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen);
#define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF \
assert(s); assert(send)
@@ -4409,6 +4416,13 @@ PERL_STATIC_INLINE UV Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8
#define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER \
assert(s); assert(send)
#endif
+#ifndef NO_MATHOMS
+PERL_CALLCONV UV Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
+ __attribute__deprecated__;
+#define PERL_ARGS_ASSERT_UTF8_TO_UVUNI \
+ assert(s)
+#endif
+
PERL_CALLCONV UV Perl_utf8n_to_uvchr(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags);
#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR \
assert(s)