Move some deprecated utf8-handling functions to mathoms

This entailed creating new internal functions for some of them to call so that the functionality can be retained during the deprecation period.
author: Karl Williamson <khw@cpan.org> 2014-05-05 22:46:48 -0600
committer: Karl Williamson <khw@cpan.org> 2014-05-31 11:54:50 -0600
commit: f2645549e6a4bfca055b5fd1932da462df424921 (patch)
tree: 07b6b78d7f56b4cd966883167e5973d450342401 /mathoms.c
parent: 3cedd9d93070bd6f0cb074a3013165cd9a630fca (diff)
download: perl-f2645549e6a4bfca055b5fd1932da462df424921.tar.gz
1 files changed, 162 insertions, 0 deletions
diff --git a/mathoms.c b/mathoms.c
index 1132ac80f0..732e2a4398 100644
--- a/mathoms.c
+++ b/mathoms.c
@@ -1233,6 +1233,22 @@ ASCII_TO_NEED(const UV enc, const UV ch)
     return ch;
 }
 
+bool      /* Made into a function, so can be deprecated */
+Perl_isIDFIRST_lazy(pTHX_ const char* p)
+{
+    PERL_ARGS_ASSERT_ISIDFIRST_LAZY;
+
+    return isIDFIRST_lazy_if(p,1);
+}
+
+bool      /* Made into a function, so can be deprecated */
+Perl_isALNUM_lazy(pTHX_ const char* p)
+{
+    PERL_ARGS_ASSERT_ISALNUM_LAZY;
+
+    return isALNUM_lazy_if(p,1);
+}
+
 bool
 Perl_is_uni_alnum(pTHX_ UV c)
 {
@@ -1368,6 +1384,54 @@ Perl_is_uni_digit_lc(pTHX_ UV c)
 }
 
 bool
+Perl_is_uni_idfirst(pTHX_ UV c)
+{
+    U8 tmpbuf[UTF8_MAXBYTES+1];
+    uvchr_to_utf8(tmpbuf, c);
+    return _is_utf8_idstart(tmpbuf);
+}
+
+bool
+Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
+
+    return _is_utf8_idstart(p);
+}
+
+bool
+Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
+
+    return _is_utf8_xidstart(p);
+}
+
+bool
+Perl_is_utf8_idcont(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
+
+    return _is_utf8_idcont(p);
+}
+
+bool
+Perl_is_utf8_xidcont(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
+
+    return _is_utf8_xidcont(p);
+}
+
+bool
 Perl_is_uni_upper_lc(pTHX_ UV c)
 {
     return isUPPER_LC_uvchr(c);
@@ -1628,6 +1692,104 @@ Perl_is_utf8_mark(pTHX_ const U8 *p)
     return _is_utf8_mark(p);
 }
 
+/*
+=for apidoc is_utf8_char
+
+Tests if some arbitrary number of bytes begins with a valid UTF-8
+character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
+character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
+character will be returned if it is valid, otherwise 0.
+
+This function is deprecated due to the possibility that malformed input could
+cause reading beyond the end of the input buffer.  Use L</isUTF8_CHAR>
+instead.
+
+=cut */
+
+STRLEN
+Perl_is_utf8_char(const U8 *s)
+{
+    PERL_ARGS_ASSERT_IS_UTF8_CHAR;
+
+    /* Assumes we have enough space, which is why this is deprecated */
+    return isUTF8_CHAR(s, s + UTF8SKIP(s));
+}
+
+/* DEPRECATED!
+ * Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that
+ * there are no malformations in the input UTF-8 string C<s>.  Surrogates,
+ * non-character code points, and non-Unicode code points are allowed */
+
+UV
+Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;
+
+    return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
+}
+
+/*
+=for apidoc utf8_to_uvchr
+
+Returns the native code point of the first character in the string C<s>
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer, which
+is why this function is deprecated.  Use L</utf8_to_uvchr_buf> instead.
+
+If C<s> points to one of the detected malformations, and UTF8 warnings are
+enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
+NULL) to -1.  If those warnings are off, the computed value, if well-defined (or
+the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
+is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
+next possible position in C<s> that could begin a non-malformed character.
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
+
+    return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
+}
+
+/*
+=for apidoc utf8_to_uvuni
+
+Returns the Unicode code point of the first character in the string C<s>
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer, which
+is one reason why this function is deprecated.  The other is that only in
+extremely limited circumstances should the Unicode versus native code point be
+of any interest to you.  See L</utf8_to_uvuni_buf> for alternatives.
+
+If C<s> points to one of the detected malformations, and UTF8 warnings are
+enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
+NULL) to -1.  If those warnings are off, the computed value, if well-defined (or
+the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
+is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
+next possible position in C<s> that could begin a non-malformed character.
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
+
+    return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
+}
+
 END_EXTERN_C
 
 #endif /* NO_MATHOMS */
author	Karl Williamson <khw@cpan.org>	2014-05-05 22:46:48 -0600
committer	Karl Williamson <khw@cpan.org>	2014-05-31 11:54:50 -0600
commit	f2645549e6a4bfca055b5fd1932da462df424921 (patch)
tree	07b6b78d7f56b4cd966883167e5973d450342401 /mathoms.c
parent	3cedd9d93070bd6f0cb074a3013165cd9a630fca (diff)
download	perl-f2645549e6a4bfca055b5fd1932da462df424921.tar.gz