diff options
author | Karl Williamson <khw@cpan.org> | 2014-05-05 22:46:48 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2014-05-31 11:54:50 -0600 |
commit | f2645549e6a4bfca055b5fd1932da462df424921 (patch) | |
tree | 07b6b78d7f56b4cd966883167e5973d450342401 /mathoms.c | |
parent | 3cedd9d93070bd6f0cb074a3013165cd9a630fca (diff) | |
download | perl-f2645549e6a4bfca055b5fd1932da462df424921.tar.gz |
Move some deprecated utf8-handling functions to mathoms
This entailed creating new internal functions for some of them to call
so that the functionality can be retained during the deprecation period.
Diffstat (limited to 'mathoms.c')
-rw-r--r-- | mathoms.c | 162 |
1 files changed, 162 insertions, 0 deletions
@@ -1233,6 +1233,22 @@ ASCII_TO_NEED(const UV enc, const UV ch) return ch; } +bool /* Made into a function, so can be deprecated */ +Perl_isIDFIRST_lazy(pTHX_ const char* p) +{ + PERL_ARGS_ASSERT_ISIDFIRST_LAZY; + + return isIDFIRST_lazy_if(p,1); +} + +bool /* Made into a function, so can be deprecated */ +Perl_isALNUM_lazy(pTHX_ const char* p) +{ + PERL_ARGS_ASSERT_ISALNUM_LAZY; + + return isALNUM_lazy_if(p,1); +} + bool Perl_is_uni_alnum(pTHX_ UV c) { @@ -1368,6 +1384,54 @@ Perl_is_uni_digit_lc(pTHX_ UV c) } bool +Perl_is_uni_idfirst(pTHX_ UV c) +{ + U8 tmpbuf[UTF8_MAXBYTES+1]; + uvchr_to_utf8(tmpbuf, c); + return _is_utf8_idstart(tmpbuf); +} + +bool +Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */ +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_IDFIRST; + + return _is_utf8_idstart(p); +} + +bool +Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */ +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST; + + return _is_utf8_xidstart(p); +} + +bool +Perl_is_utf8_idcont(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_IDCONT; + + return _is_utf8_idcont(p); +} + +bool +Perl_is_utf8_xidcont(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_XIDCONT; + + return _is_utf8_xidcont(p); +} + +bool Perl_is_uni_upper_lc(pTHX_ UV c) { return isUPPER_LC_uvchr(c); @@ -1628,6 +1692,104 @@ Perl_is_utf8_mark(pTHX_ const U8 *p) return _is_utf8_mark(p); } +/* +=for apidoc is_utf8_char + +Tests if some arbitrary number of bytes begins in a valid UTF-8 +character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines) +character is a valid UTF-8 character. The actual number of bytes in the UTF-8 +character will be returned if it is valid, otherwise 0. + +This function is deprecated due to the possibility that malformed input could +cause reading beyond the end of the input buffer. Use L</isUTF8_CHAR> +instead. 
+ +=cut */ + +STRLEN +Perl_is_utf8_char(const U8 *s) +{ + PERL_ARGS_ASSERT_IS_UTF8_CHAR; + + /* Assumes we have enough space, which is why this is deprecated */ + return isUTF8_CHAR(s, s + UTF8SKIP(s)); +} + +/* DEPRECATED! + * Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that + * there are no malformations in the input UTF-8 string C<s>. Surrogates, + * non-character code points, and non-Unicode code points are allowed */ + +UV +Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) +{ + PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI; + + return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen)); +} + +/* +=for apidoc utf8_to_uvchr + +Returns the native code point of the first character in the string C<s> +which is assumed to be in UTF-8 encoding; C<retlen> will be set to the +length, in bytes, of that character. + +Some, but not all, UTF-8 malformations are detected, and in fact, some +malformed input could cause reading beyond the end of the input buffer, which +is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead. + +If C<s> points to one of the detected malformations, and UTF8 warnings are +enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't +NULL) to -1. If those warnings are off, the computed value, if well-defined (or +the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> +is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the +next possible position in C<s> that could begin a non-malformed character. +See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned. + +=cut +*/ + +UV +Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) +{ + PERL_ARGS_ASSERT_UTF8_TO_UVCHR; + + return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen); +} + +/* +=for apidoc utf8_to_uvuni + +Returns the Unicode code point of the first character in the string C<s> +which is assumed to be in UTF-8 encoding; C<retlen> will be set to the +length, in bytes, of that character. 
+ +Some, but not all, UTF-8 malformations are detected, and in fact, some +malformed input could cause reading beyond the end of the input buffer, which +is one reason why this function is deprecated. The other is that only in +extremely limited circumstances should the Unicode versus native code point be +of any interest to you. See L</utf8_to_uvuni_buf> for alternatives. + +If C<s> points to one of the detected malformations, and UTF8 warnings are +enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't +NULL) to -1. If those warnings are off, the computed value, if well-defined (or +the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> +is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the +next possible position in C<s> that could begin a non-malformed character. +See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned. + +=cut +*/ + +UV +Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) +{ + PERL_ARGS_ASSERT_UTF8_TO_UVUNI; + + return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen)); +} + END_EXTERN_C #endif /* NO_MATHOMS */ |