summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc16
-rw-r--r--embed.h8
-rw-r--r--mathoms.c2
-rw-r--r--proto.h43
-rw-r--r--utf8.c183
-rw-r--r--utf8.h11
6 files changed, 101 insertions, 162 deletions
diff --git a/embed.fnc b/embed.fnc
index 5372149771..f18ecb4560 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1543,28 +1543,18 @@ ApMd |U8* |bytes_to_utf8 |NN const U8 *s|NN STRLEN *len
ApdD |UV |utf8_to_uvchr |NN const U8 *s|NULLOK STRLEN *retlen
ApdD |UV |utf8_to_uvuni |NN const U8 *s|NULLOK STRLEN *retlen
ApMD |UV |valid_utf8_to_uvuni |NN const U8 *s|NULLOK STRLEN *retlen
-Apd |UV |utf8_to_uvchr_buf |NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retlen
+Amd |UV |utf8_to_uvchr_buf |NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retlen
ApdD |UV |utf8_to_uvuni_buf |NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retlen
pM |bool |check_utf8_print |NN const U8 *s|const STRLEN len
-#ifdef EBCDIC
Adp |UV |utf8n_to_uvchr |NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags
-#else
-Adpbm |UV |utf8n_to_uvchr |NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags
-#endif
ApM |UV |valid_utf8_to_uvchr |NN const U8 *s|NULLOK STRLEN *retlen
-Adp |UV |utf8n_to_uvoffuni|NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags
Ap |UV |utf8n_to_uvuni|NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags
-#ifdef EBCDIC
-Apd |U8* |uvchr_to_utf8 |NN U8 *d|UV uv
-#else
-Apdbm |U8* |uvchr_to_utf8 |NN U8 *d|UV uv
-#endif
-
+Adm |U8* |uvchr_to_utf8 |NN U8 *d|UV uv
Ap |U8* |uvuni_to_utf8 |NN U8 *d|UV uv
-Ap |U8* |uvchr_to_utf8_flags |NN U8 *d|UV uv|UV flags
+Adm |U8* |uvchr_to_utf8_flags |NN U8 *d|UV uv|UV flags
Apd |U8* |uvoffuni_to_utf8_flags |NN U8 *d|UV uv|UV flags
Ap |U8* |uvuni_to_utf8_flags |NN U8 *d|UV uv|UV flags
Apd |char* |pv_uni_display |NN SV *dsv|NN const U8 *spv|STRLEN len|STRLEN pvlim|UV flags
diff --git a/embed.h b/embed.h
index 110f735492..5ce9ed0d1f 100644
--- a/embed.h
+++ b/embed.h
@@ -693,12 +693,10 @@
#define utf8_length(a,b) Perl_utf8_length(aTHX_ a,b)
#define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)
#define utf8_to_uvchr(a,b) Perl_utf8_to_uvchr(aTHX_ a,b)
-#define utf8_to_uvchr_buf(a,b,c) Perl_utf8_to_uvchr_buf(aTHX_ a,b,c)
#define utf8_to_uvuni(a,b) Perl_utf8_to_uvuni(aTHX_ a,b)
#define utf8_to_uvuni_buf(a,b,c) Perl_utf8_to_uvuni_buf(aTHX_ a,b,c)
-#define utf8n_to_uvoffuni(a,b,c,d) Perl_utf8n_to_uvoffuni(aTHX_ a,b,c,d)
+#define utf8n_to_uvchr(a,b,c,d) Perl_utf8n_to_uvchr(aTHX_ a,b,c,d)
#define utf8n_to_uvuni(a,b,c,d) Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
-#define uvchr_to_utf8_flags(a,b,c) Perl_uvchr_to_utf8_flags(aTHX_ a,b,c)
#define uvoffuni_to_utf8_flags(a,b,c) Perl_uvoffuni_to_utf8_flags(aTHX_ a,b,c)
#define uvuni_to_utf8(a,b) Perl_uvuni_to_utf8(aTHX_ a,b)
#define uvuni_to_utf8_flags(a,b,c) Perl_uvuni_to_utf8_flags(aTHX_ a,b,c)
@@ -763,10 +761,6 @@
#if defined(DUMP_FDS)
#define dump_fds(a) Perl_dump_fds(aTHX_ a)
#endif
-#if defined(EBCDIC)
-#define utf8n_to_uvchr(a,b,c,d) Perl_utf8n_to_uvchr(aTHX_ a,b,c,d)
-#define uvchr_to_utf8(a,b) Perl_uvchr_to_utf8(aTHX_ a,b)
-#endif
#if defined(HAS_SIGACTION) && defined(SA_SIGINFO)
#define csighandler Perl_csighandler
#endif
diff --git a/mathoms.c b/mathoms.c
index 183545af59..e33d0c6601 100644
--- a/mathoms.c
+++ b/mathoms.c
@@ -1270,7 +1270,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
- return utf8n_to_uvoffuni(s, curlen, retlen, flags);
+ return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
}
END_EXTERN_C
diff --git a/proto.h b/proto.h
index 4c95821544..a0329bb529 100644
--- a/proto.h
+++ b/proto.h
@@ -4748,11 +4748,9 @@ PERL_CALLCONV UV Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
#define PERL_ARGS_ASSERT_UTF8_TO_UVCHR \
assert(s)
-PERL_CALLCONV UV Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+/* PERL_CALLCONV UV utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
__attribute__nonnull__(pTHX_1)
- __attribute__nonnull__(pTHX_2);
-#define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF \
- assert(s); assert(send)
+ __attribute__nonnull__(pTHX_2); */
PERL_CALLCONV UV Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
__attribute__deprecated__
@@ -4767,9 +4765,9 @@ PERL_CALLCONV UV Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLE
#define PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF \
assert(s); assert(send)
-PERL_CALLCONV UV Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
+PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
__attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_UTF8N_TO_UVOFFUNI \
+#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR \
assert(s)
PERL_CALLCONV UV Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
@@ -4777,10 +4775,11 @@ PERL_CALLCONV UV Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *r
#define PERL_ARGS_ASSERT_UTF8N_TO_UVUNI \
assert(s)
-PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
- __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS \
- assert(d)
+/* PERL_CALLCONV U8* uvchr_to_utf8(pTHX_ U8 *d, UV uv)
+ __attribute__nonnull__(pTHX_1); */
+
+/* PERL_CALLCONV U8* uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+ __attribute__nonnull__(pTHX_1); */
PERL_CALLCONV U8* Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
__attribute__nonnull__(pTHX_1);
@@ -4989,18 +4988,6 @@ STATIC int S_sv_2iuv_non_preserve(pTHX_ SV *const sv)
# endif
# endif
#endif
-#if !(defined(EBCDIC))
-/* PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
- __attribute__nonnull__(pTHX_1); */
-#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR \
- assert(s)
-
-/* PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
- __attribute__nonnull__(pTHX_1); */
-#define PERL_ARGS_ASSERT_UVCHR_TO_UTF8 \
- assert(d)
-
-#endif
#if !(defined(HAS_SIGACTION) && defined(SA_SIGINFO))
PERL_CALLCONV Signal_t Perl_csighandler(int sig);
PERL_CALLCONV Signal_t Perl_sighandler(int sig);
@@ -5327,18 +5314,6 @@ PERL_CALLCONV void Perl_dump_fds(pTHX_ char* s)
assert(s)
#endif
-#if defined(EBCDIC)
-PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
- __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR \
- assert(s)
-
-PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
- __attribute__nonnull__(pTHX_1);
-#define PERL_ARGS_ASSERT_UVCHR_TO_UTF8 \
- assert(d)
-
-#endif
#if defined(HAS_MSG) || defined(HAS_SEM) || defined(HAS_SHM)
PERL_CALLCONV I32 Perl_do_ipcctl(pTHX_ I32 optype, SV** mark, SV** sp)
__attribute__nonnull__(pTHX_2)
diff --git a/utf8.c b/utf8.c
index 3c9fc61c1c..20d7aca261 100644
--- a/utf8.c
+++ b/utf8.c
@@ -33,13 +33,6 @@
#include "perl.h"
#include "inline_invlist.c"
-#ifndef EBCDIC
-/* Separate prototypes needed because in ASCII systems these are
- * usually macros but they still are compiled as code, too. */
-PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
-PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
-#endif
-
static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
@@ -90,48 +83,14 @@ Perl_is_ascii_string(const U8 *s, STRLEN len)
=for apidoc uvoffuni_to_utf8_flags
THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+Instead, B<Almost all code should use L</uvchr_to_utf8> or
+L</uvchr_to_utf8_flags>>.
-It adds the UTF-8 representation of the Unicode code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
-
- d = uvoffuni_to_utf8_flags(d, uv, flags);
-
-or, in most cases,
-
- d = uvoffuni_to_utf8_flags(d, uv, 0);
-
-This is the Unicode-aware way of saying
-
- *(d++) = uv;
-
-where uv is a code point expressed in Latin-1 or above, not the platform's
-native character set. B<Almost all code should instead use L</uvchr_to_utf8>
-or L</uvchr_to_utf8_flags>>.
-
-This function will convert to UTF-8 (and not warn) even code points that aren't
-legal Unicode or are problematic, unless C<flags> contains one or more of the
-following flags:
-
-If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
-the function will raise a warning, provided UTF8 warnings are enabled. If instead
-UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
-If both flags are set, the function will both warn and return NULL.
-
-The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
-affect how the function handles a Unicode non-character. And likewise, the
-UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of
-code points that are
-above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
-even less portable) can be warned and/or disallowed even if other above-Unicode
-code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
-flags.
-
-And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
-above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
-DISALLOW flags.
+This function is like them, but the input is a strict Unicode
+(as opposed to native) code point. Only in very rare circumstances should code
+not be using the native code point.
+For details, see the description for L</uvchr_to_utf8_flags>.
=cut
*/
@@ -276,26 +235,71 @@ is the recommended wide native character-aware way of saying
*(d++) = uv;
+This function accepts any UV as input. To forbid or warn on non-Unicode code
+points, or those that may be problematic, see L</uvchr_to_utf8_flags>.
+
=cut
*/
-/* On ASCII machines this is normally a macro but we want a
- real function in case XS code wants it
-*/
+/* This is also a macro */
+PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
+
U8 *
Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
{
- PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
-
- return Perl_uvoffuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
+ return uvchr_to_utf8(d, uv);
}
+/*
+=for apidoc uvchr_to_utf8_flags
+
+Adds the UTF-8 representation of the native code point C<uv> to the end
+of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
+
+ d = uvchr_to_utf8_flags(d, uv, flags);
+
+or, in most cases,
+
+ d = uvchr_to_utf8_flags(d, uv, 0);
+
+This is the Unicode-aware way of saying
+
+ *(d++) = uv;
+
+This function will convert to UTF-8 (and not warn) even code points that aren't
+legal Unicode or are problematic, unless C<flags> contains one or more of the
+following flags:
+
+If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
+the function will raise a warning, provided UTF8 warnings are enabled. If instead
+UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
+If both flags are set, the function will both warn and return NULL.
+
+The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
+affect how the function handles a Unicode non-character. And likewise, the
+UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of
+code points that are
+above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
+even less portable) can be warned and/or disallowed even if other above-Unicode
+code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
+flags.
+
+And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
+above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
+DISALLOW flags.
+
+=cut
+*/
+
+/* This is also a macro */
+PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
+
U8 *
Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
{
- PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
-
- return Perl_uvoffuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
+ return uvchr_to_utf8_flags(d, uv, flags);
}
/*
@@ -510,13 +514,13 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
/*
-=for apidoc utf8n_to_uvoffuni
+=for apidoc utf8n_to_uvchr
THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
Bottom level UTF-8 decode routine.
-Returns the official Unicode (not native) code point value of the first
-character in the string C<s>,
+Returns the native code point value of the first character in the string C<s>,
which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
the length, in bytes, of that character.
@@ -585,13 +589,11 @@ All other code points corresponding to Unicode characters, including private
use and those yet to be assigned, are never considered malformed and never
warn.
-Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
-
=cut
*/
UV
-Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
+Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
dVAR;
const U8 * const s0 = s;
@@ -609,7 +611,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
const char* const malformed_text = "Malformed UTF-8 character";
- PERL_ARGS_ASSERT_UTF8N_TO_UVOFFUNI;
+ PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
/* The order of malformation tests here is important. We should consume as
* few bytes as possible in order to not skip any valid character. This is
@@ -658,7 +660,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
/* An invariant is trivially well-formed */
if (UTF8_IS_INVARIANT(uv)) {
- return NATIVE_TO_LATIN1(uv);
+ return uv;
}
/* A continuation character can't start a valid sequence */
@@ -861,7 +863,9 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
}
if (sv) {
- outlier_ret = uv;
+ outlier_ret = uv; /* Note we don't bother to convert to native,
+ as all the outlier code points are the same
+ in both ASCII and EBCDIC */
goto do_warn;
}
@@ -869,7 +873,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
* to return it */
}
- return uv;
+ return UNI_TO_NATIVE(uv);
/* There are three cases which get to beyond this point. In all 3 cases:
* <sv> if not null points to a string to print as a warning.
@@ -943,7 +947,7 @@ NULL) to -1. If those warnings are off, the computed value, if well-defined
(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
C<*retlen> is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is
the next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvoffuni> for details on when the REPLACEMENT CHARACTER is
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
returned.
=cut
@@ -953,8 +957,6 @@ returned.
UV
Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
{
- PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
-
assert(s < send);
return utf8n_to_uvchr(s, send - s, retlen,
@@ -1020,7 +1022,7 @@ NULL) to -1. If those warnings are off, the computed value if well-defined (or
the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvoffuni> for details on when the REPLACEMENT CHARACTER is returned.
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
=cut
*/
@@ -1036,8 +1038,9 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
/*
=for apidoc utf8_to_uvuni_buf
-Only in very rare circumstances should code need to be dealing in the Unicode
-code point. Use L</utf8_to_uvchr_buf> instead.
+Only in very rare circumstances should code need to be dealing in Unicode
+(as opposed to native) code points. In those few cases, use
+C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
Returns the Unicode (not-native) code point of the first character in the
string C<s> which
@@ -1050,7 +1053,7 @@ NULL) to -1. If those warnings are off, the computed value if well-defined (or
the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvoffuni> for details on when the REPLACEMENT CHARACTER is returned.
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
=cut
*/
@@ -1063,8 +1066,8 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
assert(send > s);
/* Call the low level routine asking for checks */
- return Perl_utf8n_to_uvoffuni(aTHX_ s, send -s, retlen,
- ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
+ ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
}
/* DEPRECATED!
@@ -1091,7 +1094,7 @@ Some, but not all, UTF-8 malformations are detected, and in fact, some
malformed input could cause reading beyond the end of the input buffer, which
is one reason why this function is deprecated. The other is that only in
extremely limited circumstances should the Unicode versus native code point be
-of any interest to you. Use L</utf8_to_uvchr_buf> instead.
+of any interest to you. See L</utf8_to_uvuni_buf> for alternatives.
If C<s> points to one of the detected malformations, and UTF8 warnings are
enabled, zero is returned and C<*retlen> is set (if C<retlen> doesn't point to
@@ -1099,7 +1102,7 @@ NULL) to -1. If those warnings are off, the computed value if well-defined (or
the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvoffuni> for details on when the REPLACEMENT CHARACTER is returned.
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
=cut
*/
@@ -4250,32 +4253,6 @@ Perl__get_swash_invlist(pTHX_ SV* const swash)
return *ptr;
}
-/*
-=for apidoc utf8n_to_uvchr
-
-Returns the native character value of the first character in the string
-C<s>
-which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-C<length> and C<flags> are the same as L</utf8n_to_uvoffuni>().
-
-=cut
-*/
-/* On ASCII machines this is normally a macro but we want
- a real function in case XS code wants it
-*/
-UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
-U32 flags)
-{
- const UV uv = Perl_utf8n_to_uvoffuni(aTHX_ s, curlen, retlen, flags);
-
- PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
-
- return UNI_TO_NATIVE(uv);
-}
-
bool
Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
{
diff --git a/utf8.h b/utf8.h
index 45353eaac7..e54c98536f 100644
--- a/utf8.h
+++ b/utf8.h
@@ -39,6 +39,13 @@
#define _CORE_SWASH_INIT_RETURN_IF_UNDEF 0x2
#define _CORE_SWASH_INIT_ACCEPT_INVLIST 0x4
+#define uvchr_to_utf8(a,b) uvchr_to_utf8_flags(a,b,0)
+#define uvchr_to_utf8_flags(d,uv,flags) \
+ uvoffuni_to_utf8_flags(d,NATIVE_TO_UNI(uv),flags)
+#define utf8_to_uvchr_buf(s, e, lenp) \
+ utf8n_to_uvchr(s, (e) - (s), lenp, \
+ ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)
+
#define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, \
FOLD_FLAGS_FULL, NULL)
@@ -122,10 +129,6 @@ END_EXTERN_C
#define UNI_TO_NATIVE(ch) (ch)
#define NATIVE_TO_UNI(ch) (ch)
-/* As there are no translations, avoid the function wrapper */
-#define utf8n_to_uvchr utf8n_to_uvoffuni
-#define uvchr_to_utf8(a,b) uvoffuni_to_utf8_flags(a,b,0)
-
/*
The following table is from Unicode 3.2.