summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2021-06-30 18:21:16 -0600
committerKarl Williamson <khw@cpan.org>2021-08-14 09:44:58 -0600
commit6af810c44554f309acce080a8c3d2ebefa0eb965 (patch)
tree78ca4fd1f6bffd7f9dc7aa0eb4eb67e37e5eabc9
parent5fd26678bf8c8b48b9fdad829c928780cd445d2e (diff)
downloadperl-6af810c44554f309acce080a8c3d2ebefa0eb965.tar.gz
Add utf8_to_utf16
-rw-r--r--embed.fnc2
-rw-r--r--embed.h1
-rw-r--r--ext/XS-APItest/APItest.xs30
-rw-r--r--ext/XS-APItest/t/utf16_to_utf8.t29
-rw-r--r--proto.h3
-rw-r--r--utf8.c72
-rw-r--r--utf8.h4
7 files changed, 137 insertions, 4 deletions
diff --git a/embed.fnc b/embed.fnc
index 89f806d232..d8c500e9ca 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2546,6 +2546,8 @@ EXp |U8* |utf16_to_utf8_base|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
|const bool high|const bool low
EMXp |U8* |utf16_to_utf8 |NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
EMXp |U8* |utf16_to_utf8_reversed|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen
+EXpx |U8* |utf8_to_utf16_base|NN U8* s|NN U8 *d|Size_t bytelen|NN Size_t *newlen \
+ |const bool high|const bool low
AdpR |STRLEN |utf8_length |NN const U8* s|NN const U8 *e
AipdR |IV |utf8_distance |NN const U8 *a|NN const U8 *b
AipdRT |U8* |utf8_hop |NN const U8 *s|SSize_t off
diff --git a/embed.h b/embed.h
index 32edcf4c04..8fab47508e 100644
--- a/embed.h
+++ b/embed.h
@@ -976,6 +976,7 @@
#define sv_magicext_mglob(a) Perl_sv_magicext_mglob(aTHX_ a)
#define sv_only_taint_gmagic Perl_sv_only_taint_gmagic
#define utf16_to_utf8_base(a,b,c,d,e,f) Perl_utf16_to_utf8_base(aTHX_ a,b,c,d,e,f)
+#define utf8_to_utf16_base(a,b,c,d,e,f) Perl_utf8_to_utf16_base(aTHX_ a,b,c,d,e,f)
#define validate_proto(a,b,c,d) Perl_validate_proto(aTHX_ a,b,c,d)
#define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a)
#define yylex() Perl_yylex(aTHX)
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs
index 1940a9077c..565309daa2 100644
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -3093,6 +3093,36 @@ utf16_to_utf8 (sv, ...)
XSRETURN(1);
void
+utf8_to_utf16 (sv, ...)
+ SV* sv
+ ALIAS:
+ utf8_to_utf16_reversed = 1
+ PREINIT:
+ STRLEN len;
+ U8 *source;
+ SV *dest;
+ Size_t got;
+ CODE:
+ if (ix) (void)SvPV_force_nolen(sv);
+ source = (U8 *)SvPV(sv, len);
+ /* Optionally only convert part of the buffer. */
+ if (items > 1) {
+ len = SvUV(ST(1));
+ }
+ /* Mortalise this right now, as we'll be testing croak()s */
+ dest = sv_2mortal(newSV(len * 2 + 1));
+ if (ix) {
+ utf8_to_utf16_reversed(source, (U8 *)SvPVX(dest), len, &got);
+ } else {
+ utf8_to_utf16(source, (U8 *)SvPVX(dest), len, &got);
+ }
+ SvCUR_set(dest, got);
+ SvPVX(dest)[got] = '\0';
+ SvPOK_on(dest);
+ ST(0) = dest;
+ XSRETURN(1);
+
+void
my_exit(int exitcode)
PPCODE:
my_exit(exitcode);
diff --git a/ext/XS-APItest/t/utf16_to_utf8.t b/ext/XS-APItest/t/utf16_to_utf8.t
index 8ebe1e3f24..612b146104 100644
--- a/ext/XS-APItest/t/utf16_to_utf8.t
+++ b/ext/XS-APItest/t/utf16_to_utf8.t
@@ -6,7 +6,9 @@ use Encode;
plan skip_all => 'Unclear how EBCIDC should behave' if ord "A" != 65;
-use XS::APItest qw(utf16_to_utf8 utf16_to_utf8_reversed);
+# Bug in Encode, non chars are rejected
+use XS::APItest qw(utf16_to_utf8 utf16_to_utf8_reversed
+ utf8_to_utf16 utf8_to_utf16_reversed);
for my $ord (0, 10, 13, 78, 255, 256, 0xD7FF, 0xE000, 0xFFFD,
0x10000, 0x10FC00, 0x103FF, 0x10FFFD) {
@@ -15,13 +17,17 @@ for my $ord (0, 10, 13, 78, 255, 256, 0xD7FF, 0xE000, 0xFFFD,
for my $suffix ('', "\0", "Moo!") {
my $string = $prefix . $chr . $suffix;
my $name = sprintf "for chr $ord prefix %d, suffix %d",
- length $prefix, length $suffix;
+ length $prefix, length $suffix;
my $as_utf8 = $string;
utf8::encode($as_utf8);
- is(utf16_to_utf8(encode('UTF-16BE', $string)), $as_utf8,
- "utf16_to_utf8 $name");
+ my $be_16 = encode('UTF-16BE', $string);
+ my $le_16 = encode('UTF-16LE', $string);
+ is(utf16_to_utf8($be_16), $as_utf8, "utf16_to_utf8 $name");
+ is(utf8_to_utf16($as_utf8), $be_16, "utf8_to_utf16 $name");
is(utf16_to_utf8_reversed(encode('UTF-16LE', $string)), $as_utf8,
"utf16_to_utf8_reversed $name");
+ is(utf8_to_utf16_reversed($as_utf8), $le_16,
+ "utf8_to_utf16_reversed $name");
}
}
}
@@ -66,4 +72,19 @@ like($@, qr/^Malformed UTF-16 surrogate at/, 'Lone surrogate croaks');
(ok(!defined $got, 'hence eval returns undef')) or
diag(join ', ', map {ord $_} split //, $got);
+{ # This example is published by Unicode, so verifies we aren't just
+ # internally consistent; we conform to the Standard
+ my $utf16_of_U10302 = utf8_to_utf16(chr 0x10302);
+ is(substr($utf16_of_U10302, 0, 1), chr 0xD8);
+ is(substr($utf16_of_U10302, 1, 1), chr 0x00);
+ is(substr($utf16_of_U10302, 2, 1), chr 0xDF);
+ is(substr($utf16_of_U10302, 3, 1), chr 0x02);
+
+ $utf16_of_U10302 = utf8_to_utf16_reversed(chr 0x10302);
+ is(substr($utf16_of_U10302, 0, 1), chr 0x00);
+ is(substr($utf16_of_U10302, 1, 1), chr 0xD8);
+ is(substr($utf16_of_U10302, 2, 1), chr 0x02);
+ is(substr($utf16_of_U10302, 3, 1), chr 0xDF);
+}
+
done_testing;
diff --git a/proto.h b/proto.h
index dc9fff0ef2..164f6be670 100644
--- a/proto.h
+++ b/proto.h
@@ -4048,6 +4048,9 @@ PERL_CALLCONV STRLEN Perl_utf8_length(pTHX_ const U8* s, const U8 *e)
PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp);
#define PERL_ARGS_ASSERT_UTF8_TO_BYTES \
assert(s); assert(lenp)
+PERL_CALLCONV U8* Perl_utf8_to_utf16_base(pTHX_ U8* s, U8 *d, Size_t bytelen, Size_t *newlen, const bool high, const bool low);
+#define PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE \
+ assert(s); assert(d); assert(newlen)
#ifndef NO_MATHOMS
PERL_CALLCONV UV Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
__attribute__deprecated__;
diff --git a/utf8.c b/utf8.c
index 38d176aaab..e2645edb0e 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2682,6 +2682,78 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
return utf16_to_utf8_reversed(p, d, bytelen, newlen);
}
+/*
+ * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for
+ * big-endian and utf8_to_utf16_reversed() for little-endian,
+ *
+ * 's' is the UTF-8 input string, passed as a pointer to U8.
+ * 'bytelen' is its length
+ * 'd' is the pointer to the destination buffer, currently passed as U8 *. The
+ * caller must ensure that the space is large enough. The maximum
+ * expansion factor is 2 times 'bytelen'. This happens when the input is
+ * entirely single-byte ASCII, expanding to two-byte UTF-16.
+ * '*newlen' will contain the number of bytes this function filled of 'd'.
+ * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
+ * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE
+ *
+ * Do not use in-place. */
+U8*
+Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
+ const bool high_byte, /* Which of next two bytes
+ is high order */
+ const bool low_byte)
+{
+ U8* send;
+ U8* dstart = d;
+
+ PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE;
+
+ send = s + bytelen;
+
+ while (s < send) {
+ STRLEN retlen;
+ UV uv = NATIVE_TO_UNI(utf8n_to_uvchr(s, send - s, &retlen,
+ /* No surrogates nor above-Unicode */
+ UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE));
+
+ /* The modern method is to keep going with malformed input,
+ * substituting the REPLACEMENT CHARACTER */
+ if (UNLIKELY(uv == 0 && *s != '\0')) {
+ uv = UNICODE_REPLACEMENT;
+ }
+
+ if (uv >= FIRST_IN_PLANE1) { /* Requires a surrogate pair */
+
+ /* From https://unicode.org/faq/utf_bom.html#utf16-4 */
+ U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10)
+ + FIRST_HIGH_SURROGATE;
+
+ /* (The bool is cast to U8 because on platforms where a bool is
+ * implemented as a signed char, a compiler warning may be
+ * generated) */
+ d[(U8) high_byte] = high_surrogate >> 8;
+ d[(U8) low_byte] = high_surrogate & nBIT_MASK(8);
+ d += 2;
+
+ /* The low surrogate is the lower 10 bits plus the offset */
+ uv &= nBIT_MASK(10);
+ uv += FIRST_LOW_SURROGATE;
+
+ /* Drop down to output the low surrogate like it were a
+ * non-surrogate */
+ }
+
+ d[(U8) high_byte] = uv >> 8;
+ d[(U8) low_byte] = uv & nBIT_MASK(8);
+ d += 2;
+
+ s += retlen;
+ }
+
+ *newlen = d - dstart;
+ return d;
+}
+
bool
Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
{
diff --git a/utf8.h b/utf8.h
index bc6aa2083b..662e4cf529 100644
--- a/utf8.h
+++ b/utf8.h
@@ -86,6 +86,10 @@ the string is invariant.
utf16_to_utf8_base(p, d, bytelen, newlen, 0, 1)
#define utf16_to_utf8_reversed(p, d, bytelen, newlen) \
utf16_to_utf8_base(p, d, bytelen, newlen, 1, 0)
+#define utf8_to_utf16(p, d, bytelen, newlen) \
+ utf8_to_utf16_base(p, d, bytelen, newlen, 0, 1)
+#define utf8_to_utf16_reversed(p, d, bytelen, newlen) \
+ utf8_to_utf16_base(p, d, bytelen, newlen, 1, 0)
#define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)