diff options
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | ext/XS-APItest/APItest.xs | 30 | ||||
-rw-r--r-- | ext/XS-APItest/t/utf16_to_utf8.t | 29 | ||||
-rw-r--r-- | proto.h | 3 | ||||
-rw-r--r-- | utf8.c | 72 | ||||
-rw-r--r-- | utf8.h | 4 |
7 files changed, 137 insertions, 4 deletions
@@ -2546,6 +2546,8 @@ EXp |U8* |utf16_to_utf8_base|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen |const bool high|const bool low EMXp |U8* |utf16_to_utf8 |NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen EMXp |U8* |utf16_to_utf8_reversed|NN U8* p|NN U8 *d|Size_t bytelen|NN Size_t *newlen +EXpx |U8* |utf8_to_utf16_base|NN U8* s|NN U8 *d|Size_t bytelen|NN Size_t *newlen \ + |const bool high|const bool low AdpR |STRLEN |utf8_length |NN const U8* s|NN const U8 *e AipdR |IV |utf8_distance |NN const U8 *a|NN const U8 *b AipdRT |U8* |utf8_hop |NN const U8 *s|SSize_t off @@ -976,6 +976,7 @@ #define sv_magicext_mglob(a) Perl_sv_magicext_mglob(aTHX_ a) #define sv_only_taint_gmagic Perl_sv_only_taint_gmagic #define utf16_to_utf8_base(a,b,c,d,e,f) Perl_utf16_to_utf8_base(aTHX_ a,b,c,d,e,f) +#define utf8_to_utf16_base(a,b,c,d,e,f) Perl_utf8_to_utf16_base(aTHX_ a,b,c,d,e,f) #define validate_proto(a,b,c,d) Perl_validate_proto(aTHX_ a,b,c,d) #define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a) #define yylex() Perl_yylex(aTHX) diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs index 1940a9077c..565309daa2 100644 --- a/ext/XS-APItest/APItest.xs +++ b/ext/XS-APItest/APItest.xs @@ -3093,6 +3093,36 @@ utf16_to_utf8 (sv, ...) XSRETURN(1); void +utf8_to_utf16 (sv, ...) + SV* sv + ALIAS: + utf8_to_utf16_reversed = 1 + PREINIT: + STRLEN len; + U8 *source; + SV *dest; + Size_t got; + CODE: + if (ix) (void)SvPV_force_nolen(sv); + source = (U8 *)SvPV(sv, len); + /* Optionally only convert part of the buffer. */ + if (items > 1) { + len = SvUV(ST(1)); + } + /* Mortalise this right now, as we'll be testing croak()s */ + dest = sv_2mortal(newSV(len * 2 + 1)); + if (ix) { + utf8_to_utf16_reversed(source, (U8 *)SvPVX(dest), len, &got); + } else { + utf8_to_utf16(source, (U8 *)SvPVX(dest), len, &got); + } + SvCUR_set(dest, got); + SvPVX(dest)[got] = '\0'; + SvPOK_on(dest); + ST(0) = dest; + XSRETURN(1); + +void my_exit(int exitcode) PPCODE: my_exit(exitcode); diff --git a/ext/XS-APItest/t/utf16_to_utf8.t b/ext/XS-APItest/t/utf16_to_utf8.t index 8ebe1e3f24..612b146104 100644 --- a/ext/XS-APItest/t/utf16_to_utf8.t +++ b/ext/XS-APItest/t/utf16_to_utf8.t @@ -6,7 +6,9 @@ use Encode; plan skip_all => 'Unclear how EBCIDC should behave' if ord "A" != 65; -use XS::APItest qw(utf16_to_utf8 utf16_to_utf8_reversed); +# Bug in Encode, non chars are rejected +use XS::APItest qw(utf16_to_utf8 utf16_to_utf8_reversed + utf8_to_utf16 utf8_to_utf16_reversed); for my $ord (0, 10, 13, 78, 255, 256, 0xD7FF, 0xE000, 0xFFFD, 0x10000, 0x10FC00, 0x103FF, 0x10FFFD) { @@ -15,13 +17,17 @@ for my $ord (0, 10, 13, 78, 255, 256, 0xD7FF, 0xE000, 0xFFFD, for my $suffix ('', "\0", "Moo!") { my $string = $prefix . $chr . $suffix; my $name = sprintf "for chr $ord prefix %d, suffix %d", - length $prefix, length $suffix; + length $prefix, length $suffix; my $as_utf8 = $string; utf8::encode($as_utf8); - is(utf16_to_utf8(encode('UTF-16BE', $string)), $as_utf8, - "utf16_to_utf8 $name"); + my $be_16 = encode('UTF-16BE', $string); + my $le_16 = encode('UTF-16LE', $string); + is(utf16_to_utf8($be_16), $as_utf8, "utf16_to_utf8 $name"); + is(utf8_to_utf16($as_utf8), $be_16, "utf8_to_utf16 $name"); is(utf16_to_utf8_reversed(encode('UTF-16LE', $string)), $as_utf8, "utf16_to_utf8_reversed $name"); + is(utf8_to_utf16_reversed($as_utf8), $le_16, + "utf8_to_utf16_reversed $name"); } } } @@ -66,4 +72,19 @@ like($@, qr/^Malformed UTF-16 surrogate at/, 'Lone surrogate croaks'); (ok(!defined $got, 'hence eval returns undef')) or diag(join ', ', map {ord $_} split //, $got); +{ # This example is published by Unicode, so verifies we aren't just + # internally consistent; we conform to the Standard + my $utf16_of_U10302 = utf8_to_utf16(chr 0x10302); + is(substr($utf16_of_U10302, 0, 1), chr 0xD8); + is(substr($utf16_of_U10302, 1, 1), chr 0x00); + is(substr($utf16_of_U10302, 2, 1), chr 0xDF); + is(substr($utf16_of_U10302, 3, 1), chr 0x02); + + $utf16_of_U10302 = utf8_to_utf16_reversed(chr 0x10302); + is(substr($utf16_of_U10302, 0, 1), chr 0x00); + is(substr($utf16_of_U10302, 1, 1), chr 0xD8); + is(substr($utf16_of_U10302, 2, 1), chr 0x02); + is(substr($utf16_of_U10302, 3, 1), chr 0xDF); +} + done_testing; @@ -4048,6 +4048,9 @@ PERL_CALLCONV STRLEN Perl_utf8_length(pTHX_ const U8* s, const U8 *e) PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp); #define PERL_ARGS_ASSERT_UTF8_TO_BYTES \ assert(s); assert(lenp) +PERL_CALLCONV U8* Perl_utf8_to_utf16_base(pTHX_ U8* s, U8 *d, Size_t bytelen, Size_t *newlen, const bool high, const bool low); +#define PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE \ + assert(s); assert(d); assert(newlen) #ifndef NO_MATHOMS PERL_CALLCONV UV Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) __attribute__deprecated__; @@ -2682,6 +2682,78 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen) return utf16_to_utf8_reversed(p, d, bytelen, newlen); } +/* + * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for + * big-endian and utf8_to_utf16_reversed() for little-endian, + * + * 's' is the UTF-8 input string, passed as a pointer to U8. + * 'bytelen' is its length + * 'd' is the pointer to the destination buffer, currently passed as U8 *. The + * caller must ensure that the space is large enough. The maximum + * expansion factor is 2 times 'bytelen'. This happens when the input is + * entirely single-byte ASCII, expanding to two-byte UTF-16. + * '*newlen' will contain the number of bytes this function filled of 'd'. + * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE + * 'low_byte' is 1 if UTF-16BE; 0 if UTF-16LE + * + * Do not use in-place. */ +U8* +Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen, + const bool high_byte, /* Which of next two bytes + is high order */ + const bool low_byte) +{ + U8* send; + U8* dstart = d; + + PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE; + + send = s + bytelen; + + while (s < send) { + STRLEN retlen; + UV uv = NATIVE_TO_UNI(utf8n_to_uvchr(s, send - s, &retlen, + /* No surrogates nor above-Unicode */ + UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)); + + /* The modern method is to keep going with malformed input, + * substituting the REPLACEMENT CHARACTER */ + if (UNLIKELY(uv == 0 && *s != '\0')) { + uv = UNICODE_REPLACEMENT; + } + + if (uv >= FIRST_IN_PLANE1) { /* Requires a surrogate pair */ + + /* From https://unicode.org/faq/utf_bom.html#utf16-4 */ + U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10) + + FIRST_HIGH_SURROGATE; + + /* (The bool is cast to U8 because on platforms where a bool is + * implemented as a signed char, a compiler warning may be + * generated) */ + d[(U8) high_byte] = high_surrogate >> 8; + d[(U8) low_byte] = high_surrogate & nBIT_MASK(8); + d += 2; + + /* The low surrogate is the lower 10 bits plus the offset */ + uv &= nBIT_MASK(10); + uv += FIRST_LOW_SURROGATE; + + /* Drop down to output the low surrogate like it were a + * non-surrogate */ + } + + d[(U8) high_byte] = uv >> 8; + d[(U8) low_byte] = uv & nBIT_MASK(8); + d += 2; + + s += retlen; + } + + *newlen = d - dstart; + return d; +} + bool Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) { @@ -86,6 +86,10 @@ the string is invariant. utf16_to_utf8_base(p, d, bytelen, newlen, 0, 1) #define utf16_to_utf8_reversed(p, d, bytelen, newlen) \ utf16_to_utf8_base(p, d, bytelen, newlen, 1, 0) +#define utf8_to_utf16(p, d, bytelen, newlen) \ + utf8_to_utf16_base(p, d, bytelen, newlen, 0, 1) +#define utf8_to_utf16_reversed(p, d, bytelen, newlen) \ + utf8_to_utf16_base(p, d, bytelen, newlen, 1, 0) #define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL) |