diff options
author | Karl Williamson <khw@cpan.org> | 2021-12-16 14:30:36 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2021-12-28 07:58:11 -0700 |
commit | 3c2b2fcba1af380e364ca5be1cd42495caf25e89 (patch) | |
tree | 0c0c5e01175c78f8ec62c8fda0e3e87fae3311d5 | |
parent | 2c9cc169c7a3a605ffcc50c843807bbb3f7e3a75 (diff) | |
download | perl-3c2b2fcba1af380e364ca5be1cd42495caf25e89.tar.gz |
Change pack U behavior for EBCDIC
This effectively reverts 3ece276e6c0.
It turns out this was a bad idea to make U mean the non-native official
Unicode code points. It may seem to make sense to do so, but broke
multiple CPAN modules which were using U the previous way.
This commit has no effect on ASCII-platform functioning.
-rw-r--r-- | ext/XS-APItest/t/utf16_to_utf8.t | 2 | ||||
-rw-r--r-- | pod/perlfunc.pod | 5 | ||||
-rw-r--r-- | pp_pack.c | 18 | ||||
-rw-r--r-- | t/io/bom.t | 3 | ||||
-rw-r--r-- | t/op/pack.t | 14 | ||||
-rw-r--r-- | utf8.c | 6 |
6 files changed, 21 insertions, 27 deletions
diff --git a/ext/XS-APItest/t/utf16_to_utf8.t b/ext/XS-APItest/t/utf16_to_utf8.t index 612b146104..4cc63e18e3 100644 --- a/ext/XS-APItest/t/utf16_to_utf8.t +++ b/ext/XS-APItest/t/utf16_to_utf8.t @@ -4,8 +4,6 @@ use strict; use Test::More; use Encode; -plan skip_all => 'Unclear how EBCIDC should behave' if ord "A" != 65; - # Bug in Encode, non chars are rejected use XS::APItest qw(utf16_to_utf8 utf16_to_utf8_reversed utf8_to_utf16 utf8_to_utf16_reversed); diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index fbe465bbf5..762f12cb72 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -5247,7 +5247,10 @@ of values, as follows: u A uuencoded string. U A Unicode character number. Encodes to a character in char- acter mode and UTF-8 (or UTF-EBCDIC in EBCDIC platforms) in - byte mode. + byte mode. Also on EBCDIC platforms, the character number will + be the native EBCDIC value for character numbers below 256. + This allows most programs using this feature to not have to + care which type of platform they are running on. w A BER compressed integer (not an ASN.1 BER, see perlpacktut for details). Its bytes represent an unsigned integer in @@ -1320,16 +1320,12 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c len = UTF8SKIP(result); if (!S_utf8_to_bytes(aTHX_ &ptr, strend, (char *) &result[1], len-1, 'U')) break; - auv = NATIVE_TO_UNI(utf8n_to_uvchr(result, - len, - &retlen, - UTF8_ALLOW_DEFAULT)); + auv = utf8n_to_uvchr(result, len, &retlen, + UTF8_ALLOW_DEFAULT); s = ptr; } else { - auv = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, - strend - s, - &retlen, - UTF8_ALLOW_DEFAULT)); + auv = utf8n_to_uvchr((U8*)s, strend - s, &retlen, + UTF8_ALLOW_DEFAULT); if (retlen == (STRLEN) -1) Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); s += retlen; @@ -2668,7 +2664,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist ) auv = SvUV_no_inf(fromstr, datumtype); if (utf8) { U8 buffer[UTF8_MAXLEN+1], *endb; - endb = uvchr_to_utf8_flags(buffer, UNI_TO_NATIVE(auv), 0); + endb = uvchr_to_utf8_flags(buffer, auv, 0); if (cur+(endb-buffer)*UTF8_EXPAND >= end) { *cur = '\0'; SvCUR_set(cat, cur - start); @@ -2684,9 +2680,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist ) GROWING(0, cat, start, cur, len+UTF8_MAXLEN); end = start+SvLEN(cat)-UTF8_MAXLEN; } - cur = (char *) uvchr_to_utf8_flags((U8 *) cur, - UNI_TO_NATIVE(auv), - 0); + cur = (char *) uvchr_to_utf8_flags((U8 *) cur, auv, 0); } } break; diff --git a/t/io/bom.t b/t/io/bom.t index 0a79a056a9..482ae9bce4 100644 --- a/t/io/bom.t +++ b/t/io/bom.t @@ -20,8 +20,7 @@ for my $end (0, 1) { $end ? @$_[0, 1] : @$_[1, 0] } ( # Create UTF-16. - [ 0xFE, 0xFF ], map [ 0, utf8::native_to_unicode(ord($_)) ], - split //, "print 1;\nprint 2" + [ 0xFE, 0xFF ], map [ 0, ord($_) ], split //, "print 1;\nprint 2" ); fresh_perl_is($prog, "12", {}, "BOM indicates $encoding"); } diff --git a/t/op/pack.t b/t/op/pack.t index cd5f1913b3..949daa34a3 100644 --- a/t/op/pack.t +++ b/t/op/pack.t @@ -879,10 +879,10 @@ EOP { - is("1.20.300.4000", sprintf "%vd", pack("U*",utf8::native_to_unicode(1),utf8::native_to_unicode(20),300,4000)); - is("1.20.300.4000", sprintf "%vd", pack(" U*",utf8::native_to_unicode(1),utf8::native_to_unicode(20),300,4000)); + is("1.20.300.4000", sprintf "%vd", pack("U*",1,20,300,4000)); + is("1.20.300.4000", sprintf "%vd", pack(" U*",1,20,300,4000)); } -isnt(v1.20.300.4000, sprintf "%vd", pack("C0U*",utf8::native_to_unicode(1),utf8::native_to_unicode(20),300,4000)); +isnt(v1.20.300.4000, sprintf "%vd", pack("C0U*",1,20,300,4000)); my $rslt = join " ", map { ord } split "", byte_utf8a_to_utf8n("\xc7\xa2"); # The ASCII UTF-8 of U+1E2 is "\xc7\xa2" @@ -916,7 +916,7 @@ is("@{[unpack('U*', pack('U*', 100, 200))]}", "100 200"); # does pack C0U create characters? # The U* is expecting Unicode, so convert to that. - is("@{[pack('C0U*', map { utf8::native_to_unicode($_) } 64, 202)]}", + is("@{[pack('C0U*', map { $_ } 64, 202)]}", pack("C*", 64, @bytes202)); # does unpack U0U on byte data fail? @@ -1511,7 +1511,7 @@ my $U_1FFC_bytes = byte_utf8a_to_utf8n("\341\277\274"); is(join(',', unpack("aC/CU", "b\0$U_1FFC_bytes")), 'b,8188'); # The U expects Unicode, so convert from native - my $first_byte = utf8::native_to_unicode(ord substr($U_1FFC_bytes, 0, 1)); + my $first_byte = ord substr($U_1FFC_bytes, 0, 1); is(join(',', unpack("aU0C/UU", "b\0$U_1FFC_bytes")), "b,$first_byte"); is(join(',', unpack("aU0C/CU", "b\0$U_1FFC_bytes")), "b,$first_byte"); @@ -1794,9 +1794,9 @@ my $U_1FFC_bytes = byte_utf8a_to_utf8n("\341\277\274"); 'normal A* strip leaves \xa0'); is(unpack("U0C0A*", "ab \n" . uni_to_native("\xa0") . " \0"), "ab \n" . uni_to_native("\xa0"), 'normal A* strip leaves \xa0 even if it got upgraded for technical reasons'); - is(unpack("A*", pack("a*(U0U)a*", "ab \n", 0xa0, " \0")), "ab", + is(unpack("A*", pack("a*(U0U)a*", "ab \n", utf8::unicode_to_native(0xa0), " \0")), "ab", 'upgraded strings A* removes \xa0'); - is(unpack("A*", pack("a*(U0UU)a*", "ab \n", 0xa0, 0x1680, " \0")), "ab", + is(unpack("A*", pack("a*(U0UU)a*", "ab \n", utf8::unicode_to_native(0xa0), 0x1680, " \0")), "ab", 'upgraded strings A* removes all unicode whitespace'); is(unpack("A5", pack("a*(U0U)a*", "ab \n", 0x1680, "def", "ab")), "ab", 'upgraded strings A5 removes all unicode whitespace'); @@ -2659,7 +2659,7 @@ Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen, } /* Here, 'uv' is the real U32 we want to find the UTF-8 of */ - d = uvoffuni_to_utf8_flags(d, uv, 0); + d = uvchr_to_utf8(d, uv); } *newlen = d - dstart; @@ -2712,9 +2712,9 @@ Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen, while (s < send) { STRLEN retlen; - UV uv = NATIVE_TO_UNI(utf8n_to_uvchr(s, send - s, &retlen, + UV uv = utf8n_to_uvchr(s, send - s, &retlen, /* No surrogates nor above-Unicode */ - UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)); + UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); /* The modern method is to keep going with malformed input, * substituting the REPLACEMENT CHARACTER */ |