Change pack U behavior for EBCDIC

This effectively reverts 3ece276e6c0. It turns out this was a bad idea to make U mean the non-native official Unicode code points. It may seem to make sense to do so, but broke multiple CPAN modules which were using U the previous way. This commit has no effect on ASCII-platform functioning.
author: Karl Williamson <khw@cpan.org> 2021-12-16 14:30:36 -0700
committer: Karl Williamson <khw@cpan.org> 2021-12-28 07:58:11 -0700
commit: 3c2b2fcba1af380e364ca5be1cd42495caf25e89 (patch)
tree: 0c0c5e01175c78f8ec62c8fda0e3e87fae3311d5
parent: 2c9cc169c7a3a605ffcc50c843807bbb3f7e3a75 (diff)
download: perl-3c2b2fcba1af380e364ca5be1cd42495caf25e89.tar.gz
6 files changed, 21 insertions, 27 deletions
diff --git a/ext/XS-APItest/t/utf16_to_utf8.t b/ext/XS-APItest/t/utf16_to_utf8.t
index 612b146104..4cc63e18e3 100644
--- a/ext/XS-APItest/t/utf16_to_utf8.t
+++ b/ext/XS-APItest/t/utf16_to_utf8.t
@@ -4,8 +4,6 @@ use strict;
 use Test::More;
 use Encode;
 
-plan skip_all => 'Unclear how EBCIDC should behave' if ord "A" != 65;
-
 # Bug in Encode, non chars are rejected
 use XS::APItest qw(utf16_to_utf8 utf16_to_utf8_reversed
                    utf8_to_utf16 utf8_to_utf16_reversed);
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
index fbe465bbf5..762f12cb72 100644
--- a/pod/perlfunc.pod
+++ b/pod/perlfunc.pod
@@ -5247,7 +5247,10 @@ of values, as follows:
     u  A uuencoded string.
     U  A Unicode character number.  Encodes to a character in char-
        acter mode and UTF-8 (or UTF-EBCDIC in EBCDIC platforms) in
-       byte mode.
+       byte mode.  Also on EBCDIC platforms, the character number will
+       be the native EBCDIC value for character numbers below 256.
+       This allows most programs using this feature to not have to
+       care which type of platform they are running on.
 
     w  A BER compressed integer (not an ASN.1 BER, see perlpacktut
        for details).  Its bytes represent an unsigned integer in
diff --git a/pp_pack.c b/pp_pack.c
index 78104a978c..41f72b798f 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -1320,16 +1320,12 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
                     len = UTF8SKIP(result);
                     if (!S_utf8_to_bytes(aTHX_ &ptr, strend,
                                       (char *) &result[1], len-1, 'U')) break;
-                    auv = NATIVE_TO_UNI(utf8n_to_uvchr(result,
-                                                       len,
-                                                       &retlen,
-                                                       UTF8_ALLOW_DEFAULT));
+                    auv = utf8n_to_uvchr(result, len, &retlen,
+                                         UTF8_ALLOW_DEFAULT);
                     s = ptr;
                 } else {
-                    auv = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s,
-                                                       strend - s,
-                                                       &retlen,
-                                                       UTF8_ALLOW_DEFAULT));
+                    auv = utf8n_to_uvchr((U8*)s, strend - s, &retlen,
+                                         UTF8_ALLOW_DEFAULT);
                     if (retlen == (STRLEN) -1)
                         Perl_croak(aTHX_ "Malformed UTF-8 string in unpack");
                     s += retlen;
@@ -2668,7 +2664,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
                 auv = SvUV_no_inf(fromstr, datumtype);
                 if (utf8) {
                     U8 buffer[UTF8_MAXLEN+1], *endb;
-                    endb = uvchr_to_utf8_flags(buffer, UNI_TO_NATIVE(auv), 0);
+                    endb = uvchr_to_utf8_flags(buffer, auv, 0);
                     if (cur+(endb-buffer)*UTF8_EXPAND >= end) {
                         *cur = '\0';
                         SvCUR_set(cat, cur - start);
@@ -2684,9 +2680,7 @@ S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist )
                         GROWING(0, cat, start, cur, len+UTF8_MAXLEN);
                         end = start+SvLEN(cat)-UTF8_MAXLEN;
                     }
-                    cur = (char *) uvchr_to_utf8_flags((U8 *) cur,
-                                                       UNI_TO_NATIVE(auv),
-                                                       0);
+                    cur = (char *) uvchr_to_utf8_flags((U8 *) cur, auv, 0);
                 }
             }
             break;
diff --git a/t/io/bom.t b/t/io/bom.t
index 0a79a056a9..482ae9bce4 100644
--- a/t/io/bom.t
+++ b/t/io/bom.t
@@ -20,8 +20,7 @@ for my $end (0, 1) {
 		$end ? @$_[0, 1] : @$_[1, 0]
 	} (
                 # Create UTF-16.
-		[ 0xFE, 0xFF ], map [ 0, utf8::native_to_unicode(ord($_)) ],
-                                                    split //, "print 1;\nprint 2"
+		[ 0xFE, 0xFF ], map [ 0, ord($_) ], split //, "print 1;\nprint 2"
 	);
 	fresh_perl_is($prog, "12", {}, "BOM indicates $encoding");
 }
diff --git a/t/op/pack.t b/t/op/pack.t
index cd5f1913b3..949daa34a3 100644
--- a/t/op/pack.t
+++ b/t/op/pack.t
@@ -879,10 +879,10 @@ EOP
 
 
 {
-    is("1.20.300.4000", sprintf "%vd", pack("U*",utf8::native_to_unicode(1),utf8::native_to_unicode(20),300,4000));
-    is("1.20.300.4000", sprintf "%vd", pack("  U*",utf8::native_to_unicode(1),utf8::native_to_unicode(20),300,4000));
+    is("1.20.300.4000", sprintf "%vd", pack("U*",1,20,300,4000));
+    is("1.20.300.4000", sprintf "%vd", pack("  U*",1,20,300,4000));
 }
-isnt(v1.20.300.4000, sprintf "%vd", pack("C0U*",utf8::native_to_unicode(1),utf8::native_to_unicode(20),300,4000));
+isnt(v1.20.300.4000, sprintf "%vd", pack("C0U*",1,20,300,4000));
 
 my $rslt = join " ", map { ord } split "", byte_utf8a_to_utf8n("\xc7\xa2");
 # The ASCII UTF-8 of U+1E2 is "\xc7\xa2"
@@ -916,7 +916,7 @@ is("@{[unpack('U*', pack('U*', 100, 200))]}", "100 200");
 
     # does pack C0U create characters?
     # The U* is expecting Unicode, so convert to that.
-    is("@{[pack('C0U*', map { utf8::native_to_unicode($_) } 64, 202)]}",
+    is("@{[pack('C0U*', map { $_ } 64, 202)]}",
        pack("C*", 64, @bytes202));
 
     # does unpack U0U on byte data fail?
@@ -1511,7 +1511,7 @@ my $U_1FFC_bytes = byte_utf8a_to_utf8n("\341\277\274");
     is(join(',', unpack("aC/CU",   "b\0$U_1FFC_bytes")), 'b,8188');
 
     # The U expects Unicode, so convert from native
-    my $first_byte = utf8::native_to_unicode(ord substr($U_1FFC_bytes, 0, 1));
+    my $first_byte = ord substr($U_1FFC_bytes, 0, 1);
 
     is(join(',', unpack("aU0C/UU", "b\0$U_1FFC_bytes")), "b,$first_byte");
     is(join(',', unpack("aU0C/CU", "b\0$U_1FFC_bytes")), "b,$first_byte");
@@ -1794,9 +1794,9 @@ my $U_1FFC_bytes = byte_utf8a_to_utf8n("\341\277\274");
        'normal A* strip leaves \xa0');
     is(unpack("U0C0A*", "ab \n" . uni_to_native("\xa0") . " \0"), "ab \n" . uni_to_native("\xa0"),
        'normal A* strip leaves \xa0 even if it got upgraded for technical reasons');
-    is(unpack("A*", pack("a*(U0U)a*", "ab \n", 0xa0, " \0")), "ab",
+    is(unpack("A*", pack("a*(U0U)a*", "ab \n", utf8::unicode_to_native(0xa0), " \0")), "ab",
        'upgraded strings A* removes \xa0');
-    is(unpack("A*", pack("a*(U0UU)a*", "ab \n", 0xa0, 0x1680, " \0")), "ab",
+    is(unpack("A*", pack("a*(U0UU)a*", "ab \n", utf8::unicode_to_native(0xa0), 0x1680, " \0")), "ab",
        'upgraded strings A* removes all unicode whitespace');
     is(unpack("A5", pack("a*(U0U)a*", "ab \n", 0x1680, "def", "ab")), "ab",
        'upgraded strings A5 removes all unicode whitespace');
diff --git a/utf8.c b/utf8.c
index 3c73714062..afd05065c1 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2659,7 +2659,7 @@ Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen,
         }
 
         /* Here, 'uv' is the real U32 we want to find the UTF-8 of */
-        d = uvoffuni_to_utf8_flags(d, uv, 0);
+        d = uvchr_to_utf8(d, uv);
     }
 
     *newlen = d - dstart;
@@ -2712,9 +2712,9 @@ Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
 
     while (s < send) {
         STRLEN retlen;
-        UV uv = NATIVE_TO_UNI(utf8n_to_uvchr(s, send - s, &retlen,
+        UV uv = utf8n_to_uvchr(s, send - s, &retlen,
                                /* No surrogates nor above-Unicode */
-                               UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE));
+                               UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
 
         /* The modern method is to keep going with malformed input,
          * substituting the REPLACEMENT CHARACTER */
author	Karl Williamson <khw@cpan.org>	2021-12-16 14:30:36 -0700
committer	Karl Williamson <khw@cpan.org>	2021-12-28 07:58:11 -0700
commit	3c2b2fcba1af380e364ca5be1cd42495caf25e89 (patch)
tree	0c0c5e01175c78f8ec62c8fda0e3e87fae3311d5
parent	2c9cc169c7a3a605ffcc50c843807bbb3f7e3a75 (diff)
download	perl-3c2b2fcba1af380e364ca5be1cd42495caf25e89.tar.gz