diff options
author | Karl Williamson <khw@cpan.org> | 2014-11-24 10:34:27 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2014-11-24 10:50:26 -0700 |
commit | 27c3afbd6068ac83b49a11df3e33758ef059027e (patch) | |
tree | e964ad36e5ade6b830a8e54f3771352b00651370 /lib | |
parent | 374aed2060f1b45b00ae9ad47fb986b4619492c7 (diff) | |
download | perl-27c3afbd6068ac83b49a11df3e33758ef059027e.tar.gz |
charnames: Generalize to work on non-ASCII platforms
This includes the tests.
The character names are now stored in native order. This means that
pack('U') no longer works on non-ASCII platforms. Use chr instead,
mostly, and pack('W*') for a sequence.
These changes required the 'encoding' pragma to no longer affect e.g.,
chr() outside its scope, which was recently done by
3e669301f0a6fa34269f0e1eaf1fbbd72cae498a.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/_charnames.pm | 24 | ||||
-rw-r--r-- | lib/charnames.t | 56 |
2 files changed, 37 insertions, 43 deletions
diff --git a/lib/_charnames.pm b/lib/_charnames.pm index fbbe79ab92..bb7d7c6c7b 100644 --- a/lib/_charnames.pm +++ b/lib/_charnames.pm @@ -65,10 +65,10 @@ $Carp::Internal{ (__PACKAGE__) } = 1; my %system_aliases = ( - 'SINGLE-SHIFT 2' => pack("U", utf8::unicode_to_native(0x8E)), - 'SINGLE-SHIFT 3' => pack("U", utf8::unicode_to_native(0x8F)), - 'PRIVATE USE 1' => pack("U", utf8::unicode_to_native(0x91)), - 'PRIVATE USE 2' => pack("U", utf8::unicode_to_native(0x92)), + 'SINGLE-SHIFT 2' => chr utf8::unicode_to_native(0x8E), + 'SINGLE-SHIFT 3' => chr utf8::unicode_to_native(0x8F), + 'PRIVATE USE 1' => chr utf8::unicode_to_native(0x91), + 'PRIVATE USE 2' => chr utf8::unicode_to_native(0x92), ); # These are the aliases above that differ under :loose and :full matching @@ -77,15 +77,15 @@ my %system_aliases = ( #); #my %deprecated_aliases; -#$deprecated_aliases{'BELL'} = pack("U", utf8::unicode_to_native(0x07)) if $^V lt v5.17.0; +#$deprecated_aliases{'BELL'} = chr utf8::unicode_to_native(0x07) if $^V lt v5.17.0; #my %loose_deprecated_aliases = ( #); # These are special cased in :loose matching, differing only in a medial # hyphen -my $HANGUL_JUNGSEONG_O_E_utf8 = pack("U", 0x1180); -my $HANGUL_JUNGSEONG_OE_utf8 = pack("U", 0x116C); +my $HANGUL_JUNGSEONG_O_E_utf8 = chr 0x1180; +my $HANGUL_JUNGSEONG_OE_utf8 = chr 0x116C; my $txt; # The table of official character names @@ -163,7 +163,7 @@ sub alias (@) # Set up a single alias } if ($value =~ $decimal_qr) { no warnings qw(non_unicode surrogate nonchar); # Allow any of these - $^H{charnames_ord_aliases}{$name} = pack("U", $value); + $^H{charnames_ord_aliases}{$name} = chr $value; # Use a canonical form. $^H{charnames_inverse_ords}{sprintf("%05X", $value)} = $name; @@ -432,7 +432,7 @@ sub lookup_name ($$$) { if (($loose || $^H{charnames_full}) && (defined (my $ord = charnames::name_to_code_point_special($lookup_name, $loose)))) { - $result = pack("U", $ord); + $result = chr $ord; } else { @@ -525,7 +525,7 @@ sub lookup_name ($$$) { # therefore yield the very last character in the table, which should # also be a \n, so the statement works anyway.) if (substr($txt, $off[0] - 7, 1) eq "\n") { - $result = pack("U", CORE::hex substr($txt, $off[0] - 6, 5)); + $result = chr CORE::hex substr($txt, $off[0] - 6, 5); # Handle the single loose matching special case, in which two names # differ only by a single medial hyphen. If the original had a @@ -544,7 +544,7 @@ sub lookup_name ($$$) { # The +1 skips past that newline, or, if the rindex() fails, to put # us to an offset of zero. my $charstart = rindex($txt, "\n", $off[0] - 7) + 1; - $result = pack("U*", map { CORE::hex } + $result = pack("W*", map { CORE::hex } split " ", substr($txt, $charstart, $off[0] - $charstart - 1)); } } @@ -556,8 +556,6 @@ sub lookup_name ($$$) { $cache_ref->{$name} = $result if defined $cache_ref; } } - utf8::downgrade($result, 1); - # Here, have the result character. If the return is to be an ord, must be # any single character. diff --git a/lib/charnames.t b/lib/charnames.t index bd0c21ea85..e11581183b 100644 --- a/lib/charnames.t +++ b/lib/charnames.t @@ -65,7 +65,7 @@ EOE use charnames ":alias" => { mychar1 => "0xE8000", mychar2 => 983040, # U+F0000 mychar3 => "U+100000", - myctrl => 0x80, + myctrl => utf8::unicode_to_native(0x80), mylarge => "U+111000", }; is ("\N{PILE OF POO}", chr(0x1F4A9), "Verify :alias alone implies :full"); @@ -77,29 +77,14 @@ EOE is (charnames::viacode(0x100000), "mychar3", "And that can get the alias back"); is ("\N{mylarge}", chr(0x111000), "Verify that can define alias beyond Unicode"); is (charnames::viacode(0x111000), "mylarge", "And that can get the alias back"); - is (charnames::viacode(0x80), "myctrl", "Verify that can name a nameless control"); + is (charnames::viacode(utf8::unicode_to_native(0x80)), "myctrl", "Verify that can name a nameless control"); } -my $encoded_be; -my $encoded_alpha; -my $encoded_bet; -my $encoded_deseng; - -# If octal representation of unicode char is \0xyzt, then the utf8 is \3xy\2zt -if (ord('A') == 65) { # as on ASCII or UTF-8 machines - $encoded_be = "\320\261"; - $encoded_alpha = "\316\261"; - $encoded_bet = "\327\221"; - $encoded_deseng = "\360\220\221\215"; -} -else { # EBCDIC where UTF-EBCDIC may be used (this may be 1047 specific since - # UTF-EBCDIC is codepage specific) - $encoded_be = "\270\102\130"; - $encoded_alpha = "\264\130"; - $encoded_bet = "\270\125\130"; - $encoded_deseng = "\336\102\103\124"; -} +my $encoded_be = byte_utf8a_to_utf8n("\320\261"); +my $encoded_alpha = byte_utf8a_to_utf8n("\316\261"); +my $encoded_bet = byte_utf8a_to_utf8n("\327\221"); +my $encoded_deseng = byte_utf8a_to_utf8n("\360\220\221\215"); sub to_bytes { unpack"U0a*", shift; @@ -235,7 +220,7 @@ sub test_vianame ($$$) { use bytes; is(charnames::vianame("GOTHIC LETTER AHSA"), 0x10330, "Verify vianame \\N{name} is unaffected by 'use bytes'"); - is(charnames::vianame("U+FF"), chr(0xFF), "Verify vianame \\N{U+FF} is unaffected by 'use bytes'"); + is(charnames::vianame("U+FF"), chr(utf8::unicode_to_native(0xFF)), "Verify vianame \\N{U+FF} is unaffected by 'use bytes'"); cmp_ok($warning_count, '==', scalar @WARN, "Verify vianame doesn't warn on legal inputs under 'use bytes'"); ok(! defined charnames::vianame("U+100"), "Verify vianame \\N{U+100} is undef under 'use bytes'"); ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, "Verify vianame gives appropriate warning for previous test"); @@ -244,9 +229,9 @@ sub test_vianame ($$$) { ok(! defined charnames::string_vianame("GOTHIC LETTER AHSA"), "Verify string_vianame(\"GOTHIC LETTER AHSA\") is undefined under 'use bytes'"); ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, "Verify string_vianame gives appropriate warning for previous test"); $warning_count = @WARN; - is(charnames::string_vianame("U+FF"), chr(0xFF), "Verify string_vianame(\"U+FF\") is chr(0xFF) under 'use bytes'"); + is(charnames::string_vianame("U+FF"), chr(utf8::unicode_to_native(0xFF)), "Verify string_vianame(\"U+FF\") is chr(0xFF) under 'use bytes'"); cmp_ok($warning_count, '==', scalar @WARN, "Verify string_vianame doesn't warn on legal inputs under 'use bytes'"); - is(charnames::string_vianame("LATIN SMALL LETTER Y WITH DIAERESIS"), chr(0xFF), "Verify string_vianame(\"LATIN SMALL LETTER Y WITH DIAERESIS\") is chr(0xFF) under 'use bytes'"); + is(charnames::string_vianame("LATIN SMALL LETTER Y WITH DIAERESIS"), chr(utf8::unicode_to_native(0xFF)), "Verify string_vianame(\"LATIN SMALL LETTER Y WITH DIAERESIS\") is chr(native 0xFF) under 'use bytes'"); cmp_ok($warning_count, '==', scalar @WARN, "Verify string_vianame doesn't warn on legal inputs under 'use bytes'"); ok(! defined charnames::string_vianame("U+100"), "Verify string_vianame \\N{U+100} is undef under 'use bytes'"); ok($warning_count == scalar @WARN - 1 && $WARN[-1] =~ /above 0xFF/, "Verify string_vianame gives appropriate warning for previous test"); @@ -350,7 +335,8 @@ ok(! defined charnames::viacode(0x110000), ok((grep { /\Qyou asked for U+110000/ } @WARN), '... and gives warning'); is(charnames::viacode(0), "NULL", 'Verify charnames::viacode(0) eq "NULL"'); -is(charnames::viacode("BE"), "VULGAR FRACTION THREE QUARTERS", 'Verify charnames::viacode("BE") eq "VULGAR FRACTION THREE QUARTERS"'); +my $three_quarters = sprintf("%2X", utf8::unicode_to_native(0xBE)); +is(charnames::viacode("$three_quarters"), "VULGAR FRACTION THREE QUARTERS", 'Verify charnames::viacode(native "BE") eq "VULGAR FRACTION THREE QUARTERS"'); is(charnames::viacode("U+00000000000FEED"), "ARABIC LETTER WAW ISOLATED FORM", 'Verify charnames::viacode("U+00000000000FEED") eq "ARABIC LETTER WAW ISOLATED FORM"'); { @@ -1024,7 +1010,10 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V while (<$fh>) { chomp; my ($code, $name, undef, undef, undef, undef, undef, undef, undef, undef, $u1name) = split ";"; - my $decimal = hex $code; + my $decimal = utf8::unicode_to_native(hex $code); + $code = sprintf("%04X", $decimal) unless $::IS_ASCII; + + $decimal = hex $code; # The Unicode version 1 name is used instead of any that are # marked <control>. @@ -1034,7 +1023,7 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V $name = "" if $^V lt v5.17.0 && $decimal == 0x1F514; # ALERT overrides BELL - $name = 'ALERT' if $decimal == 7; + $name = 'ALERT' if $decimal == utf8::unicode_to_native(7); # Some don't have names, leave those array elements undefined next unless $name; @@ -1209,7 +1198,10 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V # These four code points now have names, from NameAlias, but # aren't listed as having names in UnicodeData.txt, so viacode # returns their alias names, not undef - next if $i == 0x80 || $i == 0x81 || $i == 0x84 || $i == 0x99; + next if $i == utf8::unicode_to_native(0x80) + || $i == utf8::unicode_to_native(0x81) + || $i == utf8::unicode_to_native(0x84) + || $i == utf8::unicode_to_native(0x99); # If there is no name for this code point, all we can # test is that. @@ -1223,7 +1215,11 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V # These four code points have a different Unicode1 name than # regular name, and viacode has already specifically tested # for the regular name - if ($i != 0x0a && $i != 0x0c && $i != 0x0d && $i != 0x85) { + if ($i != utf8::unicode_to_native(0x0a) + && $i != utf8::unicode_to_native(0x0c) + && $i != utf8::unicode_to_native(0x0d) + && $i != utf8::unicode_to_native(0x85)) + { $all_pass &= is(charnames::viacode($i), $names[$i], "Verify viacode(0x$hex) is \"$names[$i]\""); } @@ -1249,7 +1245,7 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V s/^\s*#.*//; next unless $_; my ($name, $codes) = split ";"; - my $utf8 = pack("U*", map { hex } split " ", $codes); + my $utf8 = pack("W*", map { hex } split " ", $codes); is(charnames::string_vianame($name), $utf8, "Verify string_vianame(\"$name\") is the proper utf8"); my $loose_name = get_loose_name($name); use charnames ":loose"; |