diff options
author | Karl Williamson <khw@cpan.org> | 2015-07-28 15:21:16 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-07-28 22:15:58 -0600 |
commit | ce8d64d95a7ce1fbbd4c00af9cd06eb0f91df8f9 (patch) | |
tree | e71008dd2d9fb194dc70a74c061465f6f5309275 /lib | |
parent | bf4268faa8496da74302d7b5ee786c51f77a322f (diff) | |
download | perl-ce8d64d95a7ce1fbbd4c00af9cd06eb0f91df8f9.tar.gz |
mktables: Fix up Name_Alias in early Unicodes
perl needs the Name_Alias property accessible in all releases in order
for charnames to work properly. However the property was not created
until Unicode version 5.0. Previously, the property was made available
to all Unicode versions, which is contrary to the policy of exposing
properties to public use only when Unicode so exposes them. (That policy
keeps the behavior as close as possible to what Unicode specifies.) This commit
creates an internal-only property, _Perl_Name_Alias, for the perl core, and
removes public access to Name_Alias in Unicode releases earlier than 5.0.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/Unicode/UCD.pm | 6 | ||||
-rw-r--r-- | lib/Unicode/UCD.t | 2 | ||||
-rw-r--r-- | lib/unicore/mktables | 357 |
3 files changed, 78 insertions, 287 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index bf4ba53c3a..1854982491 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -3258,8 +3258,8 @@ RETRY: # we need to also read in that table. Create a hash with the keys # being the code points, and the values being a list of the # aliases for the code point key. - my ($aliases_code_points, $aliases_maps, undef, undef) = - &prop_invmap('Name_Alias'); + my ($aliases_code_points, $aliases_maps, undef, undef) + = &prop_invmap("_Perl_Name_Alias", '_perl_core_internal_ok'); my %aliases; for (my $i = 0; $i < @$aliases_code_points; $i++) { my $code_point = $aliases_code_points->[$i]; @@ -3894,7 +3894,7 @@ RETRY: map { $_ = [ split " ", $_ ] if $_ =~ / / } @invmap; $format = 'sl'; } - elsif ($returned_prop eq 'ToNameAlias') { + elsif ($returned_prop =~ / To ( _Perl )? NameAlias/x) { # This property currently doesn't have any lists, but theoretically # could diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index 1bf036cbf6..22b2edbc93 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -2437,7 +2437,7 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) { # And remove the aliases. We read in the Name_Alias property, and go # through them one by one. my ($aliases_code_points, $aliases_maps, undef, undef) - = &prop_invmap('Name_Alias'); + = &prop_invmap('_Perl_Name_Alias', '_perl_core_internal_ok'); for (my $i = 0; $i < @$aliases_code_points; $i++) { my $code_point = $aliases_code_points->[$i]; diff --git a/lib/unicore/mktables b/lib/unicore/mktables index d96ff0e923..2402017a63 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -2401,7 +2401,12 @@ sub trace { return main::trace(@_); } # message giving the needed things, and add it to the list # of such to output before actual processing happens # (hence the user finds out all of them in one run). 
- my $string_version = sprintf "%vd", $first_released{$addr}; + # Instead of creating a general method for NameAliases, + # hard-code it here, as there is unlikely to ever be a + # second one which needs special handling. + my $string_version = ($file{$addr} eq "NameAliases.txt") + ? 'at least 6.1 (the later, the better)' + : sprintf "%vd", $first_released{$addr}; push @missing_early_files, <<END; '$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'. END @@ -11257,11 +11262,12 @@ END my $file = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - # Create a new property specially located that is a combination of the + # Create a new property specially located that is a combination of # various Name properties: Name, Unicode_1_Name, Named Sequences, and - # Name_Alias properties. (The final duplicates elements of the - # first.) A comment for it will later be constructed based on the - # actual properties present and used + # _Perl_Name_Alias properties. (The final one duplicates elements of the + # first, and starting in v6.1, is the same as the 'Name_Alias + # property.) A comment for the new property will later be constructed + # based on the actual properties present and used $perl_charname = Property->new('Perl_Charnames', Default_Map => "", Directory => File::Spec->curdir(), @@ -13101,272 +13107,47 @@ sub generate_GCB { main::process_generic_property_file($file); } -sub setup_early_name_alias { - my $file= shift; - Carp::carp_extra_args(\@_) if main::DEBUG && @_; - # This has the effect of pretending that the Name_Alias property was - # available in all Unicode releases. Strictly speaking, this property - # should not be availabe in early releases, but doing this allows - # charnames.pm to work on older releases without change. Prior to v5.16 - # it had these names hard-coded inside it. Unicode 6.1 came along and - # created these names, and so they were removed from charnames. 
+sub fixup_early_perl_name_alias { - my $aliases = property_ref('Name_Alias'); - if (! defined $aliases) { - $aliases = Property->new('Name_Alias', Default_Map => ""); - } + # Different versions of Unicode have varying support for the name synonyms + # below. Just include everything. As of 6.1, all these are correct in + # the Unicode-supplied file. + + my $file= shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; - $file->insert_lines(get_old_name_aliases()); - return; -} + # ALERT did not come along until 6.0, at which point it became preferred + # over BELL. By inserting it last in early releases, BELL is preferred + # over it; and vice-vers in 6.0 + my $type_for_bell = ($v_version lt v6.0.0) + ? 'correction' + : 'alternate'; + $file->insert_lines(split /\n/, <<END +0007;BELL; $type_for_bell +000A;LINE FEED (LF);alternate +000C;FORM FEED (FF);alternate +000D;CARRIAGE RETURN (CR);alternate +0085;NEXT LINE (NEL);alternate +END -sub get_old_name_aliases () { + ); - # The Unicode_1_Name field, contains most of these names. One would - # expect, given the field's name, that its values would be fixed across - # versions, giving the true Unicode version 1 name for the character. - # Sadly, this is not the case. Actually Version 1.1.5 had no names for - # any of the controls; Version 2.0 introduced names for the C0 controls, - # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2 - # changed some names: it + # One might think that the the 'Unicode_1_Name' field, could work for most + # of the above names, but sadly that field varies depending on the + # release. Version 1.1.5 had no names for any of the controls; Version + # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names. + # 3.0.1 removed the name INDEX; and 3.2 changed some names: # changed to parenthesized versions like "NEXT LINE" to # "NEXT LINE (NEL)"; # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;; # changed e.g. 
FILE SEPARATOR to INFORMATION SEPARATOR FOUR - # This list contains all the names that were defined so that - # charnames::vianame(), etc. understand them all EVEN if this version of - # Unicode didn't specify them (this could be construed as a bug). - # mktables elsewhere gives preference to the Unicode_1_Name field over - # these names, so that viacode() will return the correct value for that - # version of Unicode, except when that version doesn't define a name, - # viacode() will return one anyway (this also could be construed as a - # bug). But these potential "bugs" allow for the smooth working of code - # on earlier Unicode releases. - - my @return = split /\n/, <<'END'; -0000;NULL;control -0000;NUL;abbreviation -0001;START OF HEADING;control -0001;SOH;abbreviation -0002;START OF TEXT;control -0002;STX;abbreviation -0003;END OF TEXT;control -0003;ETX;abbreviation -0004;END OF TRANSMISSION;control -0004;EOT;abbreviation -0005;ENQUIRY;control -0005;ENQ;abbreviation -0006;ACKNOWLEDGE;control -0006;ACK;abbreviation -0007;BELL;control -0007;BEL;abbreviation -0008;BACKSPACE;control -0008;BS;abbreviation -0009;CHARACTER TABULATION;control -0009;HORIZONTAL TABULATION;control -0009;HT;abbreviation -0009;TAB;abbreviation -000A;LINE FEED;control -000A;LINE FEED (LF);control -000A;NEW LINE;control -000A;END OF LINE;control -000A;LF;abbreviation -000A;NL;abbreviation -000A;EOL;abbreviation -000B;LINE TABULATION;control -000B;VERTICAL TABULATION;control -000B;VT;abbreviation -000C;FORM FEED;control -000C;FORM FEED (FF);control -000C;FF;abbreviation -000D;CARRIAGE RETURN;control -000D;CARRIAGE RETURN (CR);control -000D;CR;abbreviation -000E;SHIFT OUT;control -000E;LOCKING-SHIFT ONE;control -000E;SO;abbreviation -000F;SHIFT IN;control -000F;LOCKING-SHIFT ZERO;control -000F;SI;abbreviation -0010;DATA LINK ESCAPE;control -0010;DLE;abbreviation -0011;DEVICE CONTROL ONE;control -0011;DC1;abbreviation -0012;DEVICE CONTROL TWO;control -0012;DC2;abbreviation -0013;DEVICE CONTROL 
THREE;control -0013;DC3;abbreviation -0014;DEVICE CONTROL FOUR;control -0014;DC4;abbreviation -0015;NEGATIVE ACKNOWLEDGE;control -0015;NAK;abbreviation -0016;SYNCHRONOUS IDLE;control -0016;SYN;abbreviation -0017;END OF TRANSMISSION BLOCK;control -0017;ETB;abbreviation -0018;CANCEL;control -0018;CAN;abbreviation -0019;END OF MEDIUM;control -0019;EOM;abbreviation -001A;SUBSTITUTE;control -001A;SUB;abbreviation -001B;ESCAPE;control -001B;ESC;abbreviation -001C;INFORMATION SEPARATOR FOUR;control -001C;FILE SEPARATOR;control -001C;FS;abbreviation -001D;INFORMATION SEPARATOR THREE;control -001D;GROUP SEPARATOR;control -001D;GS;abbreviation -001E;INFORMATION SEPARATOR TWO;control -001E;RECORD SEPARATOR;control -001E;RS;abbreviation -001F;INFORMATION SEPARATOR ONE;control -001F;UNIT SEPARATOR;control -001F;US;abbreviation -0020;SP;abbreviation -007F;DELETE;control -007F;DEL;abbreviation -0080;PADDING CHARACTER;figment -0080;PAD;abbreviation -0081;HIGH OCTET PRESET;figment -0081;HOP;abbreviation -0082;BREAK PERMITTED HERE;control -0082;BPH;abbreviation -0083;NO BREAK HERE;control -0083;NBH;abbreviation -0084;INDEX;control -0084;IND;abbreviation -0085;NEXT LINE;control -0085;NEXT LINE (NEL);control -0085;NEL;abbreviation -0086;START OF SELECTED AREA;control -0086;SSA;abbreviation -0087;END OF SELECTED AREA;control -0087;ESA;abbreviation -0088;CHARACTER TABULATION SET;control -0088;HORIZONTAL TABULATION SET;control -0088;HTS;abbreviation -0089;CHARACTER TABULATION WITH JUSTIFICATION;control -0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control -0089;HTJ;abbreviation -008A;LINE TABULATION SET;control -008A;VERTICAL TABULATION SET;control -008A;VTS;abbreviation -008B;PARTIAL LINE FORWARD;control -008B;PARTIAL LINE DOWN;control -008B;PLD;abbreviation -008C;PARTIAL LINE BACKWARD;control -008C;PARTIAL LINE UP;control -008C;PLU;abbreviation -008D;REVERSE LINE FEED;control -008D;REVERSE INDEX;control -008D;RI;abbreviation -008E;SINGLE SHIFT TWO;control -008E;SINGLE-SHIFT-2;control 
-008E;SS2;abbreviation -008F;SINGLE SHIFT THREE;control -008F;SINGLE-SHIFT-3;control -008F;SS3;abbreviation -0090;DEVICE CONTROL STRING;control -0090;DCS;abbreviation -0091;PRIVATE USE ONE;control -0091;PRIVATE USE-1;control -0091;PU1;abbreviation -0092;PRIVATE USE TWO;control -0092;PRIVATE USE-2;control -0092;PU2;abbreviation -0093;SET TRANSMIT STATE;control -0093;STS;abbreviation -0094;CANCEL CHARACTER;control -0094;CCH;abbreviation -0095;MESSAGE WAITING;control -0095;MW;abbreviation -0096;START OF GUARDED AREA;control -0096;START OF PROTECTED AREA;control -0096;SPA;abbreviation -0097;END OF GUARDED AREA;control -0097;END OF PROTECTED AREA;control -0097;EPA;abbreviation -0098;START OF STRING;control -0098;SOS;abbreviation -0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment -0099;SGC;abbreviation -009A;SINGLE CHARACTER INTRODUCER;control -009A;SCI;abbreviation -009B;CONTROL SEQUENCE INTRODUCER;control -009B;CSI;abbreviation -009C;STRING TERMINATOR;control -009C;ST;abbreviation -009D;OPERATING SYSTEM COMMAND;control -009D;OSC;abbreviation -009E;PRIVACY MESSAGE;control -009E;PM;abbreviation -009F;APPLICATION PROGRAM COMMAND;control -009F;APC;abbreviation -00A0;NBSP;abbreviation -00AD;SHY;abbreviation -200B;ZWSP;abbreviation -200C;ZWNJ;abbreviation -200D;ZWJ;abbreviation -200E;LRM;abbreviation -200F;RLM;abbreviation -202A;LRE;abbreviation -202B;RLE;abbreviation -202C;PDF;abbreviation -202D;LRO;abbreviation -202E;RLO;abbreviation -FEFF;BYTE ORDER MARK;alternate -FEFF;BOM;abbreviation -FEFF;ZWNBSP;abbreviation -END - - if ($v_version ge v3.0.0) { - push @return, split /\n/, <<'END'; -180B; FVS1; abbreviation -180C; FVS2; abbreviation -180D; FVS3; abbreviation -180E; MVS; abbreviation -202F; NNBSP; abbreviation -END - } - - if ($v_version ge v3.2.0) { - push @return, split /\n/, <<'END'; -034F; CGJ; abbreviation -205F; MMSP; abbreviation -2060; WJ; abbreviation -END - # Add in VS1..VS16 - my $cp = 0xFE00 - 1; - for my $i (1..16) { - push @return, sprintf("%04X; VS%d; 
abbreviation", $cp + $i, $i); - } - } - if ($v_version ge v4.0.0) { # Add in VS17..VS256 - my $cp = 0xE0100 - 17; - for my $i (17..256) { - push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i); - } - } - - # ALERT did not come along until 6.0, at which point it became preferred - # over BELL, and was never in the Unicode_1_Name field. For the same - # reasons, that the other names are made known to all releases by this - # function, we make ALERT known too. By inserting it - # last in early releases, BELL is preferred over it; and vice-vers in 6.0 - my $alert = '0007; ALERT; control'; - if ($v_version lt v6.0.0) { - push @return, $alert; - } - else { - unshift @return, $alert; - } + # + # All these are present in the 6.1 NameAliases.txt - return @return; + return; } sub filter_later_version_name_alias_line { @@ -13393,9 +13174,8 @@ sub filter_later_version_name_alias_line { sub filter_early_version_name_alias_line { # Early versions did not have the trailing alias type field; implicitly it - # was 'correction'. But our synthetic lines we add in this program do - # have it, so test for the type field. - $_ .= "; correction" if $_ !~ /;.*;/; + # was 'correction'. + $_ .= "; correction"; filter_later_version_name_alias_line; return; @@ -14605,7 +14385,7 @@ sub compile_perl() { + utf8::unicode_to_native(0xA0) # NBSP ); - my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias'); + my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias'); if (@named_sequences) { push @composition, 'Named_Sequence'; @@ -14616,15 +14396,15 @@ sub compile_perl() { my $alias_sentence = ""; my %abbreviations; - my $alias = property_ref('Name_Alias'); - $perl_charname->set_proxy_for('Name_Alias'); - - # Add each entry in Name_Alias to Perl_Charnames. Where these go with - # respect to any existing entry depends on the entry type. Corrections go - # before said entry, as they should be returned in preference over the - # existing entry. 
(A correction to a correction should be later in the - # Name_Alias table, so it will correctly precede the erroneous correction - # in Perl_Charnames.) + my $alias = property_ref('_Perl_Name_Alias'); + $perl_charname->set_proxy_for('_Perl_Name_Alias'); + + # Add each entry in _Perl_Name_Alias to Perl_Charnames. Where these go + # with respect to any existing entry depends on the entry type. + # Corrections go before said entry, as they should be returned in + # preference over the existing entry. (A correction to a correction + # should be later in the _Perl_Name_Alias table, so it will correctly + # precede the erroneous correction in Perl_Charnames.) # # Abbreviations go after everything else, so they are saved temporarily in # a hash for later. @@ -14659,7 +14439,7 @@ sub compile_perl() { $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type); } $alias_sentence = <<END; -The Name_Alias property adds duplicate code point entries that are +The _Perl_Name_Alias property adds duplicate code point entries that are alternatives to the original name. If an addition is a corrected name, it will be physically first in the table. The original (less correct, but still valid) name will be next; then any alternatives, in no particular @@ -14699,12 +14479,6 @@ END Replace => $before_or_after); } - # But in this version only, the ALERT has precedence over BELL, the - # Unicode_1_Name that would otherwise have precedence. - if ($v_version eq v6.0.0) { - $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE); - } - # Now that have everything added, add in abbreviations after # everything else. Sort so results don't change between runs of this # program @@ -18839,15 +18613,32 @@ my @input_file_objects = ( Withdrawn => v5.2, Skip => $Documentation, ), - Input_file->new('NameAliases.txt', v0, + Input_file->new('NameAliases.txt', v5.0, Property => 'Name_Alias', - Pre_Handler => ($v_version le v6.0.0) - ? 
\&setup_early_name_alias - : undef, Each_Line_Handler => ($v_version le v6.0.0) ? \&filter_early_version_name_alias_line : \&filter_later_version_name_alias_line, ), + # NameAliases.txt came along in v5.0. The above constructor handles + # this. But until 6.1, it was lacking some information needed by core + # perl. The constructor below handles that. It is either a kludge or + # clever, depending on your point of view. The 'Withdrawn' parameter + # indicates not to use it at all starting in 6.1 (so the above + # constructor applies), and the 'v6.1' parameter indicates to use the + # Early parameter before 6.1. Therefore 'Early" is always used, + # yielding the internal-only property '_Perl_Name_Alias', which it + # gets from a NameAliases.txt from 6.1 or later stored in + # N_Asubst.txt. In combination with the above constructor, + # 'Name_Alias' is publicly accessible starting with v5.0, and the + # better 6.1 version is accessible to perl core in all releases. + Input_file->new("NameAliases.txt", v6.1, + Withdrawn => v6.1, + Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ], + Property => 'Name_Alias', + EOF_Handler => \&fixup_early_perl_name_alias, + Each_Line_Handler => + \&filter_later_version_name_alias_line, + ), Input_file->new('NamedSqProv.txt', v5.0.0, Skip => 'Named sequences proposed for inclusion in a ' . 'later version of the Unicode Standard; if you ' |