diff options
-rw-r--r-- | lib/unicore/mktables | 72 | ||||
-rw-r--r-- | pod/perldelta.pod | 5 | ||||
-rw-r--r-- | pod/perlunicode.pod | 76 | ||||
-rw-r--r-- | pod/perluniintro.pod | 12 |
4 files changed, 142 insertions, 23 deletions
diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 3257a47ace..3004e6dc8c 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -779,6 +779,8 @@ push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1; push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0; push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana' if $v_version ge v4.1.0; +push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana' + if $v_version ge v6.0.0; # The lists below are hashes, so the key is the item in the list, and the # value is the reason why it is in the list. This makes generation of @@ -944,7 +946,11 @@ if ($v_version ge 5.2.0 && $v_version lt 6.0.0) { # Probably obsolete forever if ($v_version ge v4.1.0) { - $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common"'; + $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".'; +} +if ($v_version ge v6.0.0) { + $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using Script_Extensions=Katakana or Script_Extensions=Hiragana (or both)"'; + $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either Script_Extensions=Katakana or Script_Extensions=Hiragana"'; } # This program can create files for enumerated-like properties, such as @@ -1063,7 +1069,6 @@ my %ignored_files = ( 'EmojiSources.txt' => 'Not of general utility: for Japanese legacy cell-phone applications', 'IndicMatraCategory.txt' => 'Provisional', 'IndicSyllabicCategory.txt' => 'Provisional', - 'ScriptExtensions.txt' => 'Provisional', ); ### End of externally interesting definitions, except for @input_file_objects @@ -11135,6 +11140,35 @@ sub filter_old_style_normalization_lines { return; } +sub setup_script_extensions { + # The Script_Extensions property starts out 
with a clone of the Script + # property. + + my $sc = property_ref("Script"); + my $scx = Property->new("scx", Full_Name => "Script_Extensions", + Initialize => $sc, + Default_Map => $sc->default_map, + Pre_Declared_Maps => 0, + ); + $scx->add_comment(join_lines( <<END +The values for code points that appear in one script are just the same as for +the 'Script' property. Likewise the values for those that appear in many +scripts are either 'Common' or 'Inherited', same as with 'Script'. But the +values of code points that appear in a few scripts are a space separated list +of those scripts. +END + )); + + # Make the scx's tables and aliases for them the same as sc's + foreach my $table ($sc->tables) { + my $scx_table = $scx->add_match_table($table->name, + Full_Name => $table->full_name); + foreach my $alias ($table->aliases) { + $scx_table->add_alias($alias->name); + } + } +} + sub finish_Unicode() { # This routine should be called after all the Unicode files have been read # in. It: @@ -11384,7 +11418,35 @@ END )); } } - return + + # The Script_Extensions property started out as a clone of the Script + # property. But processing its data file caused some elements to be + # replaced with different data. (These elements were for the Common and + # Inherited properties.) This data is a qw() list of all the scripts that + # the code points in the given range are in. An example line is: + # 060C ; Arab Syrc Thaa # Po ARABIC COMMA + # + # The code above has created a new match table named "Arab Syrc Thaa" + # which contains 060C. (The cloned table started out with this code point + # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and + # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa" + # match table. This is repeated for all these tables and ranges. The map + # data is retained in the map table for reference, but the spurious match + # tables are deleted. 
+ + my $scx = property_ref("Script_Extensions"); + foreach my $table ($scx->tables) { + next unless $table->name =~ /\s/; # Only the new tables have a space + # in their names, and all do + my @scripts = split /\s+/, $table->name; + foreach my $script (@scripts) { + my $script_table = $scx->table($script); + $script_table += $table; + } + $scx->delete_match_table($table); + } + + return; } sub compile_perl() { @@ -14585,6 +14647,10 @@ my @input_file_objects = ( Optional => 1, Each_Line_Handler => \&filter_unihan_line, ), + Input_file->new('ScriptExtensions.txt', v6.0.0, + Property => 'Script_Extensions', + Pre_Handler => \&setup_script_extensions, + ), ); # End of all the preliminaries. diff --git a/pod/perldelta.pod b/pod/perldelta.pod index f4fc9c7b84..63061898ee 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -48,6 +48,11 @@ The restriction that you can only have one C<study> active at a time has been removed. You can now usefully C<study> as many strings as you want (until you exhaust memory). +=head2 The Unicode C<Script_Extensions> property is now supported. + +New in Unicode 6.0, this is an improved C<Script> property. Details +are in L<perlunicode/Scripts>. + =head1 Security XXX Any security-related notices go here. In particular, any security diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index c7bdef4bcb..4779cc5dca 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -470,11 +470,63 @@ The world's languages are written in many different scripts. This sentence written in Cyrillic, and Greek is written in, well, Greek; Japanese mainly in Hiragana or Katakana. There are many more. -The Unicode Script property gives what script a given character is in, -and the property can be specified with the compound form like -C<\p{Script=Hebrew}> (short: C<\p{sc=hebr}>). Perl furnishes shortcuts for all -script names. You can omit everything up through the equals (or colon), and -simply write C<\p{Latin}> or C<\P{Cyrillic}>. 
+The Unicode Script and Script_Extensions properties give what script a +given character is in. Either property can be specified with the +compound form like +C<\p{Script=Hebrew}> (short: C<\p{sc=hebr}>), or +C<\p{Script_Extensions=Javanese}> (short: C<\p{scx=java}>). +In addition, Perl furnishes shortcuts for all +C<Script> property names. You can omit everything up through the equals +(or colon), and simply write C<\p{Latin}> or C<\P{Cyrillic}>. +(This is not true for C<Script_Extensions>, which is required to be +written in the compound form.) + +The difference between these two properties involves characters that are +used in multiple scripts. For example the digits '0' through '9' are +used in many parts of the world. These are placed in a script named +C<Common>. Other characters are used in just a few scripts. For +example, the "KATAKANA-HIRAGANA DOUBLE HYPHEN" is used in both Japanese +scripts, Katakana and Hiragana, but nowhere else. The C<Script> +property places all characters that are used in multiple scripts in the +C<Common> script, while the C<Script_Extensions> property places those +that are used in only a few scripts into each of those scripts; while +still using C<Common> for those used in many scripts. 
Thus both these +match: + + "0" =~ /\p{sc=Common}/ # Matches + "0" =~ /\p{scx=Common}/ # Matches + +and only the first of these matches: + + "\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}" =~ /\p{sc=Common}/ # Matches + "\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}" =~ /\p{scx=Common}/ # No match + +And only the last two of these match: + + "\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}" =~ /\p{sc=Hiragana}/ # No match + "\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}" =~ /\p{sc=Katakana}/ # No match + "\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}" =~ /\p{scx=Hiragana}/ # Matches + "\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}" =~ /\p{scx=Katakana}/ # Matches + +C<Script_Extensions> is thus an improved C<Script>, in which there are +fewer characters in the C<Common> script, and correspondingly more in +other scripts. It is new in Unicode version 6.0, and its data are likely +to change significantly in later releases, as things get sorted out. + +(Actually, besides C<Common>, the C<Inherited> script contains +characters that are used in multiple scripts. These are modifier +characters which modify other characters, and inherit the script value +of the controlling character. Some of these are used in many scripts, +and so go into C<Inherited> in both C<Script> and C<Script_Extensions>. +Others are used in just a few scripts, so are in C<Inherited> in +C<Script>, but not in C<Script_Extensions>.) + +It is worth stressing that there are several different sets of digits in +Unicode that are equivalent to 0-9 and are matchable by C<\d> in a +regular expression. If they are used in a single language only, they +are in that language's C<Script> and C<Script_Extensions>. If they are +used in more than one script, they will be in C<sc=Common>, but only +if they are used in many scripts should they be in C<scx=Common>. A complete list of scripts and their shortcuts is in L<perluniprops>. @@ -497,20 +549,14 @@ other words, the ASCII characters. 
The "Latin" script contains some letters from this as well as several other blocks, like "Latin-1 Supplement", "Latin Extended-A", etc., but it does not contain all the characters from those blocks. It does not, for example, contain the digits 0-9, because -those digits are shared across many scripts. The digits 0-9 and similar groups, -like punctuation, are in the script called C<Common>. There is also a -script called C<Inherited> for characters that modify other characters, -and inherit the script value of the controlling character. (Note that -there are several different sets of digits in Unicode that are -equivalent to 0-9 and are matchable by C<\d> in a regular expression. -If they are used in a single language only, they are in that language's -script. Only sets that are used across several languages are in the -C<Common> script.) +those digits are shared across many scripts, and hence are in the +C<Common> script. For more about scripts versus blocks, see UAX#24 "Unicode Script Property": L<http://www.unicode.org/reports/tr24> -The Script property is likely to be the one you want to use when processing +The C<Script> or C<Script_Extensions> properties are likely to be the +ones you want to use when processing natural language; the Block property may occasionally be useful in working with the nuts and bolts of Unicode. diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index c69dedf71b..a8a927df7e 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -109,11 +109,13 @@ C<block> of consecutive unallocated code points for its characters. So far, the number of code points in these blocks has always been evenly divisible by 16. Extras in a block, not currently needed, are left unallocated, for future growth. But there have been occasions when -a later relase needed more code points than available extras, and a new -block had to allocated somewhere else, not contiguous to the initial one -to handle the overflow. 
Thus, it became apparent early on that "block" -wasn't an adequate organizing principal, and so the C<script> property -was created. Those code points that are in overflow blocks can still +a later release needed more code points than the available extras, and a +new block had to be allocated somewhere else, not contiguous to the initial +one, to handle the overflow. Thus, it became apparent early on that +"block" wasn't an adequate organizing principle, and so the C<Script> +property was created. (Later an improved script property was added as +well, the C<Script_Extensions> property.) Those code points that are in +overflow blocks can still have the same script as the original ones. The script concept fits more closely with natural language: there is C<Latin> script, C<Greek> script, and so on; and there are several artificial scripts, like |