author     Karl Williamson <public@khwilliamson.com>  2011-03-03 18:33:18 -0700
committer  Karl Williamson <public@khwilliamson.com>  2011-03-03 19:26:17 -0700
commit     05dbc6f80f8f2d5774f53874803f5a20450bbe82 (patch)
tree       add277ddca3bddff4ea4ac1aa52d77ab6f069d2d /lib/Unicode
parent     36c2430c54431c750134fb5add2327486301d66f (diff)
download   perl-05dbc6f80f8f2d5774f53874803f5a20450bbe82.tar.gz
UCD.pm: Remove reliance on UnicodeData.txt
In doing so, there were a number of bug fixes made, as it now relies on files processed by mktables, which has intelligence to fix a number of problems with UnicodeData.txt.

This is essentially a rewrite of charinfo(). It previously had hard-coded the ranges in UnicodeData.txt, instead of examining the file to see what was there. This had not been updated for some time and was out-of-date, with the result that the newer ranges (all CJK) were quite wrong. The new code does not have such a reliance, so new versions of Unicode should not break this the way they previously would have.

This may be slower than what was previously there, as it reads several smaller files instead of one very large one. But the principal reason to do this work was to save disk space.

It was previously thought that the function could continue to use UnicodeData.txt if it exists on the machine, but this would have required fixing all the bugs that are now fixed automatically by using the processed files.
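For orientation, a minimal usage sketch of the interface being reworked; this is not part of the patch, and the printed values are simply those the updated tests below expect:

    use strict;
    use warnings;
    use Unicode::UCD qw(charinfo);

    # charinfo() returns a hash reference describing one code point,
    # or undef for unassigned (or non-Unicode) code points.
    my $info = charinfo(0xAC00);
    print "$info->{code} $info->{name}\n";            # AC00 HANGUL SYLLABLE GA
    print "category: $info->{category}\n";            # Lo
    print "decomposition: $info->{decomposition}\n";  # 1100 1161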
Diffstat (limited to 'lib/Unicode')
-rw-r--r--  lib/Unicode/UCD.pm  262
-rw-r--r--  lib/Unicode/UCD.t    12
2 files changed, 147 insertions(+), 127 deletions(-)
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 475f2832c5..1fece2773d 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -4,6 +4,7 @@ use strict;
use warnings;
no warnings 'surrogate'; # surrogates can be inputs to this
use charnames ();
+use Unicode::Normalize qw(getCombinClass NFKD);
our $VERSION = '0.32';
@@ -90,7 +91,6 @@ to 16 bits (the number of Unicode code points is open-ended, in theory
unlimited): you may have more than 4 hexdigits.
=cut
-my $UNICODEFH;
my $BLOCKSFH;
my $VERSIONFH;
my $CASEFOLDFH;
@@ -264,134 +264,152 @@ sub _getcode {
return;
}
-# Lingua::KO::Hangul::Util not part of the standard distribution
-# but it will be used if available.
-
-eval { require Lingua::KO::Hangul::Util };
-my $hasHangulUtil = ! $@;
-if ($hasHangulUtil) {
- Lingua::KO::Hangul::Util->import();
-}
-
-sub hangul_decomp { # internal: called from charinfo
- if ($hasHangulUtil) {
- my @tmp = decomposeHangul(shift);
- return sprintf("%04X %04X", @tmp) if @tmp == 2;
- return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
- }
- return;
-}
-
-sub hangul_charname { # internal: called from charinfo
- return sprintf("HANGUL SYLLABLE-%04X", shift);
+# Populated by _numeric(). Converts real number back to input rational
+my %real_to_rational;
+
+# To store the contents of files found on disk.
+my @BIDIS;
+my @CATEGORIES;
+my @DECOMPOSITIONS;
+my @NUMERIC_TYPES;
+my @SIMPLE_LOWER;
+my @SIMPLE_TITLE;
+my @SIMPLE_UPPER;
+my @UNICODE_1_NAMES;
+
+sub _charinfo_case {
+
+ # Returns the value to set into one of the case fields in the charinfo
+ # structure.
+ # $char is the character,
+ # $cased is the case-changed character
+ # $file is the file in lib/unicore/To/$file that contains the data
+ # needed for this, in the form that _search() understands.
+ # $array_ref points to the array holding the contents of $file. It will
+ # be populated if empty.
+ # By using the 'uc', etc. functions, we avoid loading more files into
+ # memory except for those rare cases where the simple casing (which is
+ # what charinfo() has always returned) is different from the full
+ # casing.
+ my ($char, $cased, $file, $array_ref) = @_;
+
+ return "" if $cased eq $char;
+
+ return sprintf("%04X", ord $cased) if length($cased) == 1;
+
+ @$array_ref =_read_table("unicore/To/$file") unless @$array_ref;
+ return _search($array_ref, 0, $#$array_ref, ord $char) // "";
}
-sub han_charname { # internal: called from charinfo
- return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
-}
+sub charinfo {
-# Overwritten by data in file
-my %first_last = (
- 'CJK Ideograph Extension A' => [ 0x3400, 0x4DB5 ],
- 'CJK Ideograph' => [ 0x4E00, 0x9FA5 ],
- 'CJK Ideograph Extension B' => [ 0x20000, 0x2A6D6 ],
-);
-
-get_charinfo_ranges();
-
-sub get_charinfo_ranges {
- my @blocks = keys %first_last;
-
- my $fh;
- openunicode( \$fh, 'UnicodeData.txt' );
- if( defined $fh ){
- while( my $line = <$fh> ){
- next unless $line =~ /(?:First|Last)/;
- if( grep{ $line =~ /[^;]+;<$_\s*,\s*(?:First|Last)>/ }@blocks ){
- my ($number,$block,$type);
- ($number,$block) = split /;/, $line;
- $block =~ s/<|>//g;
- ($block,$type) = split /, /, $block;
- my $index = $type eq 'First' ? 0 : 1;
- $first_last{ $block }->[$index] = hex $number;
- }
- }
- }
-}
+ # This function has traditionally mimicked what is in UnicodeData.txt,
+ # warts and all. This is a re-write that avoids UnicodeData.txt so that
+ # it can be removed to save disk space. Instead, this assembles
+ # information gotten by other methods that get data from various other
+ # files. It uses charnames to get the character name; and various
+ # mktables tables.
-my @CharinfoRanges = (
-# block name
-# [ first, last, coderef to name, coderef to decompose ],
-# CJK Ideographs Extension A
- [ @{ $first_last{'CJK Ideograph Extension A'} }, \&han_charname, undef ],
-# CJK Ideographs
- [ @{ $first_last{'CJK Ideograph'} }, \&han_charname, undef ],
-# Hangul Syllables
- [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
-# Non-Private Use High Surrogates
- [ 0xD800, 0xDB7F, undef, undef ],
-# Private Use High Surrogates
- [ 0xDB80, 0xDBFF, undef, undef ],
-# Low Surrogates
- [ 0xDC00, 0xDFFF, undef, undef ],
-# The Private Use Area
- [ 0xE000, 0xF8FF, undef, undef ],
-# CJK Ideographs Extension B
- [ @{ $first_last{'CJK Ideograph Extension B'} }, \&han_charname, undef ],
-# Plane 15 Private Use Area
- [ 0xF0000, 0xFFFFD, undef, undef ],
-# Plane 16 Private Use Area
- [ 0x100000, 0x10FFFD, undef, undef ],
-);
+ use feature 'unicode_strings';
-sub charinfo {
my $arg = shift;
my $code = _getcode($arg);
- croak __PACKAGE__, "::charinfo: unknown code '$arg'"
- unless defined $code;
- my $hexk = sprintf("%06X", $code);
- my($rcode,$rname,$rdec);
- foreach my $range (@CharinfoRanges){
- if ($range->[0] <= $code && $code <= $range->[1]) {
- $rcode = $hexk;
- $rcode =~ s/^0+//;
- $rcode = sprintf("%04X", hex($rcode));
- $rname = $range->[2] ? $range->[2]->($code) : '';
- $rdec = $range->[3] ? $range->[3]->($code) : '';
- $hexk = sprintf("%06X", $range->[0]); # replace by the first
- last;
- }
+ croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
+
+ # Non-unicode implies undef.
+ return if $code > 0x10FFFF;
+
+ my %prop;
+ my $char = chr($code);
+
+ @CATEGORIES =_read_table("unicore/To/Gc.pl") unless @CATEGORIES;
+ $prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
+ // $utf8::SwashInfo{'ToGc'}{'missing'};
+
+ return if $prop{'category'} eq 'Cn'; # Unassigned code points are undef
+
+ $prop{'code'} = sprintf "%04X", $code;
+ $prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>'
+ : (charnames::viacode($code) // "");
+
+ $prop{'combining'} = getCombinClass($code);
+
+ @BIDIS =_read_table("unicore/To/Bc.pl") unless @BIDIS;
+ $prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
+ // $utf8::SwashInfo{'ToBc'}{'missing'};
+
+ # For most code points, we can just read in "unicore/Decomposition.pl", as
+ # its contents are exactly what should be output. But that file doesn't
+ # contain the data for the Hangul syllable decompositions, which can be
+ # algorithmically computed, and NFKD() does that, so we call NFKD() for
+ # those. We can't use NFKD() for everything, as it does a complete
+ # recursive decomposition, and what this function has always done is to
+ # return what's in UnicodeData.txt, which doesn't have the recursiveness
+ # specified but does include the decomposition types. No decomposition
+ # implies an empty field; otherwise, all but "Canonical" imply a
+ # compatibility decomposition, and
+ # the type is prefixed to that, as it is in UnicodeData.txt
+ if ($char =~ /\p{Block=Hangul_Syllables}/) {
+ # The code points of the decomposition are output in standard Unicode
+ # hex format, separated by blanks.
+ $prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
+ unpack "U*", NFKD($char);
}
- openunicode(\$UNICODEFH, "UnicodeData.txt");
- if (defined $UNICODEFH) {
- use Search::Dict 1.02;
- if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
- my $line = <$UNICODEFH>;
- return unless defined $line;
- chomp $line;
- my %prop;
- @prop{qw(
- code name category
- combining bidi decomposition
- decimal digit numeric
- mirrored unicode10 comment
- upper lower title
- )} = split(/;/, $line, -1);
- $hexk =~ s/^0+//;
- $hexk = sprintf("%04X", hex($hexk));
- if ($prop{code} eq $hexk) {
- $prop{block} = charblock($code);
- $prop{script} = charscript($code);
- if(defined $rname){
- $prop{code} = $rcode;
- $prop{name} = $rname;
- $prop{decomposition} = $rdec;
- }
- return \%prop;
- }
- }
+ else {
+ @DECOMPOSITIONS = _read_table("unicore/Decomposition.pl")
+ unless @DECOMPOSITIONS;
+ $prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
+ $code) // "";
}
- return;
+
+ # Can use num() to get the numeric values, if any.
+ if (! defined (my $value = num($char))) {
+ $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = "";
+ }
+ else {
+ if ($char =~ /\d/) {
+ $prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = $value;
+ }
+ else {
+
+ # For non-decimal-digits, we have to read in the Numeric type
+ # to distinguish them. It is not just a matter of integer vs.
+ # rational, as some whole number values are not considered digits,
+ # e.g., TAMIL NUMBER TEN.
+ $prop{'decimal'} = "";
+
+ @NUMERIC_TYPES =_read_table("unicore/To/Nt.pl")
+ unless @NUMERIC_TYPES;
+ if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
+ eq 'Digit')
+ {
+ $prop{'digit'} = $prop{'numeric'} = $value;
+ }
+ else {
+ $prop{'digit'} = "";
+ $prop{'numeric'} = $real_to_rational{$value} // $value;
+ }
+ }
+ }
+
+ $prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
+
+ @UNICODE_1_NAMES =_read_table("unicore/To/Na1.pl") unless @UNICODE_1_NAMES;
+ $prop{'unicode10'} = _search(\@UNICODE_1_NAMES, 0, $#UNICODE_1_NAMES, $code)
+ // "";
+
+ # Setting the comment to empty is correct starting in Unicode 6.0, and
+ # num() also requires 6.0, so there is no need to test for the version
+ # again here.
+ $prop{'comment'} = "";
+
+ $prop{'upper'} = _charinfo_case($char, uc $char, '_suc.pl', \@SIMPLE_UPPER);
+ $prop{'lower'} = _charinfo_case($char, lc $char, '_slc.pl', \@SIMPLE_LOWER);
+ $prop{'title'} = _charinfo_case($char, ucfirst $char, '_stc.pl',
+ \@SIMPLE_TITLE);
+
+ $prop{block} = charblock($code);
+ $prop{script} = charscript($code);
+ return \%prop;
}
sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
@@ -1228,9 +1246,11 @@ sub _numeric {
foreach my $entry (@numbers) {
my ($start, $end, $value) = @$entry;
- # If value contains a slash, convert to decimal
+ # If the value contains a slash, convert it to decimal, and add an entry
+ # to the reverse hash used by charinfo().
if ((my @rational = split /\//, $value) == 2) {
my $real = $rational[0] / $rational[1];
+ $real_to_rational{$real} = $value;
$value = $real;
}
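The rewritten charinfo() leans on tables read by _read_table() and looked up with _search(), which does a binary search over code point ranges. The following is only an illustrative sketch of that range-lookup technique, with a hypothetical helper name and a made-up table; the real _search() takes the bounds as explicit arguments:

    use strict;
    use warnings;

    # Hypothetical helper (illustration only): binary search over a sorted
    # table of [ start_code, end_code, value ] ranges, returning the value of
    # the range containing $code, or undef if no range contains it.
    sub range_lookup {
        my ($table, $code) = @_;
        my ($lo, $hi) = (0, $#$table);
        while ($lo <= $hi) {
            my $mid = int(($lo + $hi) / 2);
            my ($start, $end, $value) = @{ $table->[$mid] };
            if    ($code < $start) { $hi = $mid - 1; }
            elsif ($code > $end)   { $lo = $mid + 1; }
            else                   { return $value; }
        }
        return;
    }

    # Made-up two-range table in the same shape the unicore tables use.
    my @bidi = ( [ 0x0041, 0x005A, 'L' ], [ 0x0591, 0x05C7, 'NSM' ] );
    print range_lookup(\@bidi, 0x05B0) // '<none>', "\n";   # NSM
    print range_lookup(\@bidi, 0x0020) // '<none>', "\n";   # <none>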
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index c22c762a00..dca4c1a5a3 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -133,12 +133,12 @@ is($charinfo->{script}, 'Hebrew');
$charinfo = charinfo(0xAC00);
-is($charinfo->{code}, 'AC00', 'HANGUL SYLLABLE-AC00');
-is($charinfo->{name}, 'HANGUL SYLLABLE-AC00');
+is($charinfo->{code}, 'AC00', 'HANGUL SYLLABLE U+AC00');
+is($charinfo->{name}, 'HANGUL SYLLABLE GA');
is($charinfo->{category}, 'Lo');
is($charinfo->{combining}, '0');
is($charinfo->{bidi}, 'L');
-is($charinfo->{decomposition}, undef);
+is($charinfo->{decomposition}, '1100 1161');
is($charinfo->{decimal}, '');
is($charinfo->{digit}, '');
is($charinfo->{numeric}, '');
@@ -155,12 +155,12 @@ is($charinfo->{script}, 'Hangul');
$charinfo = charinfo(0xAE00);
-is($charinfo->{code}, 'AE00', 'HANGUL SYLLABLE-AE00');
-is($charinfo->{name}, 'HANGUL SYLLABLE-AE00');
+is($charinfo->{code}, 'AE00', 'HANGUL SYLLABLE U+AE00');
+is($charinfo->{name}, 'HANGUL SYLLABLE GEUL');
is($charinfo->{category}, 'Lo');
is($charinfo->{combining}, '0');
is($charinfo->{bidi}, 'L');
-is($charinfo->{decomposition}, undef);
+is($charinfo->{decomposition}, "1100 1173 11AF");
is($charinfo->{decimal}, '');
is($charinfo->{digit}, '');
is($charinfo->{numeric}, '');
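The expected decompositions in the updated tests (1100 1161 for U+AC00, 1100 1173 11AF for U+AE00) come from the standard algorithmic Hangul syllable decomposition, which the module now obtains via NFKD() rather than from UnicodeData.txt. A rough sketch of that arithmetic follows; the constants are the standard ones from the Unicode specification, and the helper name is made up:

    use strict;
    use warnings;

    # Illustrative only: algorithmic decomposition of a precomposed Hangul
    # syllable into its leading consonant, vowel, and optional trailing
    # consonant jamo, printed in the "XXXX XXXX" form charinfo() returns.
    sub hangul_decomposition {
        my $code = shift;
        my ($SBase, $LBase, $VBase, $TBase) = (0xAC00, 0x1100, 0x1161, 0x11A7);
        my ($VCount, $TCount, $SCount) = (21, 28, 11172);
        my $SIndex = $code - $SBase;
        return unless $SIndex >= 0 && $SIndex < $SCount;  # not a Hangul syllable
        my $L = $LBase + int($SIndex / ($VCount * $TCount));
        my $V = $VBase + int(($SIndex % ($VCount * $TCount)) / $TCount);
        my $T = $TBase + $SIndex % $TCount;
        my @parts = ($L, $V);
        push @parts, $T if $T != $TBase;   # TBase itself means no trailing jamo
        return join " ", map { sprintf "%04X", $_ } @parts;
    }

    print hangul_decomposition(0xAC00), "\n";   # 1100 1161
    print hangul_decomposition(0xAE00), "\n";   # 1100 1173 11AF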