Add compexcl(), casefold(), and casespec() interfaces;

and make all interfaces to return hash references instead of hashes. p4raw-id: //depot/perl@11260
author: Jarkko Hietaniemi <jhi@iki.fi> 2001-07-10 15:00:37 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2001-07-10 15:00:37 +0000
commit: b08cd201f6dd56800c3a806ca8bcd6503a4588a3 (patch)
tree: a97dc41b3bcb8b06bf159dacfa327e485f1d429c /lib/Unicode
parent: b22b3e312425d0895f989288877de019dda3315f (diff)
download: perl-b08cd201f6dd56800c3a806ca8bcd6503a4588a3.tar.gz
2 files changed, 364 insertions, 130 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index ff819cde1a..cc5d192227 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -11,7 +11,9 @@ our @ISA = qw(Exporter);
 our @EXPORT_OK = qw(charinfo
 		    charblock charscript
 		    charblocks charscripts
-		    charinrange);
+		    charinrange
+		    compexcl
+		    casefold casespec);
 
 use Carp;
 
@@ -22,7 +24,7 @@ Unicode::UCD - Unicode character database
 =head1 SYNOPSIS
 
     use Unicode::UCD 'charinfo';
-    my %charinfo   = charinfo($codepoint);
+    my $charinfo   = charinfo($codepoint);
 
     use Unicode::UCD 'charblock';
     my $charblock  = charblock($codepoint);
@@ -41,6 +43,9 @@ my $UNICODEFH;
 my $BLOCKSFH;
 my $SCRIPTSFH;
 my $VERSIONFH;
+my $COMPEXCLFH;
+my $CASEFOLDFH;
+my $CASESPECFH;
 
 sub openunicode {
     my ($rfh, @path) = @_;
@@ -63,10 +68,10 @@ sub openunicode {
 
     use Unicode::UCD 'charinfo';
 
-    my %charinfo = charinfo(0x41);
+    my $charinfo = charinfo(0x41);
 
-charinfo() returns a hash that has the following fields as defined
-by the Unicode standard:
+charinfo() returns a reference to a hash that has the following fields
+as defined by the Unicode standard:
 
     key
 
@@ -89,7 +94,7 @@ by the Unicode standard:
     block            block the character belongs to (used in \p{In...})
     script           script the character belongs to 
 
-If no match is found, an empty hash is returned.
+If no match is found, a reference to an empty hash is returned.
 
 The C<block> property is the same as as returned by charinfo().  It is
 not defined in the Unicode Character Database proper (Chapter 4 of the
@@ -98,9 +103,7 @@ of TUS3).  Similarly for the C<script> property.
 
 Note that you cannot do (de)composition and casing based solely on the
 above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
-you will need also the I<Composition Exclusions>, I<Case Folding>, and
-I<SpecialCasing> tables, available as files F<CompExcl.txt>,
-F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
+you will need also the compexcl(), casefold(), and casespec() functions.
 
 =cut
 
@@ -140,7 +143,7 @@ sub charinfo {
 	    if ($prop{code} eq $hexk) {
 		$prop{block}  = charblock($code);
 		$prop{script} = charscript($code);
-		return %prop;
+		return \%prop;
 	    }
 	}
     }
@@ -188,9 +191,7 @@ sub charinrange {
 
 With a B<code point argument> charblock() returns the block the character
 belongs to, e.g.  C<Basic Latin>.  Note that not all the character
-positions within all blocks are defined.  A <code point argument>
-is either a decimal or a hexadecimal scalar, or "U+" followed
-by hexadecimals.
+positions within all blocks are defined.
 
 If supplied with an argument that can't be a code point, charblock()
 tries to do the opposite and interpret the argument as a character
@@ -250,9 +251,7 @@ sub charblock {
     my $ranges     = charscript('Thai');
 
 With a B<code point argument> charscript() returns the script the
-character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.  A <code point
-argument> is either a decimal or a hexadecimal scalar, or "U+"
-followed by hexadecimals.
+character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
 
 If supplied with an argument that can't be a code point, charscript()
 tries to do the opposite and interpret the argument as a character
@@ -308,16 +307,16 @@ sub charscript {
 
     use Unicode::UCD 'charblocks';
 
-    my %charblocks = charblocks();
+    my $charblocks = charblocks();
 
-charblocks() returns a hash with the known block names as the keys,
-and the code point ranges (see L</charblock>) as the values.
+charblocks() returns a reference to a hash with the known block names
+as the keys, and the code point ranges (see L</charblock>) as the values.
 
 =cut
 
 sub charblocks {
-    _charblocks() unless @BLOCKS;
-    return %BLOCKS;
+    _charblocks() unless %BLOCKS;
+    return \%BLOCKS;
 }
 
 =head2 charscripts
@@ -332,8 +331,8 @@ and the code point ranges (see L</charscript>) as the values.
 =cut
 
 sub charscripts {
-    _charscripts() unless @SCRIPTS;
-    return %SCRIPTS;
+    _charscripts() unless %SCRIPTS;
+    return \%SCRIPTS;
 }
 
 =head2 Blocks versus Scripts
@@ -368,6 +367,11 @@ There are a few cases where there exists both a script and a block by
 the same name, in these cases the block version has C<Block> appended:
 C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
 
+=head2 Code Point Arguments
+
+A <code point argument> is either a decimal or a hexadecimal scalar,
+or "U+" followed by hexadecimals.
+
 =head2 charinrange
 
 In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
@@ -382,6 +386,191 @@ by L</charblocks> and </charscripts> by using charinrange():
 
 =cut
 
+=head2 compexcl
+
+    use Unicode::UCD 'compexcl';
+
+    my $compexcl = compexcl("09dc");
+
+The compexcl() returns the composition exclusion (that is, if the
+character cannot be decomposed) of the character specified by a B<code
+point argument>.
+
+If there is a composition exclusion for the character, true is
+returned.  Otherwise, false is returned.
+
+=cut
+
+my %COMPEXCL;
+
+sub _compexcl {
+    unless (%COMPEXCL) {
+	if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
+	    while (<$COMPEXCLFH>) {
+		if (/^([0-9A-F]+) \# /) {
+		    my $code = hex($1);
+		    $COMPEXCL{$code} = undef;
+		}
+	    }
+	    close($COMPEXCLFH);
+	}
+    }
+}
+
+sub compexcl {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+
+    _compexcl() unless %COMPEXCL;
+
+    return exists $COMPEXCL{$code};
+}
+
+=head2 casefold
+
+    use Unicode::UCD 'casefold';
+
+    my %casefold = casefold("09dc");
+
+The casefold() returns the locale-independent case folding of the
+character specified by a B<code point argument>.
+
+If there is a case folding for that character, a reference to a hash
+with the following fields is returned:
+
+    key
+
+    code             code point with at least four hexdigits
+    status           "C", "F", "S", or "I"
+    mapping          one or more codes separated by spaces
+
+The meaning of the I<status> is as follows:
+
+   C                 common case folding, common mappings shared
+                     by both simple and full mappings
+   F                 full case folding, mappings that cause strings
+                     to grow in length. Multiple characters are separated
+                     by spaces
+   S                 simple case folding, mappings to single characters
+                     where different from F
+   I                 special case for dotted uppercase I and
+                     dotless lowercase i
+                     - If this mapping is included, the result is
+                       case-insensitive, but dotless and dotted I's
+                       are not distinguished
+                     - If this mapping is excluded, the result is not
+                       fully case-insensitive, but dotless and dotted
+                       I's are distinguished
+
+If there is no case folding for that character, C<undef> is returned.
+
+For more information about case mappings see
+http://www.unicode.org/unicode/reports/tr21/
+
+=cut
+
+my %CASEFOLD;
+
+sub _casefold {
+    unless (%CASEFOLD) {
+	if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
+	    while (<$CASEFOLDFH>) {
+		if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
+		    my $code = hex($1);
+		    $CASEFOLD{$code} = { code    => $1,
+					 status  => $2,
+					 mapping => $3 };
+		}
+	    }
+	    close($CASEFOLDFH);
+	}
+    }
+}
+
+sub casefold {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+
+    _casefold() unless %CASEFOLD;
+
+    return $CASEFOLD{$code};
+}
+
+=head2 casespec
+
+    use Unicode::UCD 'casespec';
+
+    my %casespec = casespec("09dc");
+
+The casespec() returns the potentially locale-dependent case mapping
+of the character specified by a B<code point argument>.  The mapping
+may change the length of the string (which the basic Unicode case
+mappings as returned by charinfo() never do).
+
+If there is a case folding for that character, a reference to a hash
+with the following fields is returned:
+
+    key
+
+    code             code point with at least four hexdigits
+    lower            lowercase
+    title            titlecase
+    upper            uppercase
+    condition        condition list (may be undef)
+
+The C<condition> is optional.  Where present, it consists of one or
+more I<locales> or I<contexts>, separated by spaces (other than as
+used to separate elements, spaces are to be ignored).  A condition
+list overrides the normal behavior if all of the listed conditions are
+true.  Case distinctions in the condition list are not significant.
+Conditions preceded by "NON_" represent the negation of the condition
+
+A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
+followed by a "_" and a 2-letter ISO language code (, possibly followed
+by a "_" and a variant code).  You can find the list of those codes
+in L<Locale::Country> and L<Locale::Language>.
+
+A I<context> is one of the following choices:
+
+    FINAL            The letter is not followed by a letter of
+                     general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
+    MODERN           The mapping is only used for modern text
+    AFTER_i          The last base character was "i" 0069
+
+For more information about case mappings see
+http://www.unicode.org/unicode/reports/tr21/
+
+=cut
+
+my %CASESPEC;
+
+sub _casespec {
+    unless (%CASESPEC) {
+	if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
+	    while (<$CASESPECFH>) {
+		if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
+		    my $code = hex($1);
+		    $CASESPEC{$code} = { code      => $1,
+					 lower     => $2,
+					 title     => $3,
+					 upper     => $4,
+					 condition => $5 };
+		}
+	    }
+	    close($CASESPECFH);
+	}
+    }
+}
+
+sub casespec {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+
+    _casespec() unless %CASESPEC;
+
+    return $CASESPEC{$code};
+}
+
 =head2 Unicode::UCD::UnicodeVersion
 
 Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index 3cd11925b1..6ebea8a1a6 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -3,95 +3,95 @@ use Unicode::UCD;
 use Test;
 use strict;
 
-BEGIN { plan tests => 103 };
+BEGIN { plan tests => 111 };
 
 use Unicode::UCD 'charinfo';
 
-my %charinfo;
-
-%charinfo = charinfo(0x41);
-
-ok($charinfo{code},           '0041');
-ok($charinfo{name},           'LATIN CAPITAL LETTER A');
-ok($charinfo{category},       'Lu');
-ok($charinfo{combining},      '0');
-ok($charinfo{bidi},           'L');
-ok($charinfo{decomposition},  '');
-ok($charinfo{decimal},        '');
-ok($charinfo{digit},          '');
-ok($charinfo{numeric},        '');
-ok($charinfo{mirrored},       'N');
-ok($charinfo{unicode10},      '');
-ok($charinfo{comment},        '');
-ok($charinfo{upper},          '');
-ok($charinfo{lower},          '0061');
-ok($charinfo{title},          '');
-ok($charinfo{block},          'Basic Latin');
-ok($charinfo{script},         'Latin');
-
-%charinfo = charinfo(0x100);
-
-ok($charinfo{code},           '0100');
-ok($charinfo{name},           'LATIN CAPITAL LETTER A WITH MACRON');
-ok($charinfo{category},       'Lu');
-ok($charinfo{combining},      '0');
-ok($charinfo{bidi},           'L');
-ok($charinfo{decomposition},  '0041 0304');
-ok($charinfo{decimal},        '');
-ok($charinfo{digit},          '');
-ok($charinfo{numeric},        '');
-ok($charinfo{mirrored},       'N');
-ok($charinfo{unicode10},      'LATIN CAPITAL LETTER A MACRON');
-ok($charinfo{comment},        '');
-ok($charinfo{upper},          '');
-ok($charinfo{lower},          '0101');
-ok($charinfo{title},          '');
-ok($charinfo{block},          'Latin Extended-A');
-ok($charinfo{script},         'Latin');
+my $charinfo;
+
+$charinfo = charinfo(0x41);
+
+ok($charinfo->{code},           '0041');
+ok($charinfo->{name},           'LATIN CAPITAL LETTER A');
+ok($charinfo->{category},       'Lu');
+ok($charinfo->{combining},      '0');
+ok($charinfo->{bidi},           'L');
+ok($charinfo->{decomposition},  '');
+ok($charinfo->{decimal},        '');
+ok($charinfo->{digit},          '');
+ok($charinfo->{numeric},        '');
+ok($charinfo->{mirrored},       'N');
+ok($charinfo->{unicode10},      '');
+ok($charinfo->{comment},        '');
+ok($charinfo->{upper},          '');
+ok($charinfo->{lower},          '0061');
+ok($charinfo->{title},          '');
+ok($charinfo->{block},          'Basic Latin');
+ok($charinfo->{script},         'Latin');
+
+$charinfo = charinfo(0x100);
+
+ok($charinfo->{code},           '0100');
+ok($charinfo->{name},           'LATIN CAPITAL LETTER A WITH MACRON');
+ok($charinfo->{category},       'Lu');
+ok($charinfo->{combining},      '0');
+ok($charinfo->{bidi},           'L');
+ok($charinfo->{decomposition},  '0041 0304');
+ok($charinfo->{decimal},        '');
+ok($charinfo->{digit},          '');
+ok($charinfo->{numeric},        '');
+ok($charinfo->{mirrored},       'N');
+ok($charinfo->{unicode10},      'LATIN CAPITAL LETTER A MACRON');
+ok($charinfo->{comment},        '');
+ok($charinfo->{upper},          '');
+ok($charinfo->{lower},          '0101');
+ok($charinfo->{title},          '');
+ok($charinfo->{block},          'Latin Extended-A');
+ok($charinfo->{script},         'Latin');
 
 # 0x0590 is in the Hebrew block but unused.
 
-%charinfo = charinfo(0x590);
-
-ok($charinfo{code},          undef);
-ok($charinfo{name},          undef);
-ok($charinfo{category},      undef);
-ok($charinfo{combining},     undef);
-ok($charinfo{bidi},          undef);
-ok($charinfo{decomposition}, undef);
-ok($charinfo{decimal},       undef);
-ok($charinfo{digit},         undef);
-ok($charinfo{numeric},       undef);
-ok($charinfo{mirrored},      undef);
-ok($charinfo{unicode10},     undef);
-ok($charinfo{comment},       undef);
-ok($charinfo{upper},         undef);
-ok($charinfo{lower},         undef);
-ok($charinfo{title},         undef);
-ok($charinfo{block},         undef);
-ok($charinfo{script},        undef);
+$charinfo = charinfo(0x590);
+
+ok($charinfo->{code},          undef);
+ok($charinfo->{name},          undef);
+ok($charinfo->{category},      undef);
+ok($charinfo->{combining},     undef);
+ok($charinfo->{bidi},          undef);
+ok($charinfo->{decomposition}, undef);
+ok($charinfo->{decimal},       undef);
+ok($charinfo->{digit},         undef);
+ok($charinfo->{numeric},       undef);
+ok($charinfo->{mirrored},      undef);
+ok($charinfo->{unicode10},     undef);
+ok($charinfo->{comment},       undef);
+ok($charinfo->{upper},         undef);
+ok($charinfo->{lower},         undef);
+ok($charinfo->{title},         undef);
+ok($charinfo->{block},         undef);
+ok($charinfo->{script},        undef);
 
 # 0x05d0 is in the Hebrew block and used.
 
-%charinfo = charinfo(0x5d0);
-
-ok($charinfo{code},           '05D0');
-ok($charinfo{name},           'HEBREW LETTER ALEF');
-ok($charinfo{category},       'Lo');
-ok($charinfo{combining},      '0');
-ok($charinfo{bidi},           'R');
-ok($charinfo{decomposition},  '');
-ok($charinfo{decimal},        '');
-ok($charinfo{digit},          '');
-ok($charinfo{numeric},        '');
-ok($charinfo{mirrored},       'N');
-ok($charinfo{unicode10},      '');
-ok($charinfo{comment},        '');
-ok($charinfo{upper},          '');
-ok($charinfo{lower},          '');
-ok($charinfo{title},          '');
-ok($charinfo{block},          'Hebrew');
-ok($charinfo{script},         'Hebrew');
+$charinfo = charinfo(0x5d0);
+
+ok($charinfo->{code},           '05D0');
+ok($charinfo->{name},           'HEBREW LETTER ALEF');
+ok($charinfo->{category},       'Lo');
+ok($charinfo->{combining},      '0');
+ok($charinfo->{bidi},           'R');
+ok($charinfo->{decomposition},  '');
+ok($charinfo->{decimal},        '');
+ok($charinfo->{digit},          '');
+ok($charinfo->{numeric},        '');
+ok($charinfo->{mirrored},       'N');
+ok($charinfo->{unicode10},      '');
+ok($charinfo->{comment},        '');
+ok($charinfo->{upper},          '');
+ok($charinfo->{lower},          '');
+ok($charinfo->{title},          '');
+ok($charinfo->{block},          'Hebrew');
+ok($charinfo->{script},         'Hebrew');
 
 use Unicode::UCD qw(charblock charscript);
 
@@ -100,39 +100,39 @@ use Unicode::UCD qw(charblock charscript);
 ok(charblock(0x590),          'Hebrew');
 ok(charscript(0x590),         undef);
 
-%charinfo = charinfo(0xbe);
-
-ok($charinfo{code},           '00BE');
-ok($charinfo{name},           'VULGAR FRACTION THREE QUARTERS');
-ok($charinfo{category},       'No');
-ok($charinfo{combining},      '0');
-ok($charinfo{bidi},           'ON');
-ok($charinfo{decomposition},  '<fraction> 0033 2044 0034');
-ok($charinfo{decimal},        '');
-ok($charinfo{digit},          '');
-ok($charinfo{numeric},        '3/4');
-ok($charinfo{mirrored},       'N');
-ok($charinfo{unicode10},      'FRACTION THREE QUARTERS');
-ok($charinfo{comment},        '');
-ok($charinfo{upper},          '');
-ok($charinfo{lower},          '');
-ok($charinfo{title},          '');
-ok($charinfo{block},          'Latin-1 Supplement');
-ok($charinfo{script},         undef);
+$charinfo = charinfo(0xbe);
+
+ok($charinfo->{code},           '00BE');
+ok($charinfo->{name},           'VULGAR FRACTION THREE QUARTERS');
+ok($charinfo->{category},       'No');
+ok($charinfo->{combining},      '0');
+ok($charinfo->{bidi},           'ON');
+ok($charinfo->{decomposition},  '<fraction> 0033 2044 0034');
+ok($charinfo->{decimal},        '');
+ok($charinfo->{digit},          '');
+ok($charinfo->{numeric},        '3/4');
+ok($charinfo->{mirrored},       'N');
+ok($charinfo->{unicode10},      'FRACTION THREE QUARTERS');
+ok($charinfo->{comment},        '');
+ok($charinfo->{upper},          '');
+ok($charinfo->{lower},          '');
+ok($charinfo->{title},          '');
+ok($charinfo->{block},          'Latin-1 Supplement');
+ok($charinfo->{script},         undef);
 
 use Unicode::UCD qw(charblocks charscripts);
 
-my %charblocks = charblocks();
+my $charblocks = charblocks();
 
-ok(exists $charblocks{Thai});
-ok($charblocks{Thai}->[0]->[0], hex('0e00'));
-ok(!exists $charblocks{PigLatin});
+ok(exists $charblocks->{Thai});
+ok($charblocks->{Thai}->[0]->[0], hex('0e00'));
+ok(!exists $charblocks->{PigLatin});
 
-my %charscripts = charscripts();
+my $charscripts = charscripts();
 
-ok(exists $charscripts{Armenian});
-ok($charscripts{Armenian}->[0]->[0], hex('0531'));
-ok(!exists $charscripts{PigLatin});
+ok(exists $charscripts->{Armenian});
+ok($charscripts->{Armenian}->[0]->[0], hex('0531'));
+ok(!exists $charscripts->{PigLatin});
 
 my $charscript;
 
@@ -160,3 +160,48 @@ ok( charinrange($ranges, "13f4"));
 ok(!charinrange($ranges, "13f5"));
 
 ok(Unicode::UCD::UnicodeVersion, 3.1);
+
+use Unicode::UCD qw(compexcl);
+
+ok(!compexcl(0x0100));
+ok( compexcl(0x0958));
+
+use Unicode::UCD qw(casefold);
+
+my $casefold;
+
+$casefold = casefold(0x41);
+
+ok($casefold->{code} eq '0041' &&
+   $casefold->{status} eq 'C'  &&
+   $casefold->{mapping} eq '0061');
+
+$casefold = casefold(0xdf);
+
+ok($casefold->{code} eq '00DF' &&
+   $casefold->{status} eq 'F'  &&
+   $casefold->{mapping} eq '0073 0073');
+
+ok(!casefold(0x20));
+
+use Unicode::UCD qw(casespec);
+
+my $casespec;
+
+ok(!casespec(0x41));
+
+$casespec = casespec(0xdf);
+
+ok($casespec->{code} eq '00DF' &&
+   $casespec->{lower} eq '00DF'  &&
+   $casespec->{title} eq '0053 0073'  &&
+   $casespec->{upper} eq '0053 0053' &&
+   $casespec->{condition} eq undef);
+
+$casespec = casespec(0x307);
+
+ok($casespec->{code} eq '0307' &&
+   $casespec->{lower} eq '0307'  &&
+   $casespec->{title} eq ''  &&
+   $casespec->{upper} eq '' &&
+   $casespec->{condition} eq 'lt AFTER_i');
author	Jarkko Hietaniemi <jhi@iki.fi>	2001-07-10 15:00:37 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	2001-07-10 15:00:37 +0000
commit	b08cd201f6dd56800c3a806ca8bcd6503a4588a3 (patch)
tree	a97dc41b3bcb8b06bf159dacfa327e485f1d429c /lib/Unicode
parent	b22b3e312425d0895f989288877de019dda3315f (diff)
download	perl-b08cd201f6dd56800c3a806ca8bcd6503a4588a3.tar.gz