Unicode 4.1.0

Message-ID: <424E584D.5000508@iki.fi> Date: Sat, 02 Apr 2005 11:31:09 +0300 p4raw-id: //depot/perl@24134
author: Jarkko Hietaniemi <jhi@iki.fi> 2005-04-02 14:31:09 +0300
committer: Nicholas Clark <nick@ccl4.org> 2005-04-02 18:51:27 +0000
commit: a2bd7410d0183a5974c36ff0549fab8692d0713b (patch)
tree: ec51ae454884c4f2a00b6a329242a5ce914b2cd5 /lib/Unicode
parent: f366f7b38b027e6aab7ac71e1c1715678bda65b1 (diff)
download: perl-a2bd7410d0183a5974c36ff0549fab8692d0713b.tar.gz
2 files changed, 85 insertions, 7 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index dfdd2dcb51..54c07e7947 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -3,7 +3,7 @@ package Unicode::UCD;
 use strict;
 use warnings;
 
-our $VERSION = '0.22';
+our $VERSION = '0.23';
 
 use Storable qw(dclone);
 
@@ -16,7 +16,8 @@ our @EXPORT_OK = qw(charinfo
 		    charblocks charscripts
 		    charinrange
 		    compexcl
-		    casefold casespec);
+		    casefold casespec
+		    namedseq);
 
 use Carp;
 
@@ -48,6 +49,9 @@ Unicode::UCD - Unicode character database
     use Unicode::UCD 'compexcl';
     my $compexcl = compexcl($codepoint);
 
+    use Unicode::UCD 'namedseq';
+    my $namedseq = namedseq($named_sequence_name);
+
     my $unicode_version = Unicode::UCD::UnicodeVersion();
 
 =head1 DESCRIPTION
@@ -64,6 +68,7 @@ my $VERSIONFH;
 my $COMPEXCLFH;
 my $CASEFOLDFH;
 my $CASESPECFH;
+my $NAMEDSEQFH;
 
 sub openunicode {
     my ($rfh, @path) = @_;
@@ -287,8 +292,8 @@ See also L</Blocks versus Scripts>.
 If supplied with an argument that can't be a code point, charblock() tries
 to do the opposite and interpret the argument as a character block. The
 return value is a I<range>: an anonymous list of lists that contain
-I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
-code point is in a range using the L</charinrange> function. If the
+I<start-of-range>, I<end-of-range> code point pairs. You can test whether
+a code point is in a range using the L</charinrange> function. If the
 argument is not a known charater block, C<undef> is returned.
 
 =cut
@@ -716,6 +721,63 @@ sub casespec {
     return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
 }
 
+=head2 namedseq()
+
+    use Unicode::UCD 'namedseq';
+
+    my $namedseq = namedseq("KATAKANA LETTER AINU P");
+    my @namedseq = namedseq("KATAKANA LETTER AINU P");
+    my %namedseq = namedseq();
+
+If used with a single argument in a scalar context, returns the string
+consisting of the code points of the named sequence, or C<undef> if no
+named sequence by that name exists.  If used with a single argument in
+a list context, returns list of the code points.  If used with no
+arguments in a list context, returns a hash with the names of the
+named sequences as the keys and the named sequences as strings as
+the values.  Otherwise, returns C<undef> or empty list depending
+on the context.
+
+(New from Unicode 4.1.0)
+
+=cut
+
+my %NAMEDSEQ;
+
+sub _namedseq {
+    unless (%NAMEDSEQ) {
+	if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
+	    local $_;
+	    while (<$NAMEDSEQFH>) {
+		if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
+		    my ($n, $s) = ($1, $2);
+		    my @s = map { chr(hex($_)) } split(' ', $s);
+		    $NAMEDSEQ{$n} = join("", @s);
+		}
+	    }
+	    close($NAMEDSEQFH);
+	}
+    }
+}
+
+sub namedseq {
+    _namedseq() unless %NAMEDSEQ;
+    my $wantarray = wantarray();
+    if (defined $wantarray) {
+	if ($wantarray) {
+	    if (@_ == 0) {
+		return %NAMEDSEQ;
+	    } elsif (@_ == 1) {
+		my $s = $NAMEDSEQ{ $_[0] };
+		return defined $s ? map { ord($_) } split('', $s) : ();
+	    }
+	} elsif (@_ == 1) {
+	    return $NAMEDSEQ{ $_[0] };
+	}
+    }
+    return;
+}
+
 =head2 Unicode::UCD::UnicodeVersion
 
 Unicode::UCD::UnicodeVersion() returns the version of the Unicode
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index c4046bcdca..c9903e5743 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -18,7 +18,7 @@ use strict;
 use Unicode::UCD;
 use Test::More;
 
-BEGIN { plan tests => 179 };
+BEGIN { plan tests => 188 };
 
 use Unicode::UCD 'charinfo';
 
@@ -238,7 +238,7 @@ ok( charinrange($ranges, "13a0"));
 ok( charinrange($ranges, "13f4"));
 ok(!charinrange($ranges, "13f5"));
 
-is(Unicode::UCD::UnicodeVersion, '4.0.1', 'UnicodeVersion');
+is(Unicode::UCD::UnicodeVersion, '4.1.0', 'UnicodeVersion');
 
 use Unicode::UCD qw(compexcl);
 
@@ -309,7 +309,7 @@ is(Unicode::UCD::_getcode('U+123x'),  undef, "_getcode(x123)");
 {
     my $r1 = charscript('Latin');
     my $n1 = @$r1;
-    is($n1, 26, "26 ranges in Latin script (Unicode 4.0.0)");
+    is($n1, 29, "29 ranges in Latin script (Unicode 4.1.0)");
     shift @$r1 while @$r1;
     my $r2 = charscript('Latin');
     is(@$r2, $n1, "modifying results should not mess up internal caches");
@@ -318,3 +318,19 @@ is(Unicode::UCD::_getcode('U+123x'),  undef, "_getcode(x123)");
 {
 	is(charinfo(0xdeadbeef), undef, "[perl #23273] warnings in Unicode::UCD");
 }
+
+use Unicode::UCD qw(namedseq);
+
+is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq");
+is(namedseq("KATAKANA LETTER AINU Q"), undef);
+is(namedseq(), undef);
+is(namedseq(qw(foo bar)), undef);
+my @ns = namedseq("KATAKANA LETTER AINU P");
+is(scalar @ns, 2);
+is($ns[0], 0x31F7);
+is($ns[1], 0x309A);
+my %ns = namedseq();
+is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}");
+@ns = namedseq(42);
+is(@ns, 0);
+
author	Jarkko Hietaniemi <jhi@iki.fi>	2005-04-02 14:31:09 +0300
committer	Nicholas Clark <nick@ccl4.org>	2005-04-02 18:51:27 +0000
commit	a2bd7410d0183a5974c36ff0549fab8692d0713b (patch)
tree	ec51ae454884c4f2a00b6a329242a5ce914b2cd5 /lib/Unicode
parent	f366f7b38b027e6aab7ac71e1c1715678bda65b1 (diff)
download	perl-a2bd7410d0183a5974c36ff0549fab8692d0713b.tar.gz