diff options
Diffstat (limited to 'lib/Unicode')
-rw-r--r-- | lib/Unicode/UCD.pm | 70 | ||||
-rw-r--r-- | lib/Unicode/UCD.t | 22 |
2 files changed, 85 insertions, 7 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index dfdd2dcb51..54c07e7947 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -3,7 +3,7 @@ package Unicode::UCD; use strict; use warnings; -our $VERSION = '0.22'; +our $VERSION = '0.23'; use Storable qw(dclone); @@ -16,7 +16,8 @@ our @EXPORT_OK = qw(charinfo charblocks charscripts charinrange compexcl - casefold casespec); + casefold casespec + namedseq); use Carp; @@ -48,6 +49,9 @@ Unicode::UCD - Unicode character database use Unicode::UCD 'compexcl'; my $compexcl = compexcl($codepoint); + use Unicode::UCD 'namedseq'; + my $namedseq = namedseq($named_sequence_name); + my $unicode_version = Unicode::UCD::UnicodeVersion(); =head1 DESCRIPTION @@ -64,6 +68,7 @@ my $VERSIONFH; my $COMPEXCLFH; my $CASEFOLDFH; my $CASESPECFH; +my $NAMEDSEQFH; sub openunicode { my ($rfh, @path) = @_; @@ -287,8 +292,8 @@ See also L</Blocks versus Scripts>. If supplied with an argument that can't be a code point, charblock() tries to do the opposite and interpret the argument as a character block. The return value is a I<range>: an anonymous list of lists that contain -I<start-of-range>, I<end-of-range> code point pairs. You can test whether a -code point is in a range using the L</charinrange> function. If the +I<start-of-range>, I<end-of-range> code point pairs. You can test whether +a code point is in a range using the L</charinrange> function. If the argument is not a known charater block, C<undef> is returned. =cut @@ -716,6 +721,63 @@ sub casespec { return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; } +=head2 namedseq() + + use Unicode::UCD 'namedseq'; + + my $namedseq = namedseq("KATAKANA LETTER AINU P"); + my @namedseq = namedseq("KATAKANA LETTER AINU P"); + my %namedseq = namedseq(); + +If used with a single argument in a scalar context, returns the string +consisting of the code points of the named sequence, or C<undef> if no +named sequence by that name exists. If used with a single argument in +a list context, returns list of the code points. If used with no +arguments in a list context, returns a hash with the names of the +named sequences as the keys and the named sequences as strings as +the values. Otherwise, returns C<undef> or empty list depending +on the context. + +(New from Unicode 4.1.0) + +=cut + +my %NAMEDSEQ; + +sub _namedseq { + unless (%NAMEDSEQ) { + if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) { + local $_; + while (<$NAMEDSEQFH>) { + if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) { + my ($n, $s) = ($1, $2); + my @s = map { chr(hex($_)) } split(' ', $s); + $NAMEDSEQ{$n} = join("", @s); + } + } + close($NAMEDSEQFH); + } + } +} + +sub namedseq { + _namedseq() unless %NAMEDSEQ; + my $wantarray = wantarray(); + if (defined $wantarray) { + if ($wantarray) { + if (@_ == 0) { + return %NAMEDSEQ; + } elsif (@_ == 1) { + my $s = $NAMEDSEQ{ $_[0] }; + return defined $s ? map { ord($_) } split('', $s) : (); + } + } elsif (@_ == 1) { + return $NAMEDSEQ{ $_[0] }; + } + } + return; +} + =head2 Unicode::UCD::UnicodeVersion Unicode::UCD::UnicodeVersion() returns the version of the Unicode diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index c4046bcdca..c9903e5743 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -18,7 +18,7 @@ use strict; use Unicode::UCD; use Test::More; -BEGIN { plan tests => 179 }; +BEGIN { plan tests => 188 }; use Unicode::UCD 'charinfo'; @@ -238,7 +238,7 @@ ok( charinrange($ranges, "13a0")); ok( charinrange($ranges, "13f4")); ok(!charinrange($ranges, "13f5")); -is(Unicode::UCD::UnicodeVersion, '4.0.1', 'UnicodeVersion'); +is(Unicode::UCD::UnicodeVersion, '4.1.0', 'UnicodeVersion'); use Unicode::UCD qw(compexcl); @@ -309,7 +309,7 @@ is(Unicode::UCD::_getcode('U+123x'), undef, "_getcode(x123)"); { my $r1 = charscript('Latin'); my $n1 = @$r1; - is($n1, 26, "26 ranges in Latin script (Unicode 4.0.0)"); + is($n1, 29, "29 ranges in Latin script (Unicode 4.1.0)"); shift @$r1 while @$r1; my $r2 = charscript('Latin'); is(@$r2, $n1, "modifying results should not mess up internal caches"); @@ -318,3 +318,19 @@ is(Unicode::UCD::_getcode('U+123x'), undef, "_getcode(x123)"); { is(charinfo(0xdeadbeef), undef, "[perl #23273] warnings in Unicode::UCD"); } + +use Unicode::UCD qw(namedseq); + +is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq"); +is(namedseq("KATAKANA LETTER AINU Q"), undef); +is(namedseq(), undef); +is(namedseq(qw(foo bar)), undef); +my @ns = namedseq("KATAKANA LETTER AINU P"); +is(scalar @ns, 2); +is($ns[0], 0x31F7); +is($ns[1], 0x309A); +my %ns = namedseq(); +is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}"); +@ns = namedseq(42); +is(@ns, 0); + |