diff options
Diffstat (limited to 'lib/Unicode/UCD.pm')
-rw-r--r-- | lib/Unicode/UCD.pm | 70 |
1 files changed, 66 insertions, 4 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index dfdd2dcb51..54c07e7947 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -3,7 +3,7 @@ package Unicode::UCD; use strict; use warnings; -our $VERSION = '0.22'; +our $VERSION = '0.23'; use Storable qw(dclone); @@ -16,7 +16,8 @@ our @EXPORT_OK = qw(charinfo charblocks charscripts charinrange compexcl - casefold casespec); + casefold casespec + namedseq); use Carp; @@ -48,6 +49,9 @@ Unicode::UCD - Unicode character database use Unicode::UCD 'compexcl'; my $compexcl = compexcl($codepoint); + use Unicode::UCD 'namedseq'; + my $namedseq = namedseq($named_sequence_name); + my $unicode_version = Unicode::UCD::UnicodeVersion(); =head1 DESCRIPTION @@ -64,6 +68,7 @@ my $VERSIONFH; my $COMPEXCLFH; my $CASEFOLDFH; my $CASESPECFH; +my $NAMEDSEQFH; sub openunicode { my ($rfh, @path) = @_; @@ -287,8 +292,8 @@ See also L</Blocks versus Scripts>. If supplied with an argument that can't be a code point, charblock() tries to do the opposite and interpret the argument as a character block. The return value is a I<range>: an anonymous list of lists that contain -I<start-of-range>, I<end-of-range> code point pairs. You can test whether a -code point is in a range using the L</charinrange> function. If the +I<start-of-range>, I<end-of-range> code point pairs. You can test whether +a code point is in a range using the L</charinrange> function. If the argument is not a known charater block, C<undef> is returned. =cut @@ -716,6 +721,63 @@ sub casespec { return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code}; } +=head2 namedseq() + + use Unicode::UCD 'namedseq'; + + my $namedseq = namedseq("KATAKANA LETTER AINU P"); + my @namedseq = namedseq("KATAKANA LETTER AINU P"); + my %namedseq = namedseq(); + +If used with a single argument in a scalar context, returns the string +consisting of the code points of the named sequence, or C<undef> if no +named sequence by that name exists. If used with a single argument in +a list context, returns list of the code points. If used with no +arguments in a list context, returns a hash with the names of the +named sequences as the keys and the named sequences as strings as +the values. Otherwise, returns C<undef> or empty list depending +on the context. + +(New from Unicode 4.1.0) + +=cut + +my %NAMEDSEQ; + +sub _namedseq { + unless (%NAMEDSEQ) { + if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) { + local $_; + while (<$NAMEDSEQFH>) { + if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) { + my ($n, $s) = ($1, $2); + my @s = map { chr(hex($_)) } split(' ', $s); + $NAMEDSEQ{$n} = join("", @s); + } + } + close($NAMEDSEQFH); + } + } +} + +sub namedseq { + _namedseq() unless %NAMEDSEQ; + my $wantarray = wantarray(); + if (defined $wantarray) { + if ($wantarray) { + if (@_ == 0) { + return %NAMEDSEQ; + } elsif (@_ == 1) { + my $s = $NAMEDSEQ{ $_[0] }; + return defined $s ? map { ord($_) } split('', $s) : (); + } + } elsif (@_ == 1) { + return $NAMEDSEQ{ $_[0] }; + } + } + return; +} + =head2 Unicode::UCD::UnicodeVersion Unicode::UCD::UnicodeVersion() returns the version of the Unicode |