summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2001-07-05 19:33:34 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2001-07-05 19:33:34 +0000
commit10a6ecd25e80ad20ebf67b311125411d51e78bc0 (patch)
tree2c2cd3bb425c933678308e2004833ef2459e8b60
parent3aa957f9c7dbe37b7f2fe946b886b63a07d35ac7 (diff)
downloadperl-10a6ecd25e80ad20ebf67b311125411d51e78bc0.tar.gz
More flexible argument understanding; add charblocks() and
charscripts(); make charblock() and charscript() two-way; add charinrange(); separate the $Unicode::UCD::VERSION and the version of the Unicode by adding UnicodeVersion(). p4raw-id: //depot/perl@11163
-rw-r--r--lib/Unicode/UCD.pm251
-rw-r--r--lib/Unicode/UCD.t47
2 files changed, 249 insertions, 49 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index cc7adaeec9..ff819cde1a 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -3,12 +3,15 @@ package Unicode::UCD;
use strict;
use warnings;
-our $VERSION = '3.1.0';
+our $VERSION = '0.1';
require Exporter;
our @ISA = qw(Exporter);
-our @EXPORT_OK = qw(charinfo charblock charscript);
+our @EXPORT_OK = qw(charinfo
+ charblock charscript
+ charblocks charscripts
+ charinrange);
use Carp;
@@ -18,9 +21,6 @@ Unicode::UCD - Unicode character database
=head1 SYNOPSIS
- use Unicode::UCD 3.1.0;
- # requires that level of the Unicode character database
-
use Unicode::UCD 'charinfo';
my %charinfo = charinfo($codepoint);
@@ -37,9 +37,10 @@ Database.
=cut
-my $UNICODE;
-my $BLOCKS;
-my $SCRIPTS;
+my $UNICODEFH;
+my $BLOCKSFH;
+my $SCRIPTSFH;
+my $VERSIONFH;
sub openunicode {
my ($rfh, @path) = @_;
@@ -103,15 +104,30 @@ F<CaseFold.txt>, and F<SpecCase.txt> in the Perl distribution.
=cut
+sub _getcode {
+ my $arg = shift;
+
+ if ($arg =~ /^\d+$/) {
+ return $arg;
+ } elsif ($arg =~ /^(?:U\+|0x)?([[:xdigit:]]+)$/) {
+ return hex($1);
+ }
+
+ return;
+}
+
sub charinfo {
- my $code = shift;
+ my $arg = shift;
+ my $code = _getcode($arg);
+ croak __PACKAGE__, "::charinfo: unknown code '$arg'"
+ unless defined $code;
my $hexk = sprintf("%04X", $code);
- openunicode(\$UNICODE, "Unicode.txt");
- if (defined $UNICODE) {
+ openunicode(\$UNICODEFH, "Unicode.txt");
+ if (defined $UNICODEFH) {
use Search::Dict;
- if (look($UNICODE, "$hexk;") >= 0) {
- my $line = <$UNICODE>;
+ if (look($UNICODEFH, "$hexk;") >= 0) {
+ my $line = <$UNICODEFH>;
chomp $line;
my %prop;
@prop{qw(
@@ -139,7 +155,7 @@ sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
my $mid = int(($lo+$hi) / 2);
if ($table->[$mid]->[0] < $code) {
- if (defined $table->[$mid]->[1] && $table->[$mid]->[1] >= $code) {
+ if ($table->[$mid]->[1] >= $code) {
return $table->[$mid]->[2];
} else {
_search($table, $mid + 1, $hi, $code);
@@ -151,35 +167,76 @@ sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
}
}
+sub charinrange {
+ my ($range, $arg) = @_;
+ my $code = _getcode($arg);
+ croak __PACKAGE__, "::charinrange: unknown code '$arg'"
+ unless defined $code;
+ _search($range, 0, $#$range, $code);
+}
+
=head2 charblock
use Unicode::UCD 'charblock';
my $charblock = charblock(0x41);
-
-charblock() returns the block the character belongs to, e.g.
-C<Basic Latin>. Note that not all the character positions within all
-blocks are defined.
+ my $charblock = charblock(1234);
+ my $charblock = charblock("0x263a");
+ my $charblock = charblock("U+263a");
+
+ my $ranges = charblock('Armenian');
+
+With a B<code point argument> charblock() returns the block the character
+belongs to, e.g. C<Basic Latin>. Note that not all the character
+positions within all blocks are defined. A <code point argument>
+is either a decimal or a hexadecimal scalar, or "U+" followed
+by hexadecimals.
+
+If supplied with an argument that can't be a code point, charblock()
+tries to do the opposite and interpret the argument as a character
+block. The return value is a I<range>: an anonymous list that
+contains anonymous lists, which in turn contain I<start-of-range>,
+I<end-of-range> code point pairs. You can test whether a code point
+is in a range using the L</charinrange> function. If the argument is
+not a known charater block, C<undef> is returned.
=cut
my @BLOCKS;
+my %BLOCKS;
-sub charblock {
- my $code = shift;
-
+sub _charblocks {
unless (@BLOCKS) {
- if (openunicode(\$BLOCKS, "Blocks.txt")) {
- while (<$BLOCKS>) {
+ if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
+ while (<$BLOCKSFH>) {
if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
- push @BLOCKS, [ hex($1), hex($2), $3 ];
+ my ($lo, $hi) = (hex($1), hex($2));
+ my $subrange = [ $lo, $hi, $3 ];
+ push @BLOCKS, $subrange;
+ push @{$BLOCKS{$3}}, $subrange;
}
}
- close($BLOCKS);
+ close($BLOCKSFH);
}
}
+}
+
+sub charblock {
+ my $arg = shift;
+
+ _charblocks() unless @BLOCKS;
+
+ my $code = _getcode($arg);
- _search(\@BLOCKS, 0, $#BLOCKS, $code);
+ if (defined $code) {
+ _search(\@BLOCKS, 0, $#BLOCKS, $code);
+ } else {
+ if (exists $BLOCKS{$arg}) {
+ return $BLOCKS{$arg};
+ } else {
+ return;
+ }
+ }
}
=head2 charscript
@@ -187,38 +244,104 @@ sub charblock {
use Unicode::UCD 'charscript';
my $charscript = charscript(0x41);
+ my $charscript = charscript(1234);
+ my $charscript = charscript("U+263a");
-charscript() returns the script the character belongs to, e.g.
-C<Latin>, C<Greek>, C<Han>.
+ my $ranges = charscript('Thai');
+
+With a B<code point argument> charscript() returns the script the
+character belongs to, e.g. C<Latin>, C<Greek>, C<Han>. A <code point
+argument> is either a decimal or a hexadecimal scalar, or "U+"
+followed by hexadecimals.
+
+If supplied with an argument that can't be a code point, charscript()
+tries to do the opposite and interpret the argument as a character
+script. The return value is a I<range>: an anonymous list that
+contains anonymous lists, which in turn contain I<start-of-range>,
+I<end-of-range> code point pairs. You can test whether a code point
+is in a range using the L</charinrange> function. If the argument is
+not a known charater script, C<undef> is returned.
=cut
my @SCRIPTS;
+my %SCRIPTS;
-sub charscript {
- my $code = shift;
-
+sub _charscripts {
unless (@SCRIPTS) {
- if (openunicode(\$SCRIPTS, "Scripts.txt")) {
- while (<$SCRIPTS>) {
+ if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
+ while (<$SCRIPTSFH>) {
if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
- push @SCRIPTS, [ hex($1), $2 ? hex($2) : undef, $3 ];
+ my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
+ my $script = lc($3);
+ $script =~ s/\b(\w)/uc($1)/ge;
+ my $subrange = [ $lo, $hi, $script ];
+ push @SCRIPTS, $subrange;
+ push @{$SCRIPTS{$script}}, $subrange;
}
}
- close($SCRIPTS);
+ close($SCRIPTSFH);
@SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
}
}
+}
+
+sub charscript {
+ my $arg = shift;
+
+ _charscripts() unless @SCRIPTS;
- _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
+ my $code = _getcode($arg);
+
+ if (defined $code) {
+ _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
+ } else {
+ if (exists $SCRIPTS{$arg}) {
+ return $SCRIPTS{$arg};
+ } else {
+ return;
+ }
+ }
+}
+
+=head2 charblocks
+
+ use Unicode::UCD 'charblocks';
+
+ my %charblocks = charblocks();
+
+charblocks() returns a hash with the known block names as the keys,
+and the code point ranges (see L</charblock>) as the values.
+
+=cut
+
+sub charblocks {
+ _charblocks() unless @BLOCKS;
+ return %BLOCKS;
+}
+
+=head2 charscripts
+
+ use Unicode::UCD 'charscripts';
+
+ my %charscripts = charscripts();
+
+charscripts() returns a hash with the known script names as the keys,
+and the code point ranges (see L</charscript>) as the values.
+
+=cut
+
+sub charscripts {
+ _charscripts() unless @SCRIPTS;
+ return %SCRIPTS;
}
-=head2 charblock versus charscript
+=head2 Blocks versus Scripts
-The difference between a character block and a script is that scripts
-are closer to the linguistic notion of a set of characters required to
-present languages, while block is more of an artifact of the Unicode
-character numbering and separation into blocks of 256 characters.
+The difference between a block and a script is that scripts are closer
+to the linguistic notion of a set of characters required to present
+languages, while block is more of an artifact of the Unicode character
+numbering and separation into blocks of 256 characters.
For example the Latin B<script> is spread over several B<blocks>, such
as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
@@ -238,11 +361,47 @@ construct C<\p{In...}> and its negation C<\P{In...}>.
The name of the script or the block comes after the C<In>, for example
C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
-squished away from the names for the C<\p{In...}>, for example
-C<LatinExtendedA> instead of C<Latin Extended-A>. There are a few
-cases where there exists both a script and a block by the same name,
-in these cases the block version has C<Block> appended: C<\p{InKatakana}>
-is the script, C<\p{InKatakanaBlock}> is the block.
+removed from the names for the C<\p{In...}>, for example
+C<LatinExtendedA> instead of C<Latin Extended-A>.
+
+There are a few cases where there exists both a script and a block by
+the same name, in these cases the block version has C<Block> appended:
+C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is the block.
+
+=head2 charinrange
+
+In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
+can also test whether a code point is in the I<range> as returned by
+L</charblock> and L</charscript> or as the values of the hash returned
+by L</charblocks> and </charscripts> by using charinrange():
+
+ use Unicode::UCD qw(charscript charinrange);
+
+ $range = charscript('Hiragana');
+ print "looks like hiragana\n" if charinrange($range, $code);
+
+=cut
+
+=head2 Unicode::UCD::UnicodeVersion
+
+Unicode::UCD::UnicodeVersion() returns the version of the Unicode Character
+Database, in other words, the version of the Unicode standard the
+database implements.
+
+=cut
+
+my $UNICODEVERSION;
+
+sub UnicodeVersion {
+ unless (defined $UNICODEVERSION) {
+ openunicode(\$VERSIONFH, "version");
+ chomp($UNICODEVERSION = <$VERSIONFH>);
+ close($VERSIONFH);
+ croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
+ unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
+ }
+ return $UNICODEVERSION;
+}
=head2 Implementation Note
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index 42c9a90d3e..51e200b56c 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -1,9 +1,9 @@
-use Unicode::UCD 3.1.0;
+use Unicode::UCD;
use Test;
use strict;
-BEGIN { plan tests => 87 };
+BEGIN { plan tests => 103 };
use Unicode::UCD 'charinfo';
@@ -91,7 +91,7 @@ ok($charinfo{upper}, '');
ok($charinfo{lower}, '');
ok($charinfo{title}, '');
ok($charinfo{block}, 'Hebrew');
-ok($charinfo{script}, 'HEBREW');
+ok($charinfo{script}, 'Hebrew');
use Unicode::UCD qw(charblock charscript);
@@ -119,3 +119,44 @@ ok($charinfo{lower}, '');
ok($charinfo{title}, '');
ok($charinfo{block}, 'Latin-1 Supplement');
ok($charinfo{script}, undef);
+
+use Unicode::UCD qw(charblocks charscripts);
+
+my %charblocks = charblocks();
+
+ok(exists $charblocks{Thai});
+ok($charblocks{Thai}->[0]->[0], hex('0e00'));
+ok(!exists $charblocks{PigLatin});
+
+my %charscripts = charscripts();
+
+ok(exists $charscripts{Armenian});
+ok($charscripts{Armenian}->[0]->[0], hex('0531'));
+ok(!exists $charscripts{PigLatin});
+
+my $charscript;
+
+$charscript = charscript("12ab");
+ok($charscript, 'Ethiopic');
+
+$charscript = charscript("0x12ab");
+ok($charscript, 'Ethiopic');
+
+$charscript = charscript("U+12ab");
+ok($charscript, 'Ethiopic');
+
+my $ranges;
+
+$ranges = charscript('Ogham');
+ok($ranges->[0]->[0], hex('1681'));
+ok($ranges->[0]->[1], hex('169a'));
+
+use Unicode::UCD qw(charinrange);
+
+$ranges = charscript('Cherokee');
+ok(!charinrange($ranges, "139f"));
+ok( charinrange($ranges, "13a0"));
+ok( charinrange($ranges, "13f4"));
+ok(!charinrange($ranges, "13f5"));
+
+ok(Unicode::UCD::UnicodeVersion, 3.1);