summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2001-06-30 15:53:22 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2001-06-30 15:53:22 +0000
commit561c79edc84c161bc73be7aa0d10cde362797d7d (patch)
tree9f2ee9adfc42a10c18b55d9b8fb5a803eb16d3e5
parent3e93e9fb59a74787b4bfe55b8e0ce5b648ff8d52 (diff)
downloadperl-561c79edc84c161bc73be7aa0d10cde362797d7d.tar.gz
Add a simple Unicode character database interface, Unicode::UCD.
p4raw-id: //depot/perl@11046
-rw-r--r--MANIFEST2
-rw-r--r--lib/Unicode/UCD.pm183
-rw-r--r--lib/Unicode/UCD.t110
3 files changed, 295 insertions, 0 deletions
diff --git a/MANIFEST b/MANIFEST
index 7695b79ff9..12b704750f 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -1161,6 +1161,8 @@ lib/Time/localtime.pm By-name interface to Perl's builtin localtime
lib/Time/localtime.t Test for Time::localtime
lib/Time/tm.pm Internal object for Time::{gm,local}time
lib/timelocal.pl Perl library supporting inverse of localtime, gmtime
+lib/Unicode/UCD.pm Unicode character database
+lib/Unicode/UCD.t See if Unicode character database works
lib/unicode/ArabLink.pl Unicode character database
lib/unicode/ArabLnkGrp.pl Unicode character database
lib/unicode/ArabShap.txt Unicode character database
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
new file mode 100644
index 0000000000..ab214bb770
--- /dev/null
+++ b/lib/Unicode/UCD.pm
@@ -0,0 +1,183 @@
+package Unicode::UCD;
+
+use strict;
+use warnings;
+
+our $VERSION = v3.1.0;
+
+require Exporter;
+
+our @ISA = qw(Exporter);
+our @EXPORT_OK = qw(charinfo charblock);
+
+use Carp;
+
+=head1 NAME
+
+Unicode - Unicode character database
+
+=head1 SYNOPSIS
+
+ use Unicode::UCD 3.1.0;
+ # requires that level of the Unicode character database
+
+ use Unicode::UCD 'charinfo';
+ my %charinfo = charinfo($codepoint);
+
+ use Unicode::UCD 'charblock';
+ my $charblock = charblock($codepoint);
+
+=head1 DESCRIPTION
+
+The Unicode module offers a simple interface to the Unicode Character
+Database.
+
+=cut
+
+my $UNICODE;
+my $BLOCKS;
+
+sub openunicode {
+ my ($rfh, @path) = @_;
+ my $f;
+ unless (defined $$rfh) {
+ for my $d (@INC) {
+ use File::Spec;
+ $f = File::Spec->catfile($d, "unicode", @path);
+ if (open($$rfh, $f)) {
+ last;
+ } else {
+ croak __PACKAGE__, ": open '$f' failed: $!\n";
+ }
+ }
+ croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
+ unless defined $rfh;
+ }
+ return $f;
+}
+
+=head2 charinfo
+
+ use Unicode::UCD 'charinfo';
+
+ my %charinfo = charinfo(0x41);
+
+charinfo() returns a hash that has the following fields as defined
+by the Unicode standard:
+
+ key
+
+ code code point with at least four hexdigits
+ name name of the character IN UPPER CASE
+ category general category of the character
+ combining classes used in the Canonical Ordering Algorithm
+ bidi bidirectional category
+ decomposition character decomposition mapping
+ decimal if decimal digit this is the integer numeric value
+ digit if digit this is the numeric value
+ numeric if numeric is the integer or rational numeric value
+ mirrored if mirrored in bidirectional text
+ unicode10 Unicode 1.0 name if existed and different
+ comment ISO 10646 comment field
+ upper uppercase equivalent mapping
+ lower lowercase equivalent mapping
+ title titlecase equivalent mapping
+ block block the character belongs to (used in \p{In...})
+
+If no match is found, an empty hash is returned.
+
+The C<block> property is the same as as returned by charinfo().
+(It is not defined in the Unicode Character Database proper but
+instead in an auxiliary database.)
+
+=cut
+
+sub charinfo {
+ my $code = shift;
+ my $hexk = sprintf("%04X", $code);
+
+ openunicode(\$UNICODE, "Unicode.txt");
+ if (defined $UNICODE) {
+ use Search::Dict;
+ if (look($UNICODE, "$hexk;") >= 0) {
+ my $line = <$UNICODE>;
+ chomp $line;
+ my %prop;
+ @prop{qw(
+ code name category
+ combining bidi decomposition
+ decimal digit numeric
+ mirrored unicode10 comment
+ upper lower title
+ )} = split(/;/, $line, -1);
+ if ($prop{code} eq $hexk) {
+ $prop{block} = charblock($code);
+ return %prop;
+ }
+ }
+ }
+ return;
+}
+
+=head2 charbloc
+
+ use Unicode::UCD 'charblock';
+
+ my $charblock = charblock(0x41);
+
+charblock() returns the block the character belongs to, e.g.
+C<Basic Latin>. Note that not all the character positions within all
+block are defined.
+
+The name is the same name that is used in the C<\p{In...}> construct,
+for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
+away from the names for the C<\p{In...}>.
+
+=cut
+
+my @BLOCKS;
+
+sub _charblock {
+ my ($code, $lo, $hi) = @_;
+
+ return if $lo > $hi;
+
+ my $mid = int(($lo+$hi) / 2);
+
+ if ($BLOCKS[$mid]->[0] < $code) {
+ if ($BLOCKS[$mid]->[1] >= $code) {
+ return $BLOCKS[$mid]->[2];
+ } else {
+ _charblock($code, $mid + 1, $hi);
+ }
+ } elsif ($BLOCKS[$mid]->[0] > $code) {
+ _charblock($code, $lo, $mid - 1);
+ } else {
+ return $BLOCKS[$mid]->[2];
+ }
+}
+
+sub charblock {
+ my $code = shift;
+
+ unless (@BLOCKS) {
+ if (openunicode(\$BLOCKS, "Blocks.pl")) {
+ while (<$BLOCKS>) {
+ if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
+ push @BLOCKS, [ hex($1), hex($2), $3 ];
+ }
+ }
+ close($BLOCKS);
+ }
+ }
+
+ _charblock($code, 0, $#BLOCKS);
+}
+
+=head1 AUTHOR
+
+Jarkko Hietaniemi
+
+=cut
+
+1;
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
new file mode 100644
index 0000000000..731ac8f7bf
--- /dev/null
+++ b/lib/Unicode/UCD.t
@@ -0,0 +1,110 @@
+use Unicode::UCD 3.1.0;
+
+use Test;
+use strict;
+
+BEGIN { plan tests => 81 };
+
+use Unicode::UCD 'charinfo';
+
+my %charinfo;
+
+%charinfo = charinfo(0x41);
+
+ok($charinfo{code}, '0041');
+ok($charinfo{name}, 'LATIN CAPITAL LETTER A');
+ok($charinfo{category}, 'Lu');
+ok($charinfo{combining}, '0');
+ok($charinfo{bidi}, 'L');
+ok($charinfo{decomposition}, '');
+ok($charinfo{decimal}, '');
+ok($charinfo{digit}, '');
+ok($charinfo{numeric}, '');
+ok($charinfo{mirrored}, 'N');
+ok($charinfo{unicode10}, '');
+ok($charinfo{comment}, '');
+ok($charinfo{upper}, '');
+ok($charinfo{lower}, '0061');
+ok($charinfo{title}, '');
+ok($charinfo{block}, 'Basic Latin');
+
+%charinfo = charinfo(0x100);
+
+ok($charinfo{code}, '0100');
+ok($charinfo{name}, 'LATIN CAPITAL LETTER A WITH MACRON');
+ok($charinfo{category}, 'Lu');
+ok($charinfo{combining}, '0');
+ok($charinfo{bidi}, 'L');
+ok($charinfo{decomposition}, '0041 0304');
+ok($charinfo{decimal}, '');
+ok($charinfo{digit}, '');
+ok($charinfo{numeric}, '');
+ok($charinfo{mirrored}, 'N');
+ok($charinfo{unicode10}, 'LATIN CAPITAL LETTER A MACRON');
+ok($charinfo{comment}, '');
+ok($charinfo{upper}, '');
+ok($charinfo{lower}, '0101');
+ok($charinfo{title}, '');
+ok($charinfo{block}, 'Latin Extended-A');
+
+%charinfo = charinfo(0x590);
+
+ok($charinfo{code}, undef);
+ok($charinfo{name}, undef);
+ok($charinfo{category}, undef);
+ok($charinfo{combining}, undef);
+ok($charinfo{bidi}, undef);
+ok($charinfo{decomposition}, undef);
+ok($charinfo{decimal}, undef);
+ok($charinfo{digit}, undef);
+ok($charinfo{numeric}, undef);
+ok($charinfo{mirrored}, undef);
+ok($charinfo{unicode10}, undef);
+ok($charinfo{comment}, undef);
+ok($charinfo{upper}, undef);
+ok($charinfo{lower}, undef);
+ok($charinfo{title}, undef);
+ok($charinfo{block}, undef);
+
+%charinfo = charinfo(0x5d0);
+
+ok($charinfo{code}, '05D0');
+ok($charinfo{name}, 'HEBREW LETTER ALEF');
+ok($charinfo{category}, 'Lo');
+ok($charinfo{combining}, '0');
+ok($charinfo{bidi}, 'R');
+ok($charinfo{decomposition}, '');
+ok($charinfo{decimal}, '');
+ok($charinfo{digit}, '');
+ok($charinfo{numeric}, '');
+ok($charinfo{mirrored}, 'N');
+ok($charinfo{unicode10}, '');
+ok($charinfo{comment}, '');
+ok($charinfo{upper}, '');
+ok($charinfo{lower}, '');
+ok($charinfo{title}, '');
+ok($charinfo{block}, 'Hebrew');
+
+use Unicode::UCD 'charblock';
+
+ok(charblock(0x590), 'Hebrew');
+
+%charinfo = charinfo(0xbe);
+
+ok($charinfo{code}, '00BE');
+ok($charinfo{name}, 'VULGAR FRACTION THREE QUARTERS');
+ok($charinfo{category}, 'No');
+ok($charinfo{combining}, '0');
+ok($charinfo{bidi}, 'ON');
+ok($charinfo{decomposition}, '<fraction> 0033 2044 0034');
+ok($charinfo{decimal}, '');
+ok($charinfo{digit}, '');
+ok($charinfo{numeric}, '3/4');
+ok($charinfo{mirrored}, 'N');
+ok($charinfo{unicode10}, 'FRACTION THREE QUARTERS');
+ok($charinfo{comment}, '');
+ok($charinfo{upper}, '');
+ok($charinfo{lower}, '');
+ok($charinfo{title}, '');
+ok($charinfo{block}, 'Latin-1 Supplement');
+