summaryrefslogtreecommitdiff
path: root/lib/Unicode
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2007-05-18 06:45:29 +0300
committerCraig A. Berry <craigberry@mac.com>2007-05-18 22:01:51 +0000
commitea508aee27b2f1cd6703f49dc50601d7c9c3d20a (patch)
treeb2d8ec424dd1d9cf0d61cd9dcc6512ed14cd5e93 /lib/Unicode
parentb4ff380f20561274e4b6f0c9fde2644d95c98ac8 (diff)
downloadperl-ea508aee27b2f1cd6703f49dc50601d7c9c3d20a.tar.gz
Unicode::UCD: add general category and bidi type interfaces
Message-Id: <200705180045.l4I0jTeI221780@kosh.hut.fi> p4raw-id: //depot/perl@31237
Diffstat (limited to 'lib/Unicode')
-rw-r--r--lib/Unicode/UCD.pm124
-rw-r--r--lib/Unicode/UCD.t18
2 files changed, 135 insertions, 7 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 6a2b5e1384..23feae00f8 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -3,7 +3,7 @@ package Unicode::UCD;
use strict;
use warnings;
-our $VERSION = '0.24';
+our $VERSION = '0.25';
use Storable qw(dclone);
@@ -15,6 +15,7 @@ our @EXPORT_OK = qw(charinfo
charblock charscript
charblocks charscripts
charinrange
+ general_categories bidi_types
compexcl
casefold casespec
namedseq);
@@ -40,12 +41,16 @@ Unicode::UCD - Unicode character database
my $charblocks = charblocks();
use Unicode::UCD 'charscripts';
- my %charscripts = charscripts();
+ my $charscripts = charscripts();
use Unicode::UCD qw(charscript charinrange);
my $range = charscript($script);
print "looks like $script\n" if charinrange($range, $codepoint);
+ use Unicode::UCD qw(general_categories bidi_types);
+ my $categories = general_categories();
+ my $types = bidi_types();
+
use Unicode::UCD 'compexcl';
my $compexcl = compexcl($codepoint);
@@ -102,7 +107,7 @@ as defined by the Unicode standard:
name name of the character IN UPPER CASE
category general category of the character
combining classes used in the Canonical Ordering Algorithm
- bidi bidirectional category
+ bidi bidirectional type
decomposition character decomposition mapping
decimal if decimal digit this is the integer numeric value
digit if digit this is the numeric value
@@ -423,10 +428,11 @@ sub charblocks {
use Unicode::UCD 'charscripts';
- my %charscripts = charscripts();
+ my $charscripts = charscripts();
-charscripts() returns a hash with the known script names as the keys,
-and the code point ranges (see L</charscript>) as the values.
+charscripts() returns a reference to a hash with the known script
+names as the keys, and the code point ranges (see L</charscript>) as
+the values.
See also L</Blocks versus Scripts>.
@@ -487,6 +493,112 @@ by L</charblocks> and L</charscripts> by using charinrange():
=cut
+my %GENERAL_CATEGORIES =  # short general category name => long name, e.g. Lu => UppercaseLetter
+ (
+ 'L' => 'Letter',
+ 'LC' => 'CasedLetter',
+ 'Lu' => 'UppercaseLetter',
+ 'Ll' => 'LowercaseLetter',
+ 'Lt' => 'TitlecaseLetter',
+ 'Lm' => 'ModifierLetter',
+ 'Lo' => 'OtherLetter',
+ 'M' => 'Mark',
+ 'Mn' => 'NonspacingMark',
+ 'Mc' => 'SpacingMark',
+ 'Me' => 'EnclosingMark',
+ 'N' => 'Number',
+ 'Nd' => 'DecimalNumber',
+ 'Nl' => 'LetterNumber',
+ 'No' => 'OtherNumber',
+ 'P' => 'Punctuation',
+ 'Pc' => 'ConnectorPunctuation',
+ 'Pd' => 'DashPunctuation',
+ 'Ps' => 'OpenPunctuation',
+ 'Pe' => 'ClosePunctuation',
+ 'Pi' => 'InitialPunctuation',
+ 'Pf' => 'FinalPunctuation',
+ 'Po' => 'OtherPunctuation',
+ 'S' => 'Symbol',
+ 'Sm' => 'MathSymbol',
+ 'Sc' => 'CurrencySymbol',
+ 'Sk' => 'ModifierSymbol',
+ 'So' => 'OtherSymbol',
+ 'Z' => 'Separator',
+ 'Zs' => 'SpaceSeparator',
+ 'Zl' => 'LineSeparator',
+ 'Zp' => 'ParagraphSeparator',
+ 'C' => 'Other',
+ 'Cc' => 'Control',
+ 'Cf' => 'Format',
+ 'Cs' => 'Surrogate',
+ 'Co' => 'PrivateUse',
+ 'Cn' => 'Unassigned',
+ );
+
+sub general_categories {  # takes no arguments; exportable on request via @EXPORT_OK
+ return dclone \%GENERAL_CATEGORIES;  # Storable deep copy: changes made by the caller do not touch %GENERAL_CATEGORIES
+}
+
+=head2 general_categories
+
+ use Unicode::UCD 'general_categories';
+
+ my $categories = general_categories();
+
+general_categories() returns a reference to a hash which has short
+general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
+names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
+C<Symbol>) as values. The hash is reversible in case you need to go
+from the long names to the short names. The general category is the
+one returned from charinfo() under the C<category> key.
+
+=cut
+
+my %BIDI_TYPES =  # short bidi (bidirectional) type name => long name, e.g. AL => Right-to-Left Arabic
+ (
+ 'L' => 'Left-to-Right',
+ 'LRE' => 'Left-to-Right Embedding',
+ 'LRO' => 'Left-to-Right Override',
+ 'R' => 'Right-to-Left',
+ 'AL' => 'Right-to-Left Arabic',
+ 'RLE' => 'Right-to-Left Embedding',
+ 'RLO' => 'Right-to-Left Override',
+ 'PDF' => 'Pop Directional Format',
+ 'EN' => 'European Number',
+ 'ES' => 'European Number Separator',
+ 'ET' => 'European Number Terminator',
+ 'AN' => 'Arabic Number',
+ 'CS' => 'Common Number Separator',
+ 'NSM' => 'Non-Spacing Mark',
+ 'BN' => 'Boundary Neutral',
+ 'B' => 'Paragraph Separator',
+ 'S' => 'Segment Separator',
+ 'WS' => 'Whitespace',
+ 'ON' => 'Other Neutrals',
+ );
+
+sub bidi_types {  # takes no arguments; exportable on request via @EXPORT_OK
+ return dclone \%BIDI_TYPES;  # Storable deep copy: changes made by the caller do not touch %BIDI_TYPES
+}
+
+=head2 bidi_types
+
+ use Unicode::UCD 'bidi_types';
+
+ my $types = bidi_types();
+
+bidi_types() returns a reference to a hash which has the short
+bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
+names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
+hash is reversible in case you need to go from the long names to the
+short names. The bidi type is the one returned from charinfo()
+under the C<bidi> key. For the exact meaning of the various bidi types,
+the Unicode TR9 is recommended reading:
+http://www.unicode.org/reports/tr9/tr9-17.html
+(as of Unicode 5.0.0)
+
+=cut
+
=head2 compexcl
use Unicode::UCD 'compexcl';
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index 52075b807f..3ade6b3717 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -18,7 +18,7 @@ use strict;
use Unicode::UCD;
use Test::More;
-BEGIN { plan tests => 188 };
+BEGIN { plan tests => 194 };
use Unicode::UCD 'charinfo';
@@ -238,6 +238,22 @@ ok( charinrange($ranges, "13a0"));
ok( charinrange($ranges, "13f4"));
ok(!charinrange($ranges, "13f5"));
+use Unicode::UCD qw(general_categories);
+
+my $gc = general_categories();  # hash ref: short general category name => long name
+
+ok(exists $gc->{L}, 'has L');  # single-letter key is present
+is($gc->{L}, 'Letter', 'L is Letter');
+is($gc->{Lu}, 'UppercaseLetter', 'Lu is UppercaseLetter');  # two-letter key maps to its long name
+
+use Unicode::UCD qw(bidi_types);
+
+my $bt = bidi_types();  # hash ref: short bidi type name => long name
+
+ok(exists $bt->{L}, 'has L');  # here L is the bidi type key, not the general category
+is($bt->{L}, 'Left-to-Right', 'L is Left-to-Right');
+is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic');
+
is(Unicode::UCD::UnicodeVersion, '5.0.0', 'UnicodeVersion');
use Unicode::UCD qw(compexcl);