1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
|
package Unicode::UCD;
use strict;
use warnings;
# Callers may request this version with "use Unicode::UCD 3.1.0;" (see SYNOPSIS).
our $VERSION = '3.1.0';
require Exporter;
our @ISA = qw(Exporter);
# Nothing is exported by default; charinfo() and charblock() are available on request.
our @EXPORT_OK = qw(charinfo charblock);
# Provides croak(), used by openunicode() to report errors.
use Carp;
=head1 NAME
Unicode::UCD - Unicode character database
=head1 SYNOPSIS
use Unicode::UCD 3.1.0;
# requires that level of the Unicode character database
use Unicode::UCD 'charinfo';
my %charinfo = charinfo($codepoint);
use Unicode::UCD 'charblock';
my $charblock = charblock($codepoint);
=head1 DESCRIPTION
The Unicode::UCD module offers a simple interface to the Unicode Character
Database.
=cut
# Filehandle for the character data file; charinfo() passes a reference to
# it to openunicode() together with the file name "Unicode.txt".
my $UNICODE;
# Filehandle for the block list; charblock() passes a reference to it to
# openunicode() together with the file name "Blocks.pl".
my $BLOCKS;
# Locate and open a data file that lives in a "unicode" directory below
# one of the directories listed in @INC.
#
# Arguments:
#   $rfh  - reference to the scalar that holds (or will hold) the filehandle
#   @path - path components of the file, relative to "<dir>/unicode"
#
# If $$rfh is already defined nothing is opened.  Otherwise the plain
# directories in @INC are tried in order and the first file that opens
# successfully is stored in $$rfh.
#
# Returns the path name of the file that was opened, or undef when $$rfh
# was already defined on entry.  Croaks when the file cannot be opened
# from any directory in @INC.
sub openunicode {
    my ($rfh, @path) = @_;
    use File::Spec;
    my $f;
    unless (defined $$rfh) {
        for my $d (@INC) {
            # References in @INC are loader hooks, not directories.
            next if ref $d;
            my $candidate = File::Spec->catfile($d, "unicode", @path);
            # Open into a fresh lexical so that $$rfh is only ever set to
            # a handle that really opened.  A failure here just means the
            # file is not in this directory: keep searching.
            if (open(my $fh, '<', $candidate)) {
                $$rfh = $fh;
                $f    = $candidate;
                last;
            }
        }
        croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
            unless defined $$rfh;
    }
    return $f;
}
=head2 charinfo
use Unicode::UCD 'charinfo';
my %charinfo = charinfo(0x41);
charinfo() returns a hash that has the following fields as defined
by the Unicode standard:
key
code code point with at least four hexdigits
name name of the character IN UPPER CASE
category general category of the character
combining classes used in the Canonical Ordering Algorithm
bidi bidirectional category
decomposition character decomposition mapping
decimal if decimal digit this is the integer numeric value
digit if digit this is the numeric value
numeric if numeric this is the integer or rational numeric value
mirrored if mirrored in bidirectional text
unicode10 Unicode 1.0 name if existed and different
comment ISO 10646 comment field
upper uppercase equivalent mapping
lower lowercase equivalent mapping
title titlecase equivalent mapping
block block the character belongs to (used in \p{In...})
If no match is found, an empty hash is returned.
The C<block> property is the same as returned by charblock().
(It is not defined in the Unicode Character Database proper but
instead in an auxiliary database.)
=cut
# Look up the record for one code point in the "Unicode.txt" data file.
#
# Argument:
#   $code - the code point as a number (e.g. 0x41)
#
# Returns a key/value list with the fields code, name, category,
# combining, bidi, decomposition, decimal, digit, numeric, mirrored,
# unicode10, comment, upper, lower and title taken from the matching
# line, plus "block" as computed by charblock().  Returns an empty list
# (undef in scalar context) when no line for the code point is found.
sub charinfo {
    my $code = shift;
    my $hexk = sprintf("%04X", $code);
    openunicode(\$UNICODE, "Unicode.txt");
    return unless defined $UNICODE;

    use Search::Dict 1.02;
    # Code points in the file have a varying number of hex digits, so the
    # raw lines are not in string order ("10000" sorts before "FFFF").
    # Compare on a fixed-width, zero-padded code instead; look() applies
    # xfrm to the lines it reads but not to the key, so the key is padded
    # the same way by hand.
    my $pad = sub {
        my $raw = shift;
        return $raw
            unless defined $raw && $raw =~ /^([0-9A-Fa-f]+);(.*)/s;
        return sprintf("%06X;%s", hex($1), $2);
    };
    return if look($UNICODE, sprintf("%06X;", $code), { xfrm => $pad }) < 0;

    # look() may leave the handle at end of file when every line sorts
    # before the key.
    my $line = <$UNICODE>;
    return unless defined $line;
    chomp $line;

    my %prop;
    @prop{qw(
        code name category
        combining bidi decomposition
        decimal digit numeric
        mirrored unicode10 comment
        upper lower title
    )} = split(/;/, $line, -1);

    # look() positions at the first line not less than the key, which is
    # not necessarily an exact match.
    return unless defined $prop{code} && $prop{code} eq $hexk;

    $prop{block} = charblock($code);
    return %prop;
}
=head2 charblock
use Unicode::UCD 'charblock';
my $charblock = charblock(0x41);
charblock() returns the block the character belongs to, e.g.
C<Basic Latin>. Note that not all the character positions within all
blocks are defined.
The name is the same name that is used in the C<\p{In...}> construct,
for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
away from the names for the C<\p{In...}> construct).
=cut
# Block table: each entry is [ first code point, last code point, name ].
# charblock() fills it on first use.  The search below assumes the entries
# are in ascending code point order.
my @BLOCKS;

# Binary search of @BLOCKS, restricted to the indices $lo .. $hi, for the
# block that contains $code.
#
# Returns the block name, or nothing (undef in scalar context, the empty
# list in list context) when no entry in the index range covers $code.
sub _charblock {
    my ($code, $lo, $hi) = @_;

    while ($lo <= $hi) {
        my $mid   = int(($lo + $hi) / 2);
        my $first = $BLOCKS[$mid]->[0];

        if ($first > $code) {
            # Block starts after the code point: continue in the lower half.
            $hi = $mid - 1;
        }
        elsif ($first == $code || $BLOCKS[$mid]->[1] >= $code) {
            # Code point lies within this block's range.
            return $BLOCKS[$mid]->[2];
        }
        else {
            # Block ends before the code point: continue in the upper half.
            $lo = $mid + 1;
        }
    }

    return;
}
# Return the name of the block that contains a code point.
#
# Argument:
#   $code - the code point as a number (e.g. 0x41)
#
# On the first call (while @BLOCKS is still empty) the block table is
# loaded from "Blocks.pl": every line consisting of two upper-case hex
# numbers followed by a name becomes one [ first, last, name ] entry.
#
# Returns the block name, or undef (the empty list in list context) when
# no block in the table covers the code point.
sub charblock {
    my $code = shift;
    unless (@BLOCKS) {
        if (openunicode(\$BLOCKS, "Blocks.pl")) {
            # Read into a lexical rather than the global $_: the caller's
            # $_ may be aliased to its own data (for example inside a
            # for or map over a list of code points) and must not be
            # overwritten by this loop.
            while (my $line = <$BLOCKS>) {
                if ($line =~ /^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
                    push @BLOCKS, [ hex($1), hex($2), $3 ];
                }
            }
            close($BLOCKS);
        }
    }
    return _charblock($code, 0, $#BLOCKS);
}
=head1 AUTHOR
Jarkko Hietaniemi
=cut
1;
|