summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2010-11-19 12:04:53 -0700
committerFather Chrysostomos <sprout@cpan.org>2010-11-20 18:15:15 -0800
commit71a442a8e083048f771614ff45898af022ade6b7 (patch)
treed9401977771e09665041f8773a79efc88a7dbf28 /lib
parent98ef7649bc2d994db925c45a7d2fdce7dff098b3 (diff)
downloadperl-71a442a8e083048f771614ff45898af022ade6b7.tar.gz
UCD.pm: Don't use CompositionExclusions.txt
The motivation for this patch was to remove the dependence of UCD on another Unicode DB .txt file. But the subroutine that uses it is out-of-date, now that this property, and an even more convenient one, are accessible from the core. So the documentation is also updated to educate people. Instead of using the file, the routine now just uses the core's access method.
Diffstat (limited to 'lib')
-rw-r--r--lib/Unicode/UCD.pm52
-rw-r--r--lib/Unicode/UCD.t2
2 files changed, 26 insertions, 28 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index eb4de2882d..522c540dd9 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -710,45 +710,43 @@ sub bidi_types {
my $compexcl = compexcl(0x09dc);
-This returns B<true> if the
-L</code point argument> should not be produced by composition normalization,
-B<AND> if that fact is not otherwise determinable from the Unicode data base.
-It currently does not return B<true> if the code point has a decomposition
+This routine is included for backwards compatibility, but as of Perl 5.12, for
+most purposes it is probably more convenient to use one of the following
+instead:
+
+ my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex}/;
+ my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion}/;
+
+or even
+
+ my $compexcl = chr(0x09dc) =~ /\p{CE}/;
+ my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion}/;
+
+The first two forms return B<true> if the L</code point argument> should not
+be produced by composition normalization. The final two forms
+additionally require that this fact not otherwise be determinable from
+the Unicode data base for them to return B<true>.
+
+This routine behaves identically to the final two forms. That is,
+it does not return B<true> if the code point has a decomposition
consisting of another single code point, nor if its decomposition starts
with a code point whose combining class is non-zero. Code points that meet
either of these conditions should also not be produced by composition
-normalization.
+normalization, which is probably why you should use the
+C<Full_Composition_Exclusion> property instead, as shown above.
-It returns B<false> otherwise.
+The routine returns B<false> otherwise.
=cut
-my %COMPEXCL;
-
-sub _compexcl {
- unless (%COMPEXCL) {
- if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
- local $_;
- while (<$COMPEXCLFH>) {
- if (/^([0-9A-F]+)\s+\#\s+/) {
- my $code = hex($1);
- $COMPEXCL{$code} = undef;
- }
- }
- close($COMPEXCLFH);
- }
- }
-}
-
sub compexcl {
my $arg = shift;
my $code = _getcode($arg);
croak __PACKAGE__, "::compexcl: unknown code '$arg'"
unless defined $code;
- _compexcl() unless %COMPEXCL;
-
- return exists $COMPEXCL{$code};
+ no warnings "utf8"; # So works on surrogates and non-Unicode code points
+ return chr($code) =~ /\p{Composition_Exclusion}/;
}
=head2 B<casefold()>
@@ -1233,8 +1231,6 @@ if you are wondering where one of your filehandles went, that's where.
Does not yet support EBCDIC platforms.
-L</compexcl()> should give a complete list of excluded code points.
-
=head1 AUTHOR
Jarkko Hietaniemi
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index 795888aa2f..ae8432cfaf 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -301,6 +301,8 @@ is(Unicode::UCD::UnicodeVersion, '6.0.0', 'UnicodeVersion');
use Unicode::UCD qw(compexcl);
ok(!compexcl(0x0100), 'compexcl');
+ok(!compexcl(0xD801), 'compexcl of surrogate');
+ok(!compexcl(0x110000), 'compexcl of non-Unicode code point');
ok( compexcl(0x0958));
use Unicode::UCD qw(casefold);