summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorNicholas Clark <nick@ccl4.org>2003-11-29 17:29:15 +0000
committerNicholas Clark <nick@ccl4.org>2003-11-29 17:29:15 +0000
commit10d7ec48cd7252976e5d98d8245df9ed1b239c74 (patch)
tree877721fa0852aabc4e472134f760ec70864133a4 /lib
parent4158a951785f90e7eefe039d837a9048e68bac32 (diff)
downloadperl-10d7ec48cd7252976e5d98d8245df9ed1b239c74.tar.gz
Update Unicode::Collate to 0.31 (Only the .pm version for now)
p4raw-id: //depot/perl@21810
Diffstat (limited to 'lib')
-rw-r--r--lib/Unicode/Collate.pm32
-rw-r--r--lib/Unicode/Collate/Changes13
-rw-r--r--lib/Unicode/Collate/README41
-rw-r--r--lib/Unicode/Collate/t/illegal.t85
-rw-r--r--lib/Unicode/Collate/t/illegalp.t80
5 files changed, 225 insertions, 26 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm
index a4d6d80cd1..e700597e77 100644
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -12,9 +12,11 @@ use warnings;
use Carp;
use File::Spec;
+no warnings 'utf8';
+
require Exporter;
-our $VERSION = '0.30';
+our $VERSION = '0.31';
our $PACKAGE = __PACKAGE__;
our @ISA = qw(Exporter);
@@ -206,7 +208,7 @@ sub checkCollator {
or croak "Illegal UCA version (passed $self->{UCA_Version}).";
$self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
- $self->{alternateTable} || $self->{alternate} || 'shifted';
+ $self->{alternateTable} || 'shifted';
$self->{variable} = $self->{alternate} = lc($self->{variable});
exists $VariableOK{ $self->{variable} }
or croak "$PACKAGE unknown variable tag name: $self->{variable}";
@@ -499,7 +501,7 @@ sub splitEnt
}
for (my $i = 0; $i < @src; $i++) {
- next if _isNonCharacter($src[$i]);
+ next if _isIllegal($src[$i]);
my $i_orig = $i;
my $jcps = $src[$i];
@@ -801,7 +803,7 @@ sub _decompHangul {
);
}
-sub _isNonCharacter {
+sub _isIllegal {
my $code = shift;
return ! defined $code # removed
|| ($code < 0 || 0x10FFFF < $code) # out of range
@@ -1344,11 +1346,10 @@ but it is not warned at present.>
You can use another collation element table if desired.
The table file must be put into a directory
-where F<Unicode/Collate.pm> is installed.
-E.g. in F<perl/lib/Unicode/Collate> directory
-when you have F<perl/lib/Unicode/Collate.pm>.
+where F<Unicode/Collate.pm> is installed; e.g. into
+F<perl/lib/Unicode/Collate/> if you have F<perl/lib/Unicode/Collate.pm>.
-By default, the filename F<"allkeys.txt"> is used.
+By default, the filename F<allkeys.txt> is used.
If C<undef> is passed explicitly as the value for this key,
no file is read (but you can define collation elements via C<entry>).
@@ -1680,9 +1681,8 @@ assign C<normalization =E<gt> undef> explicitly.
=head2 Conformance Test
-The Conformance Test for the UCA is provided
-in L<http://www.unicode.org/reports/tr10/CollationTest.html>
-and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
+The Conformance Test for the UCA is available
+under L<http://www.unicode.org/Public/UCA/>.
For F<CollationTest_SHIFTED.txt>,
a collator via C<Unicode::Collate-E<gt>new( )> should be used;
@@ -1693,7 +1693,7 @@ B<Unicode::Normalize is required to try The Conformance Test.>
=head1 AUTHOR
-SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
+SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
http://homepage1.nifty.com/nomenclator/perl/
@@ -1712,17 +1712,17 @@ L<http://www.unicode.org/reports/tr10/>
=item The Default Unicode Collation Element Table (DUCET)
-L<http://www.unicode.org/reports/tr10/allkeys.txt>
+L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
=item The conformance test for the UCA
-L<http://www.unicode.org/reports/tr10/CollationTest.html>
+L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
-L<http://www.unicode.org/reports/tr10/CollationTest.zip>
+L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
=item Hangul Syllable Type
-http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt
+L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
=item Unicode Normalization Forms - UAX #15
diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes
index 7f92d7aad1..df60b97c7f 100644
--- a/lib/Unicode/Collate/Changes
+++ b/lib/Unicode/Collate/Changes
@@ -1,14 +1,25 @@
Revision history for Perl module Unicode::Collate.
+0.31 Sun Nov 16 15:40:15 2003
+ - Illegal code points (surrogates and noncharacters; they are definitely
+ ignorable) are now distinguished from NULL ("\0");
+ however, this does not work when the Pure Perl edition is used
+ with Perl 5.7.3 or earlier. On perl 5.6.X, the XSUB edition may help,
+ as it is used in place of the broken CORE::unpack('U*') of older perls.
+ - added illegal.t and illegalp.t.
+ - added XSUB edition (EXPERIMENTAL) where some functions are implemented
+ in XSUB (Pure Perl edition is also supported.)
+
0.30 Mon Oct 13 21:26:37 2003
- fix: Completely ignorable in table should be able to be overrided
by non-ignorable in entry.
- fix: Maximum length for contraction must not be shortened
- by a shorter contraction following.
+ by a shorter contraction following in table and/or entry.
- added normal.t.
- some doc fixes
0.29 Mon Oct 13 12:18:23 2003
+ - now UCA Version 11.
- supported hangul_terminator.
- fix: Base_Unicode_Version falsely returns Perl's Unicode version.
C4 in UTS #10 requires UTS's Unicode version.
diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README
index 6a4b712a8b..2fc4e5fcc6 100644
--- a/lib/Unicode/Collate/README
+++ b/lib/Unicode/Collate/README
@@ -1,4 +1,4 @@
-Unicode/Collate version 0.30
+Unicode/Collate version 0.31
===============================
NAME
@@ -23,6 +23,22 @@ SYNOPSIS
INSTALLATION
Perl 5.6.1 or later
+(recommended: Perl 5.8.0 or later)
+
+To use this module, it is recommended to install a table file
+in the UCA format, by copying it into the directory
+where Unicode/Collate.pm is installed;
+e.g. into the perl/lib/Unicode/Collate/ directory
+if you have perl/lib/Unicode/Collate.pm.
+
+The most preferable one is "The Default Unicode Collation Element Table",
+available from the Unicode consortium's website:
+
+ http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
+
+Though this distribution contains a subset of allkeys.txt, named "keys.txt",
+this one is intended only for doing a test of this module
+and practically useless for any other purpose.
To install this module type the following:
@@ -31,17 +47,24 @@ To install this module type the following:
make test
make install
-To use this module, it is better to install a table file in the UCA format,
-by copying it into the lib/Unicode/Collate directory.
+If you have a C compiler and want to use the XSUB edition,
+type the following (!! "enableXS" must be run before "Makefile.PL" !!):
-The most preferable one is "The Default Unicode Collation Element Table",
-available from the Unicode consortium's website:
+ perl enableXS
+ perl Makefile.PL
+ make
+ make test
+ make install
- http://www.unicode.org/reports/tr10/allkeys.txt
+If you decide to install the pure Perl (i.e. non-XS) edition after trying
+to build the XSUB edition, type the following:
-Though this distribution contains a subset of allkeys.txt, named "keys.txt",
-this one is intended only for doing a test of this module
-and practically useless for any other purpose.
+ make clean
+ perl disableXS
+ perl Makefile.PL
+ make
+ make test
+ make install
DEPENDENCIES
diff --git a/lib/Unicode/Collate/t/illegal.t b/lib/Unicode/Collate/t/illegal.t
new file mode 100644
index 0000000000..b9961b6981
--- /dev/null
+++ b/lib/Unicode/Collate/t/illegal.t
@@ -0,0 +1,85 @@
+
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Collate " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+use Test;
+use strict;
+use warnings;
+
+BEGIN {
+ use Unicode::Collate;
+
+ unless (exists &Unicode::Collate::bootstrap or 5.008 <= $]) {
+ print "1..0 # skipped: XSUB, or Perl 5.8.0 or later".
+ " needed for this test\n";
+ print $@;
+ exit;
+ }
+}
+
+BEGIN { plan tests => 22 };
+
+ok(1);
+
+#########################
+
+no warnings 'utf8';
+
+# NULL is tailorable but illegal code points are not.
+# Illegal code points should always be ignored
+# (cf. UCA, 7.1.1 Illegal code points).
+
+my $illeg = Unicode::Collate->new(
+ entry => <<'ENTRIES',
+0000 ; [.0020.0000.0000.0000] # [0000] NULL
+0001 ; [.0021.0000.0000.0001] # [0001] START OF HEADING
+FFFE ; [.0022.0000.0000.FFFE] # <noncharacter-FFFE>
+FFFF ; [.0023.0000.0000.FFFF] # <noncharacter-FFFF>
+D800 ; [.0024.0000.0000.D800] # <surrogate-D800>
+DFFF ; [.0025.0000.0000.DFFF] # <surrogate-DFFF>
+FDD0 ; [.0026.0000.0000.FDD0] # <noncharacter-FDD0>
+FDEF ; [.0027.0000.0000.FDEF] # <noncharacter-FDEF>
+0002 ; [.0030.0000.0000.0002] # [0002] START OF TEXT
+10FFFF; [.0040.0000.0000.10FFFF] # <noncharacter-10FFFF>
+110000; [.0041.0000.0000.110000] # <out-of-range 110000>
+ENTRIES
+ level => 1,
+ table => undef,
+ normalization => undef,
+);
+
+ok($illeg->lt("", "\x00"));
+ok($illeg->lt("", "\x01"));
+ok($illeg->eq("", "\x{FFFE}"));
+ok($illeg->eq("", "\x{FFFF}"));
+ok($illeg->eq("", "\x{D800}"));
+ok($illeg->eq("", "\x{DFFF}"));
+ok($illeg->eq("", "\x{FDD0}"));
+ok($illeg->eq("", "\x{FDEF}"));
+ok($illeg->lt("", "\x02"));
+ok($illeg->eq("", "\x{10FFFF}"));
+ok($illeg->eq("", "\x{110000}"));
+
+ok($illeg->lt("\x00", "\x01"));
+ok($illeg->lt("\x01", "\x02"));
+ok($illeg->ne("\0", "\x{D800}"));
+ok($illeg->ne("\0", "\x{DFFF}"));
+ok($illeg->ne("\0", "\x{FDD0}"));
+ok($illeg->ne("\0", "\x{FDEF}"));
+ok($illeg->ne("\0", "\x{FFFE}"));
+ok($illeg->ne("\0", "\x{FFFF}"));
+ok($illeg->ne("\0", "\x{10FFFF}"));
+ok($illeg->ne("\0", "\x{110000}"));
+
diff --git a/lib/Unicode/Collate/t/illegalp.t b/lib/Unicode/Collate/t/illegalp.t
new file mode 100644
index 0000000000..690c88d0bb
--- /dev/null
+++ b/lib/Unicode/Collate/t/illegalp.t
@@ -0,0 +1,80 @@
+
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Collate " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+use Test;
+BEGIN { plan tests => 17 };
+
+use strict;
+use warnings;
+
+ok(1);
+
+#
+# No test for Unicode::Collate is included in this .t file.
+#
+# The UCA conformance test requires that completely ignorable characters
+# (including noncharacters) be sortable in code point order;
+# otherwise Unicode::Collate cannot be compliant with the UCA.
+#
+# ~~~ CollationTest_SHIFTED.txt in CollationTest-4.0.0
+#
+# 206F 0021; # ! NOMINAL DIGIT SHAPES [| | | 0251]
+# D800 0021; # ! <surrogate-D800> [| | | 0251]
+# DFFF 0021; # ! <surrogate-DFFF> [| | | 0251]
+# FDD0 0021; # ! <noncharacter-FDD0> [| | | 0251]
+# FFFB 0021; # ! INTERLINEAR ANNOTATION TERMINATOR [| | | 0251]
+# FFFE 0021; # ! <noncharacter-FFFE> [| | | 0251]
+# FFFF 0021; # ! <noncharacter-FFFF> [| | | 0251]
+# 1D165 0021; # ! MS. Cm. STEM [| | | 0251]
+#
+# ~~~ CollationTest_NON_IGNORABLE.txt in CollationTest-4.0.0
+#
+# 206F 0021; # ! NOMINAL DIGIT SHAPES [0251 | 0020 | 0002 |]
+# D800 0021; # ! <surrogate-D800> [0251 | 0020 | 0002 |]
+# DFFF 0021; # ! <surrogate-DFFF> [0251 | 0020 | 0002 |]
+# FDD0 0021; # ! <noncharacter-FDD0> [0251 | 0020 | 0002 |]
+# FFFB 0021; # ! INTERLINEAR ANNOTATION TERMINATOR [0251 | 0020 | 0002 |]
+# FFFE 0021; # ! <noncharacter-FFFE> [0251 | 0020 | 0002 |]
+# FFFF 0021; # ! <noncharacter-FFFF> [0251 | 0020 | 0002 |]
+# 1D165 0021; # ! MS. Cm. STEM [0251 | 0020 | 0002 |]
+#
+
+no warnings 'utf8';
+
+ok("\x{206F}!" lt "\x{D800}!");
+ok(pack('U*', 0x206F, 0x21) lt pack('U*', 0xD800, 0x21));
+
+ok("\x{D800}!" lt "\x{DFFF}!");
+ok(pack('U*', 0xD800, 0x21) lt pack('U*', 0xDFFF, 0x21));
+
+ok("\x{DFFF}!" lt "\x{FDD0}!");
+ok(pack('U*', 0xDFFF, 0x21) lt pack('U*', 0xFDD0, 0x21) );
+
+ok("\x{FDD0}!" lt "\x{FFFB}!");
+ok(pack('U*', 0xFDD0, 0x21) lt pack('U*', 0xFFFB, 0x21));
+
+ok("\x{FFFB}!" lt "\x{FFFE}!");
+ok(pack('U*', 0xFFFB, 0x21) lt pack('U*', 0xFFFE, 0x21));
+
+ok("\x{FFFE}!" lt "\x{FFFF}!");
+ok(pack('U*', 0xFFFE, 0x21) lt pack('U*', 0xFFFF, 0x21));
+
+ok("\x{FFFF}!" lt "\x{1D165}!");
+ok(pack('U*', 0xFFFF, 0x21) lt pack('U*', 0x1D165, 0x21));
+
+ok("\000!" lt "\x{FFFF}!");
+ok(pack('U*', 0, 0x21) lt pack('U*', 0xFFFF, 0x21));
+