diff options
author | Nicholas Clark <nick@ccl4.org> | 2003-11-29 17:29:15 +0000 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2003-11-29 17:29:15 +0000 |
commit | 10d7ec48cd7252976e5d98d8245df9ed1b239c74 (patch) | |
tree | 877721fa0852aabc4e472134f760ec70864133a4 /lib/Unicode | |
parent | 4158a951785f90e7eefe039d837a9048e68bac32 (diff) | |
download | perl-10d7ec48cd7252976e5d98d8245df9ed1b239c74.tar.gz |
Update Unicode::Collate to 0.31 (Only the .pm version for now)
p4raw-id: //depot/perl@21810
Diffstat (limited to 'lib/Unicode')
-rw-r--r-- | lib/Unicode/Collate.pm | 32 | ||||
-rw-r--r-- | lib/Unicode/Collate/Changes | 13 | ||||
-rw-r--r-- | lib/Unicode/Collate/README | 41 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/illegal.t | 85 | ||||
-rw-r--r-- | lib/Unicode/Collate/t/illegalp.t | 80 |
5 files changed, 225 insertions, 26 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index a4d6d80cd1..e700597e77 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -12,9 +12,11 @@ use warnings; use Carp; use File::Spec; +no warnings 'utf8'; + require Exporter; -our $VERSION = '0.30'; +our $VERSION = '0.31'; our $PACKAGE = __PACKAGE__; our @ISA = qw(Exporter); @@ -206,7 +208,7 @@ sub checkCollator { or croak "Illegal UCA version (passed $self->{UCA_Version})."; $self->{variable} ||= $self->{alternate} || $self->{variableTable} || - $self->{alternateTable} || $self->{alternate} || 'shifted'; + $self->{alternateTable} || 'shifted'; $self->{variable} = $self->{alternate} = lc($self->{variable}); exists $VariableOK{ $self->{variable} } or croak "$PACKAGE unknown variable tag name: $self->{variable}"; @@ -499,7 +501,7 @@ sub splitEnt } for (my $i = 0; $i < @src; $i++) { - next if _isNonCharacter($src[$i]); + next if _isIllegal($src[$i]); my $i_orig = $i; my $jcps = $src[$i]; @@ -801,7 +803,7 @@ sub _decompHangul { ); } -sub _isNonCharacter { +sub _isIllegal { my $code = shift; return ! defined $code # removed || ($code < 0 || 0x10FFFF < $code) # out of range @@ -1344,11 +1346,10 @@ but it is not warned at present.> You can use another collation element table if desired. The table file must be put into a directory -where F<Unicode/Collate.pm> is installed. -E.g. in F<perl/lib/Unicode/Collate> directory -when you have F<perl/lib/Unicode/Collate.pm>. +where F<Unicode/Collate.pm> is installed; e.g. into +F<perl/lib/Unicode/Collate/> if you have F<perl/lib/Unicode/Collate.pm>. -By default, the filename F<"allkeys.txt"> is used. +By default, the filename F<allkeys.txt> is used. If C<undef> is passed explicitly as the value for this key, no file is read (but you can define collation elements via C<entry>). @@ -1680,9 +1681,8 @@ assign C<normalization =E<gt> undef> explicitly. 
=head2 Conformance Test -The Conformance Test for the UCA is provided -in L<http://www.unicode.org/reports/tr10/CollationTest.html> -and L<http://www.unicode.org/reports/tr10/CollationTest.zip> +The Conformance Test for the UCA is available +under L<http://www.unicode.org/Public/UCA/>. For F<CollationTest_SHIFTED.txt>, a collator via C<Unicode::Collate-E<gt>new( )> should be used; @@ -1693,7 +1693,7 @@ B<Unicode::Normalize is required to try The Conformance Test.> =head1 AUTHOR -SADAHIRO Tomoyuki, <SADAHIRO@cpan.org> +SADAHIRO Tomoyuki <SADAHIRO@cpan.org> http://homepage1.nifty.com/nomenclator/perl/ @@ -1712,17 +1712,17 @@ L<http://www.unicode.org/reports/tr10/> =item The Default Unicode Collation Element Table (DUCET) -L<http://www.unicode.org/reports/tr10/allkeys.txt> +L<http://www.unicode.org/Public/UCA/latest/allkeys.txt> =item The conformance test for the UCA -L<http://www.unicode.org/reports/tr10/CollationTest.html> +L<http://www.unicode.org/Public/UCA/latest/CollationTest.html> -L<http://www.unicode.org/reports/tr10/CollationTest.zip> +L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip> =item Hangul Syllable Type -http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt +L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> =item Unicode Normalization Forms - UAX #15 diff --git a/lib/Unicode/Collate/Changes b/lib/Unicode/Collate/Changes index 7f92d7aad1..df60b97c7f 100644 --- a/lib/Unicode/Collate/Changes +++ b/lib/Unicode/Collate/Changes @@ -1,14 +1,25 @@ Revision history for Perl module Unicode::Collate. +0.31 Sun Nov 16 15:40:15 2003 + - Illegal code points (surrogate and noncharacter; they are definitely + ignorable) will be distinguished from NULL ("\0"); + but porting is not successful in the case of ((Pure Perl) and + (Perl 5.7.3 or before)). If perl 5.6.X is used, XSUB may help it + in place of broken CORE::unpack('U*') in older perl. + - added illegal.t and illegalp.t. 
+ - added XSUB edition (EXPERIMENTAL) where some functions are implemented + in XSUB (Pure Perl edition is also supported.) + 0.30 Mon Oct 13 21:26:37 2003 - fix: Completely ignorable in table should be able to be overridden by non-ignorable in entry. - fix: Maximum length for contraction must not be shortened - by a shorter contraction following. + by a shorter contraction following in table and/or entry. - added normal.t. - some doc fixes 0.29 Mon Oct 13 12:18:23 2003 + - now UCA Version 11. - supported hangul_terminator. - fix: Base_Unicode_Version falsely returns Perl's Unicode version. C4 in UTS #10 requires UTS's Unicode version. diff --git a/lib/Unicode/Collate/README b/lib/Unicode/Collate/README index 6a4b712a8b..2fc4e5fcc6 100644 --- a/lib/Unicode/Collate/README +++ b/lib/Unicode/Collate/README @@ -1,4 +1,4 @@ -Unicode/Collate version 0.30 +Unicode/Collate version 0.31 =============================== NAME @@ -23,6 +23,22 @@ SYNOPSIS INSTALLATION Perl 5.6.1 or later +(recommended: Perl 5.8.0 or later) + +To use this module, it is recommended to install a table file +in the UCA format, by copying it into the directory +where F<Unicode/Collate.pm> is installed; +e.g. into F<perl/lib/Unicode/Collate/> directory +if you have F<perl/lib/Unicode/Collate.pm>. + +The most preferable one is "The Default Unicode Collation Element Table", +available from the Unicode consortium's website: + + http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version) + +Though this distribution contains a subset of allkeys.txt, named "keys.txt", +this one is intended only for doing a test of this module +and practically useless for any other purpose. To install this module type the following: @@ -31,17 +47,24 @@ To install this module type the following: make test make install -To use this module, it is better to install a table file in the UCA format, -by copying it into the lib/Unicode/Collate directory. 
+If you have a C compiler and want to use XSUB edition, +type the following (!! "enableXS" must run before "Makefile.PL" !!): -The most preferable one is "The Default Unicode Collation Element Table", -available from the Unicode consortium's website: + perl enableXS + perl Makefile.PL + make + make test + make install - http://www.unicode.org/reports/tr10/allkeys.txt +If you decide to install pure Perl (i.e. non-XS) edition after trying +to build XSUB, type the following: -Though this distribution contains a subset of allkeys.txt, named "keys.txt", -this one is intended only for doing a test of this module -and practically useless for any other purpose. + make clean + perl disableXS + perl Makefile.PL + make + make test + make install DEPENDENCIES diff --git a/lib/Unicode/Collate/t/illegal.t b/lib/Unicode/Collate/t/illegal.t new file mode 100644 index 0000000000..b9961b6981 --- /dev/null +++ b/lib/Unicode/Collate/t/illegal.t @@ -0,0 +1,85 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +use Test; +use strict; +use warnings; + +BEGIN { + use Unicode::Collate; + + unless (exists &Unicode::Collate::bootstrap or 5.008 <= $]) { + print "1..0 # skipped: XSUB, or Perl 5.8.0 or later". + " needed for this test\n"; + print $@; + exit; + } +} + +BEGIN { plan tests => 22 }; + +ok(1); + +######################### + +no warnings 'utf8'; + +# NULL is tailorable but illegal code points are not. +# illegal code points should be always ignored +# (cf. UCA, 7.1.1 Illegal code points). 
+ +my $illeg = Unicode::Collate->new( + entry => <<'ENTRIES', +0000 ; [.0020.0000.0000.0000] # [0000] NULL +0001 ; [.0021.0000.0000.0001] # [0001] START OF HEADING +FFFE ; [.0022.0000.0000.FFFE] # <noncharacter-FFFE> +FFFF ; [.0023.0000.0000.FFFF] # <noncharacter-FFFF> +D800 ; [.0024.0000.0000.D800] # <surrogate-D800> +DFFF ; [.0025.0000.0000.DFFF] # <surrogate-DFFF> +FDD0 ; [.0026.0000.0000.FDD0] # <noncharacter-FDD0> +FDEF ; [.0027.0000.0000.FDEF] # <noncharacter-FDEF> +0002 ; [.0030.0000.0000.0002] # [0002] START OF TEXT +10FFFF; [.0040.0000.0000.10FFFF] # <noncharacter-10FFFF> +110000; [.0041.0000.0000.110000] # <out-of-range 110000> +ENTRIES + level => 1, + table => undef, + normalization => undef, +); + +ok($illeg->lt("", "\x00")); +ok($illeg->lt("", "\x01")); +ok($illeg->eq("", "\x{FFFE}")); +ok($illeg->eq("", "\x{FFFF}")); +ok($illeg->eq("", "\x{D800}")); +ok($illeg->eq("", "\x{DFFF}")); +ok($illeg->eq("", "\x{FDD0}")); +ok($illeg->eq("", "\x{FDEF}")); +ok($illeg->lt("", "\x02")); +ok($illeg->eq("", "\x{10FFFF}")); +ok($illeg->eq("", "\x{110000}")); + +ok($illeg->lt("\x00", "\x01")); +ok($illeg->lt("\x01", "\x02")); +ok($illeg->ne("\0", "\x{D800}")); +ok($illeg->ne("\0", "\x{DFFF}")); +ok($illeg->ne("\0", "\x{FDD0}")); +ok($illeg->ne("\0", "\x{FDEF}")); +ok($illeg->ne("\0", "\x{FFFE}")); +ok($illeg->ne("\0", "\x{FFFF}")); +ok($illeg->ne("\0", "\x{10FFFF}")); +ok($illeg->ne("\0", "\x{110000}")); + diff --git a/lib/Unicode/Collate/t/illegalp.t b/lib/Unicode/Collate/t/illegalp.t new file mode 100644 index 0000000000..690c88d0bb --- /dev/null +++ b/lib/Unicode/Collate/t/illegalp.t @@ -0,0 +1,80 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Collate " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? 
qw(::lib) : qw(../lib); + } +} + +use Test; +BEGIN { plan tests => 17 }; + +use strict; +use warnings; + +ok(1); + +# +# No test for Unicode::Collate is included in this .t file. +# +# UCA conformance test requires completely ignorable characters +# (including noncharacters) must be able to be ordered in code point order; +# If not so, Unicode::Collate must not be compliant with UCA. +# +# ~~~ CollationTest_SHIFTED.txt in CollationTest-4.0.0 +# +# 206F 0021; # ! NOMINAL DIGIT SHAPES [| | | 0251] +# D800 0021; # ! <surrogate-D800> [| | | 0251] +# DFFF 0021; # ! <surrogate-DFFF> [| | | 0251] +# FDD0 0021; # ! <noncharacter-FDD0> [| | | 0251] +# FFFB 0021; # ! INTERLINEAR ANNOTATION TERMINATOR [| | | 0251] +# FFFE 0021; # ! <noncharacter-FFFE> [| | | 0251] +# FFFF 0021; # ! <noncharacter-FFFF> [| | | 0251] +# 1D165 0021; # ! MS. Cm. STEM [| | | 0251] +# +# ~~~ CollationTest_NON_IGNORABLE.txt in CollationTest-4.0.0 +# +# 206F 0021; # ! NOMINAL DIGIT SHAPES [0251 | 0020 | 0002 |] +# D800 0021; # ! <surrogate-D800> [0251 | 0020 | 0002 |] +# DFFF 0021; # ! <surrogate-DFFF> [0251 | 0020 | 0002 |] +# FDD0 0021; # ! <noncharacter-FDD0> [0251 | 0020 | 0002 |] +# FFFB 0021; # ! INTERLINEAR ANNOTATION TERMINATOR [0251 | 0020 | 0002 |] +# FFFE 0021; # ! <noncharacter-FFFE> [0251 | 0020 | 0002 |] +# FFFF 0021; # ! <noncharacter-FFFF> [0251 | 0020 | 0002 |] +# 1D165 0021; # ! MS. Cm. STEM [0251 | 0020 | 0002 |] +# + +no warnings 'utf8'; + +ok("\x{206F}!" lt "\x{D800}!"); +ok(pack('U*', 0x206F, 0x21) lt pack('U*', 0xD800, 0x21)); + +ok("\x{D800}!" lt "\x{DFFF}!"); +ok(pack('U*', 0xD800, 0x21) lt pack('U*', 0xDFFF, 0x21)); + +ok("\x{DFFF}!" lt "\x{FDD0}!"); +ok(pack('U*', 0xDFFF, 0x21) lt pack('U*', 0xFDD0, 0x21) ); + +ok("\x{FDD0}!" lt "\x{FFFB}!"); +ok(pack('U*', 0xFDD0, 0x21) lt pack('U*', 0xFFFB, 0x21)); + +ok("\x{FFFB}!" lt "\x{FFFE}!"); +ok(pack('U*', 0xFFFB, 0x21) lt pack('U*', 0xFFFE, 0x21)); + +ok("\x{FFFE}!" 
lt "\x{FFFF}!"); +ok(pack('U*', 0xFFFE, 0x21) lt pack('U*', 0xFFFF, 0x21)); + +ok("\x{FFFF}!" lt "\x{1D165}!"); +ok(pack('U*', 0xFFFF, 0x21) lt pack('U*', 0x1D165, 0x21)); + +ok("\000!" lt "\x{FFFF}!"); +ok(pack('U*', 0, 0x21) lt pack('U*', 0xFFFF, 0x21)); + |