summaryrefslogtreecommitdiff
path: root/cpan/Unicode-Collate/Collate.pm
diff options
context:
space:
mode:
Diffstat (limited to 'cpan/Unicode-Collate/Collate.pm')
-rw-r--r--cpan/Unicode-Collate/Collate.pm164
1 files changed, 113 insertions, 51 deletions
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm
index 97abf1e683..e7c5513354 100644
--- a/cpan/Unicode-Collate/Collate.pm
+++ b/cpan/Unicode-Collate/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
no warnings 'utf8';
-our $VERSION = '0.53';
+our $VERSION = '0.55';
our $PACKAGE = __PACKAGE__;
my @Path = qw(Unicode Collate);
@@ -71,36 +71,49 @@ use constant NON_VAR => 0; # Non-Variable character
use constant VAR => 1; # Variable character
# specific code points
+use constant Hangul_SBase => 0xAC00;
+use constant Hangul_SIni => 0xAC00;
+use constant Hangul_SFin => 0xD7A3;
+use constant Hangul_NCount => 588;
+use constant Hangul_TCount => 28;
use constant Hangul_LBase => 0x1100;
use constant Hangul_LIni => 0x1100;
use constant Hangul_LFin => 0x1159;
use constant Hangul_LFill => 0x115F;
+use constant Hangul_LEnd => 0x115F; # Unicode 5.2.0
use constant Hangul_VBase => 0x1161;
use constant Hangul_VIni => 0x1160; # from Vowel Filler
use constant Hangul_VFin => 0x11A2;
+use constant Hangul_VEnd => 0x11A7; # Unicode 5.2.0
use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
use constant Hangul_TIni => 0x11A8;
use constant Hangul_TFin => 0x11F9;
-use constant Hangul_TCount => 28;
-use constant Hangul_NCount => 588;
-use constant Hangul_SBase => 0xAC00;
-use constant Hangul_SIni => 0xAC00;
-use constant Hangul_SFin => 0xD7A3;
+use constant Hangul_TEnd => 0x11FF; # Unicode 5.2.0
+use constant HangulL2Ini => 0xA960; # Unicode 5.2.0
+use constant HangulL2Fin => 0xA97C; # Unicode 5.2.0
+use constant HangulV2Ini => 0xD7B0; # Unicode 5.2.0
+use constant HangulV2Fin => 0xD7C6; # Unicode 5.2.0
+use constant HangulT2Ini => 0xD7CB; # Unicode 5.2.0
+use constant HangulT2Fin => 0xD7FB; # Unicode 5.2.0
+
use constant CJK_UidIni => 0x4E00;
use constant CJK_UidFin => 0x9FA5;
use constant CJK_UidF41 => 0x9FBB;
use constant CJK_UidF51 => 0x9FC3;
+use constant CJK_UidF52 => 0x9FCB;
use constant CJK_ExtAIni => 0x3400;
use constant CJK_ExtAFin => 0x4DB5;
use constant CJK_ExtBIni => 0x20000;
use constant CJK_ExtBFin => 0x2A6D6;
+use constant CJK_ExtCIni => 0x2A700; # Unicode 5.2.0
+use constant CJK_ExtCFin => 0x2B734; # Unicode 5.2.0
# Logical_Order_Exception in PropList.txt
my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
-sub UCA_Version { "18" }
+sub UCA_Version { "20" }
-sub Base_Unicode_Version { "5.1.0" }
+sub Base_Unicode_Version { "5.2.0" }
######
@@ -130,8 +143,7 @@ our @ChangeNG = qw/
entry mapping table maxlength
ignoreChar ignoreName undefChar undefName variableTable
versionTable alternateTable backwardsTable forwardsTable rearrangeTable
- derivCode normCode rearrangeHash
- backwardsFlag
+ derivCode normCode rearrangeHash backwardsFlag
/;
# The hash key 'ignored' is deleted at v 0.21.
# The hash key 'isShift' is deleted at v 0.23.
@@ -190,6 +202,7 @@ my %DerivCode = (
14 => \&_derivCE_14,
16 => \&_derivCE_14, # 16 == 14
18 => \&_derivCE_18,
+ 20 => \&_derivCE_20,
);
sub checkCollator {
@@ -293,6 +306,30 @@ sub new
return $self;
}
+sub parseAtmark {
+ my $self = shift;
+ my $line = shift; # after s/^\s*\@//
+
+ if ($line =~ /^version\s*(\S*)/) {
+ $self->{versionTable} ||= $1;
+ }
+ elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
+ $self->{variableTable} ||= $1;
+ }
+ elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
+ $self->{alternateTable} ||= $1;
+ }
+ elsif ($line =~ /^backwards\s+(\S*)/) {
+ push @{ $self->{backwardsTable} }, $1;
+ }
+ elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
+ push @{ $self->{forwardsTable} }, $1;
+ }
+ elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
+ push @{ $self->{rearrangeTable} }, _getHexArray($1);
+ }
+}
+
sub read_table {
my $self = shift;
@@ -309,29 +346,11 @@ sub read_table {
while (my $line = <$fh>) {
next if $line =~ /^\s*#/;
- unless ($line =~ s/^\s*\@//) {
- $self->parseEntry($line);
- next;
- }
- # matched ^\s*\@
- if ($line =~ /^version\s*(\S*)/) {
- $self->{versionTable} ||= $1;
- }
- elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
- $self->{variableTable} ||= $1;
- }
- elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
- $self->{alternateTable} ||= $1;
- }
- elsif ($line =~ /^backwards\s+(\S*)/) {
- push @{ $self->{backwardsTable} }, $1;
- }
- elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
- push @{ $self->{forwardsTable} }, $1;
- }
- elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
- push @{ $self->{rearrangeTable} }, _getHexArray($1);
+ if ($line =~ s/^\s*\@//) {
+ $self->parseAtmark($line);
+ } else {
+ $self->parseEntry($line);
}
}
close $fh;
@@ -651,8 +670,8 @@ sub getSortKey
my $self = shift;
my $lev = $self->{level};
my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
- my $v2i = $self->{UCA_Version} >= 9 &&
- $self->{variable} ne 'non-ignorable';
+ my $vers = $self->{UCA_Version};
+ my $v2i = $vers >= 9 && $self->{variable} ne 'non-ignorable';
my @buf; # weight arrays
if ($self->{hangul_terminator}) {
@@ -661,7 +680,7 @@ sub getSortKey
# weird things like VL, TL-contraction are not considered!
my $curHST = '';
foreach my $u (split /;/, $jcps) {
- $curHST .= getHST($u);
+ $curHST .= getHST($u, $vers);
}
if ($preHST && !$curHST || # hangul before non-hangul
$preHST =~ /L\z/ && $curHST =~ /^T/ ||
@@ -758,6 +777,19 @@ sub sort {
}
+sub _derivCE_20 {
+ my $u = shift;
+ my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52) ? 0xFB40 : # CJK
+ (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
+ CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
+ CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) ? 0xFB80 # CJK ext.
+ : 0xFBC0; # others
+ my $aaaa = $base + ($u >> 15);
+ my $bbbb = ($u & 0x7FFF) | 0x8000;
+ return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
+ pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
+}
+
sub _derivCE_18 {
my $u = shift;
my $base = (CJK_UidIni <= $u && $u <= CJK_UidF51) ? 0xFB40 : # CJK
@@ -811,6 +843,7 @@ sub _uideoCE_8 {
sub _isUIdeo {
my ($u, $uca_vers) = @_;
return((CJK_UidIni <= $u && (
+ $uca_vers >= 20 ? ($u <= CJK_UidF52) :
$uca_vers >= 18 ? ($u <= CJK_UidF51) :
$uca_vers >= 14 ? ($u <= CJK_UidF41) :
($u <= CJK_UidFin)))
@@ -818,6 +851,9 @@ sub _isUIdeo {
(CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
||
(CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
+ ||
+ ($uca_vers >= 20 &&
+ CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
);
}
@@ -864,12 +900,25 @@ sub _isIllegal {
# Hangul Syllable Type
sub getHST {
my $u = shift;
- return
- Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
- Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
- Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
- Hangul_SIni <= $u && $u <= Hangul_SFin ?
- ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
+ my $vers = shift || 0;
+
+ if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
+ return +($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV";
+ }
+
+ if ($vers < 20) {
+ return Hangul_LIni <= $u && $u <= Hangul_LFin ||
+ $u == Hangul_LFill ? "L" :
+ Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
+ Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" : "";
+ } else {
+ return Hangul_LIni <= $u && $u <= Hangul_LEnd ||
+ HangulL2Ini <= $u && $u <= HangulL2Fin ? "L" :
+ Hangul_VIni <= $u && $u <= Hangul_VEnd ||
+ HangulV2Ini <= $u && $u <= HangulV2Fin ? "V" :
+ Hangul_TIni <= $u && $u <= Hangul_TEnd ||
+ HangulT2Ini <= $u && $u <= HangulT2Fin ? "T" : "";
+ }
}
@@ -1178,7 +1227,7 @@ behavior of that tracking version is emulated on collating.
If omitted, the return value of C<UCA_Version()> is used.
C<UCA_Version()> should return the latest tracking version supported.
-The supported tracking version: 8, 9, 11, 14, 16 or 18.
+The supported tracking version: 8, 9, 11, 14, 16, 18 or 20.
UCA Unicode Standard DUCET (@version)
---------------------------------------------------
@@ -1188,6 +1237,7 @@ The supported tracking version: 8, 9, 11, 14, 16 or 18.
14 4.1.0 4.1.0 (4.1.0)
16 5.0 5.0.0 (5.0.0)
18 5.1.0 5.1.0 (5.1.0)
+ 20 5.2.0 5.2.0 (5.2.0)
Note: Recent UTS #10 renames "Tracking Version" to "Revision."
@@ -1375,12 +1425,18 @@ B<Unicode::Normalize> is required (see also B<CAVEAT>).
-- see 7.1 Derived Collation Elements, UTS #10.
By default, CJK Unified Ideographs are ordered in Unicode codepoint
-order but C<CJK Unified Ideographs>
-(if C<UCA_Version> is 8 to 11, its range is C<U+4E00..U+9FA5>;
-if C<UCA_Version> is 14 to 16, its range is C<U+4E00..U+9FBB>;
-if C<UCA_Version> is 18, its range is C<U+4E00..U+9FC3>)
-are lesser than C<CJK Unified Ideographs Extension>
-(its range is C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
+order but C<CJK Unified Ideographs> are lesser than
+C<CJK Unified Ideographs Extension>.
+
+ CJK Unified Ideographs:
+ U+4E00..U+9FA5 if UCA_Version is 8 to 11;
+ U+4E00..U+9FBB if UCA_Version is 14 to 16;
+ U+4E00..U+9FC3 if UCA_Version is 18;
+ U+4E00..U+9FCB if UCA_Version> is 20.
+
+ CJK Unified Ideographs Extension:
+ Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) if UCA_Version < 20;
+ Ext.A, Ext.B and Ext.C (U+2A700..U+2B734) if UCA_Version is 20.
Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
@@ -1476,6 +1532,12 @@ By default, F<allkeys.txt> (as the filename of DUCET) is used.
If you will prepare your own table file, any name other than F<allkeys.txt>
may be better to avoid namespace conflict.
+B<NOTE>: When XSUB is used, the DUCET is compiled on building this
+module, and it may save time at the run time.
+Explicit saying C<table =E<gt> 'allkeys.txt'> (or using another table),
+or using C<ignoreChar>, C<ignoreName>, C<undefChar>, or C<undefName>
+will prevent this module using the compiled DUCET.
+
If C<undef> is passed explicitly as the value for this key,
no file is read (but you can define collation elements via C<entry>).
@@ -1851,9 +1913,9 @@ SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
-The file Unicode/Collate/allkeys.txt was copied directly
-from L<http://www.unicode.org/Public/UCA/5.1.0/allkeys.txt>.
-This file is Copyright (c) 1991-2008 Unicode, Inc. All rights reserved.
+The file Unicode/Collate/allkeys.txt was copied verbatim
+from L<http://www.unicode.org/Public/UCA/5.2.0/allkeys.txt>.
+This file is Copyright (c) 1991-2009 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
=head1 SEE ALSO