1 files changed, 113 insertions, 51 deletions
diff --git a/cpan/Unicode-Collate/Collate.pm b/cpan/Unicode-Collate/Collate.pm
index 97abf1e683..e7c5513354 100644
--- a/cpan/Unicode-Collate/Collate.pm
+++ b/cpan/Unicode-Collate/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
 
 no warnings 'utf8';
 
-our $VERSION = '0.53';
+our $VERSION = '0.55';
 our $PACKAGE = __PACKAGE__;
 
 my @Path = qw(Unicode Collate);
@@ -71,36 +71,49 @@ use constant NON_VAR => 0; # Non-Variable character
 use constant VAR     => 1; # Variable character
 
 # specific code points
+use constant Hangul_SBase  => 0xAC00;
+use constant Hangul_SIni   => 0xAC00;
+use constant Hangul_SFin   => 0xD7A3;
+use constant Hangul_NCount => 588;
+use constant Hangul_TCount => 28;
 use constant Hangul_LBase  => 0x1100;
 use constant Hangul_LIni   => 0x1100;
 use constant Hangul_LFin   => 0x1159;
 use constant Hangul_LFill  => 0x115F;
+use constant Hangul_LEnd   => 0x115F; # Unicode 5.2.0
 use constant Hangul_VBase  => 0x1161;
 use constant Hangul_VIni   => 0x1160; # from Vowel Filler
 use constant Hangul_VFin   => 0x11A2;
+use constant Hangul_VEnd   => 0x11A7; # Unicode 5.2.0
 use constant Hangul_TBase  => 0x11A7; # from "no-final" codepoint
 use constant Hangul_TIni   => 0x11A8;
 use constant Hangul_TFin   => 0x11F9;
-use constant Hangul_TCount => 28;
-use constant Hangul_NCount => 588;
-use constant Hangul_SBase  => 0xAC00;
-use constant Hangul_SIni   => 0xAC00;
-use constant Hangul_SFin   => 0xD7A3;
+use constant Hangul_TEnd   => 0x11FF; # Unicode 5.2.0
+use constant HangulL2Ini   => 0xA960; # Unicode 5.2.0
+use constant HangulL2Fin   => 0xA97C; # Unicode 5.2.0
+use constant HangulV2Ini   => 0xD7B0; # Unicode 5.2.0
+use constant HangulV2Fin   => 0xD7C6; # Unicode 5.2.0
+use constant HangulT2Ini   => 0xD7CB; # Unicode 5.2.0
+use constant HangulT2Fin   => 0xD7FB; # Unicode 5.2.0
+
 use constant CJK_UidIni    => 0x4E00;
 use constant CJK_UidFin    => 0x9FA5;
 use constant CJK_UidF41    => 0x9FBB;
 use constant CJK_UidF51    => 0x9FC3;
+use constant CJK_UidF52    => 0x9FCB;
 use constant CJK_ExtAIni   => 0x3400;
 use constant CJK_ExtAFin   => 0x4DB5;
 use constant CJK_ExtBIni   => 0x20000;
 use constant CJK_ExtBFin   => 0x2A6D6;
+use constant CJK_ExtCIni   => 0x2A700; # Unicode 5.2.0
+use constant CJK_ExtCFin   => 0x2B734; # Unicode 5.2.0
 
 # Logical_Order_Exception in PropList.txt
 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
 
-sub UCA_Version { "18" }
+sub UCA_Version { "20" }
 
-sub Base_Unicode_Version { "5.1.0" }
+sub Base_Unicode_Version { "5.2.0" }
 
 ######
 
@@ -130,8 +143,7 @@ our @ChangeNG = qw/
     entry mapping table maxlength
     ignoreChar ignoreName undefChar undefName variableTable
     versionTable alternateTable backwardsTable forwardsTable rearrangeTable
-    derivCode normCode rearrangeHash
-    backwardsFlag
+    derivCode normCode rearrangeHash backwardsFlag
   /;
 # The hash key 'ignored' is deleted at v 0.21.
 # The hash key 'isShift' is deleted at v 0.23.
@@ -190,6 +202,7 @@ my %DerivCode = (
    14 => \&_derivCE_14,
    16 => \&_derivCE_14, # 16 == 14
    18 => \&_derivCE_18,
+   20 => \&_derivCE_20,
 );
 
 sub checkCollator {
@@ -293,6 +306,30 @@ sub new
     return $self;
 }
 
+sub parseAtmark {
+    my $self = shift;
+    my $line = shift; # after s/^\s*\@//
+
+    if ($line =~ /^version\s*(\S*)/) {
+	$self->{versionTable} ||= $1;
+    }
+    elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
+	$self->{variableTable} ||= $1;
+    }
+    elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
+	$self->{alternateTable} ||= $1;
+    }
+    elsif ($line =~ /^backwards\s+(\S*)/) {
+	push @{ $self->{backwardsTable} }, $1;
+    }
+    elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
+	push @{ $self->{forwardsTable} }, $1;
+    }
+    elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
+	push @{ $self->{rearrangeTable} }, _getHexArray($1);
+    }
+}
+
 sub read_table {
     my $self = shift;
 
@@ -309,29 +346,11 @@ sub read_table {
 
     while (my $line = <$fh>) {
 	next if $line =~ /^\s*#/;
-	unless ($line =~ s/^\s*\@//) {
-	    $self->parseEntry($line);
-	    next;
-	}
 
-	# matched ^\s*\@
-	if ($line =~ /^version\s*(\S*)/) {
-	    $self->{versionTable} ||= $1;
-	}
-	elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
-	    $self->{variableTable} ||= $1;
-	}
-	elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
-	    $self->{alternateTable} ||= $1;
-	}
-	elsif ($line =~ /^backwards\s+(\S*)/) {
-	    push @{ $self->{backwardsTable} }, $1;
-	}
-	elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
-	    push @{ $self->{forwardsTable} }, $1;
-	}
-	elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
-	    push @{ $self->{rearrangeTable} }, _getHexArray($1);
+	if ($line =~ s/^\s*\@//) {
+	    $self->parseAtmark($line);
+	} else {
+	    $self->parseEntry($line);
 	}
     }
     close $fh;
@@ -651,8 +670,8 @@ sub getSortKey
     my $self = shift;
     my $lev  = $self->{level};
     my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
-    my $v2i  = $self->{UCA_Version} >= 9 &&
-		$self->{variable} ne 'non-ignorable';
+    my $vers = $self->{UCA_Version};
+    my $v2i  = $vers >= 9 && $self->{variable} ne 'non-ignorable';
 
     my @buf; # weight arrays
     if ($self->{hangul_terminator}) {
@@ -661,7 +680,7 @@ sub getSortKey
 	    # weird things like VL, TL-contraction are not considered!
 	    my $curHST = '';
 	    foreach my $u (split /;/, $jcps) {
-		$curHST .= getHST($u);
+		$curHST .= getHST($u, $vers);
 	    }
 	    if ($preHST && !$curHST || # hangul before non-hangul
 		$preHST =~ /L\z/ && $curHST =~ /^T/ ||
@@ -758,6 +777,19 @@ sub sort {
 }
 
 
+sub _derivCE_20 {
+    my $u = shift;
+    my $base = (CJK_UidIni  <= $u && $u <= CJK_UidF52) ? 0xFB40 : # CJK
+	       (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
+		CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
+		CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) ? 0xFB80  # CJK ext.
+							: 0xFBC0; # others
+    my $aaaa = $base + ($u >> 15);
+    my $bbbb = ($u & 0x7FFF) | 0x8000;
+    return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
+	   pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
+}
+
 sub _derivCE_18 {
     my $u = shift;
     my $base = (CJK_UidIni  <= $u && $u <= CJK_UidF51) ? 0xFB40 : # CJK
@@ -811,6 +843,7 @@ sub _uideoCE_8 {
 sub _isUIdeo {
     my ($u, $uca_vers) = @_;
     return((CJK_UidIni <= $u && (
+	    $uca_vers >= 20 ? ($u <= CJK_UidF52) :
 	    $uca_vers >= 18 ? ($u <= CJK_UidF51) :
 	    $uca_vers >= 14 ? ($u <= CJK_UidF41) :
 			      ($u <= CJK_UidFin)))
@@ -818,6 +851,9 @@ sub _isUIdeo {
 	(CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
 		||
 	(CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
+		||
+	($uca_vers >= 20 &&
+	 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
     );
 }
 
@@ -864,12 +900,25 @@ sub _isIllegal {
 # Hangul Syllable Type
 sub getHST {
     my $u = shift;
-    return
-	Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
-	Hangul_VIni <= $u && $u <= Hangul_VFin	     ? "V" :
-	Hangul_TIni <= $u && $u <= Hangul_TFin	     ? "T" :
-	Hangul_SIni <= $u && $u <= Hangul_SFin ?
-	    ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
+    my $vers = shift || 0;
+
+    if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
+	return +($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV";
+    }
+
+    if ($vers < 20) {
+	return Hangul_LIni <= $u && $u <= Hangul_LFin ||
+			            $u == Hangul_LFill ? "L" :
+	       Hangul_VIni <= $u && $u <= Hangul_VFin  ? "V" :
+	       Hangul_TIni <= $u && $u <= Hangul_TFin  ? "T" : "";
+    } else {
+	return Hangul_LIni <= $u && $u <= Hangul_LEnd ||
+	       HangulL2Ini <= $u && $u <= HangulL2Fin  ? "L" :
+	       Hangul_VIni <= $u && $u <= Hangul_VEnd ||
+	       HangulV2Ini <= $u && $u <= HangulV2Fin  ? "V" :
+	       Hangul_TIni <= $u && $u <= Hangul_TEnd ||
+	       HangulT2Ini <= $u && $u <= HangulT2Fin  ? "T" : "";
+    }
 }
 
 
@@ -1178,7 +1227,7 @@ behavior of that tracking version is emulated on collating.
 If omitted, the return value of C<UCA_Version()> is used.
 C<UCA_Version()> should return the latest tracking version supported.
 
-The supported tracking version: 8, 9, 11, 14, 16 or 18.
+The supported tracking version: 8, 9, 11, 14, 16, 18 or 20.
 
      UCA       Unicode Standard         DUCET (@version)
      ---------------------------------------------------
@@ -1188,6 +1237,7 @@ The supported tracking version: 8, 9, 11, 14, 16 or 18.
      14             4.1.0               4.1.0 (4.1.0)
      16              5.0                5.0.0 (5.0.0)
      18             5.1.0               5.1.0 (5.1.0)
+     20             5.2.0               5.2.0 (5.2.0)
 
 Note: Recent UTS #10 renames "Tracking Version" to "Revision."
 
@@ -1375,12 +1425,18 @@ B<Unicode::Normalize> is required (see also B<CAVEAT>).
 -- see 7.1 Derived Collation Elements, UTS #10.
 
 By default, CJK Unified Ideographs are ordered in Unicode codepoint
-order but C<CJK Unified Ideographs>
-(if C<UCA_Version> is 8 to 11, its range is C<U+4E00..U+9FA5>;
-if C<UCA_Version> is 14 to 16, its range is C<U+4E00..U+9FBB>;
-if C<UCA_Version> is 18, its range is C<U+4E00..U+9FC3>)
-are lesser than C<CJK Unified Ideographs Extension>
-(its range is C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
+order but C<CJK Unified Ideographs> are lesser than
+C<CJK Unified Ideographs Extension>.
+
+    CJK Unified Ideographs:
+    U+4E00..U+9FA5 if UCA_Version is 8 to 11;
+    U+4E00..U+9FBB if UCA_Version is 14 to 16;
+    U+4E00..U+9FC3 if UCA_Version is 18;
+    U+4E00..U+9FCB if UCA_Version> is 20.
+
+    CJK Unified Ideographs Extension:
+    Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) if UCA_Version < 20;
+    Ext.A, Ext.B and Ext.C (U+2A700..U+2B734) if UCA_Version is 20.
 
 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
 
@@ -1476,6 +1532,12 @@ By default, F<allkeys.txt> (as the filename of DUCET) is used.
 If you will prepare your own table file, any name other than F<allkeys.txt>
 may be better to avoid namespace conflict.
 
+B<NOTE>: When XSUB is used, the DUCET is compiled on building this
+module, and it may save time at the run time.
+Explicit saying C<table =E<gt> 'allkeys.txt'> (or using another table),
+or using C<ignoreChar>, C<ignoreName>, C<undefChar>, or C<undefName>
+will prevent this module using the compiled DUCET.
+
 If C<undef> is passed explicitly as the value for this key,
 no file is read (but you can define collation elements via C<entry>).
 
@@ -1851,9 +1913,9 @@ SADAHIRO Tomoyuki. Japan. All rights reserved.
 This module is free software; you can redistribute it and/or
 modify it under the same terms as Perl itself.
 
-The file Unicode/Collate/allkeys.txt was copied directly
-from L<http://www.unicode.org/Public/UCA/5.1.0/allkeys.txt>.
-This file is Copyright (c) 1991-2008 Unicode, Inc. All rights reserved.
+The file Unicode/Collate/allkeys.txt was copied verbatim
+from L<http://www.unicode.org/Public/UCA/5.2.0/allkeys.txt>.
+This file is Copyright (c) 1991-2009 Unicode, Inc. All rights reserved.
 Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
 
 =head1 SEE ALSO