1 files changed, 64 insertions, 30 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm
index fa19afef4f..2bcc3155d2 100644
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -14,7 +14,7 @@ use File::Spec;
 
 require Exporter;
 
-our $VERSION = '0.26';
+our $VERSION = '0.27';
 our $PACKAGE = __PACKAGE__;
 
 our @ISA = qw(Exporter);
@@ -225,17 +225,18 @@ sub checkCollator {
 	croak "Unicode/Normalize.pm is required to normalize strings: $@"
 	    if $@;
 
-	Unicode::Normalize->import();
 	$getCombinClass = \&Unicode::Normalize::getCombinClass
 	    if ! $getCombinClass;
 
-	$self->{normCode} =
-	    $self->{normalization} =~ /^(?:NF)?C$/  ? \&NFC :
-	    $self->{normalization} =~ /^(?:NF)?D$/  ? \&NFD :
-	    $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
-	    $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
-	  croak "$PACKAGE unknown normalization form name: "
-		. $self->{normalization};
+	my $norm = $self->{normalization};
+	$self->{normCode} = sub {
+		Unicode::Normalize::normalize($norm, shift);
+	    };
+
+	eval { $self->{normCode}->("") }; # try
+	if ($@) {
+	    croak "$PACKAGE unknown normalization form name: $norm";
+	}
     }
     return;
 }
@@ -477,10 +478,13 @@ sub splitCE
 
 	if ($max->{$ce}) { # contract
 	    my $temp_ce = $ce;
+	    my $ceLen = 1;
+	    my $maxLen = $max->{$ce};
 
-	    for (my $p = $i + 1; $p < @src; $p++) {
+	    for (my $p = $i + 1; $ceLen < $maxLen && $p < @src; $p++) {
 		next if ! defined $src[$p];
 		$temp_ce .= CODE_SEP . $src[$p];
+		$ceLen++;
 		if ($ent->{$temp_ce}) {
 		    $ce = $temp_ce;
 		    $i = $p;
@@ -524,8 +528,6 @@ sub getWt
     my $self = shift;
     my $ce   = shift;
     my $ent  = $self->{entries};
-    my $cjk  = $self->{overrideCJK};
-    my $hang = $self->{overrideHangul};
     my $der  = $self->{derivCode};
 
     return if !defined $ce;
@@ -536,18 +538,50 @@ sub getWt
     my $u = $ce;
 
     if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
-	return map $self->altCE($_),
-	    $hang
-		? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u))
-		: defined $hang
-		    ? map({
-			    $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
-			} _decompHangul($u))
-		    : $der->($u);
+	my $hang = $self->{overrideHangul};
+	my @hangulCE;
+	if ($hang) {
+	    @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
+	}
+	elsif (!defined $hang) {
+	    @hangulCE = $der->($u);
+	}
+	else {
+	    my $max  = $self->{maxlength};
+	    my @decH = _decompHangul($u);
+
+	    if (@decH == 2) {
+		my $contract = join(CODE_SEP, @decH);
+		@decH = ($contract) if $ent->{$contract};
+	    } else { # must be <@decH == 3>
+		if ($max->{$decH[0]}) {
+		    my $contract = join(CODE_SEP, @decH);
+		    if ($ent->{$contract}) {
+			@decH = ($contract);
+		    } else {
+			$contract = join(CODE_SEP, @decH[0,1]);
+		        $ent->{$contract} and @decH = ($contract, $decH[2]);
+		    }
+		    # even if V's ignorable, LT contraction is not supported.
+		    # If such a situatution were required, NFD should be used.
+		}
+		if (@decH == 3 && $max->{$decH[1]}) {
+		    my $contract = join(CODE_SEP, @decH[1,2]);
+		    $ent->{$contract} and @decH = ($decH[0], $contract);
+		}
+	    }
+
+	    @hangulCE = map({
+		    $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
+		} @decH);
+	}
+	return map $self->altCE($_), @hangulCE;
     }
     elsif (0x3400 <= $u && $u <= 0x4DB5 ||
 	   0x4E00 <= $u && $u <= 0x9FA5 ||
-	   0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph
+	   0x20000 <= $u && $u <= 0x2A6D6) # CJK Ideograph
+    {
+	my $cjk  = $self->{overrideCJK};
 	return map $self->altCE($_),
 	    $cjk
 		? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
@@ -1092,14 +1126,12 @@ If omitted, the maximum is the 4th.
 If specified, strings are normalized before preparation of sort keys
 (the normalization is executed after preprocess).
 
-As a form name, one of the following names must be used.
+A form name C<Unicode::Normalize::normalize()> accepts will be applied
+as C<$normalization_form>.
+See C<Unicode::Normalize::normalize()> for detail.
+If omitted, C<'NFD'> is used.
 
-  'C'  or 'NFC'  for Normalization Form C
-  'D'  or 'NFD'  for Normalization Form D
-  'KC' or 'NFKC' for Normalization Form KC
-  'KD' or 'NFKD' for Normalization Form KD
-
-If omitted, the string is put into Normalization Form D.
+L<normalization> is performed after L<preprocess> (if defined).
 
 If C<undef> is passed explicitly as the value for this key,
 any normalization is not carried out (this may make tailoring easier
@@ -1169,9 +1201,11 @@ Then, "the pen" is before "a pencil".
      preprocess => sub {
            my $str = shift;
            $str =~ s/\b(?:an?|the)\s+//gi;
-           $str;
+           return $str;
         },
 
+L<preprocess> is performed before L<normalization> (if defined).
+
 =item rearrange
 
 -- see 3.1.3 Rearrangement, UTS #10.
@@ -1505,7 +1539,7 @@ B<Unicode::Normalize is required to try The Conformance Test.>
 
 =head1 AUTHOR
 
-SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
+SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
 
   http://homepage1.nifty.com/nomenclator/perl/