summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris 'BinGOs' Williams <chris@bingosnet.co.uk>2010-06-27 19:51:20 +0100
committerChris 'BinGOs' Williams <chris@bingosnet.co.uk>2010-06-27 19:51:20 +0100
commita96160d631b54897af2f2540335c2ca854dcea68 (patch)
tree2cf0106edd8be54b706dbc2ba0df8a0da0cd7129
parent6ad8f254c95c6d4523948ded91d651dcc490dee5 (diff)
downloadperl-a96160d631b54897af2f2540335c2ca854dcea68.tar.gz
Update Unicode-Normalize to CPAN release 1.06
[DELTA] 1.06 Thu Feb 11 16:19:54 2010 - Pure Perl/mkheader: fixed the internal _getHexArray() for perl 5.11.3 changes (Bug #53197). 1.05 Mon Sep 28 21:43:17 2009 - normalize_partial() and NFX_partial(). { NFX =~ /^NFK?[CD]\z/ } - added partial1.t for NFX_partial(). - added partial2.t for normalize_partial(). 1.04 Wed Sep 23 22:32:57 2009 - doc: splitOnLastStarter() since 0.24 is now documented. - test: some new tests are added to split.t.
-rw-r--r--MANIFEST2
-rwxr-xr-xPorting/Maintainers.pl2
-rw-r--r--cpan/Unicode-Normalize/Changes16
-rw-r--r--cpan/Unicode-Normalize/Normalize.pm135
-rw-r--r--cpan/Unicode-Normalize/README4
-rw-r--r--cpan/Unicode-Normalize/mkheader143
-rw-r--r--cpan/Unicode-Normalize/t/partial1.t104
-rw-r--r--cpan/Unicode-Normalize/t/partial2.t100
-rw-r--r--cpan/Unicode-Normalize/t/split.t68
-rw-r--r--cpan/Unicode-Normalize/t/test.t18
10 files changed, 490 insertions, 102 deletions
diff --git a/MANIFEST b/MANIFEST
index bc386928a0..9a7f322000 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -2512,6 +2512,8 @@ cpan/Unicode-Normalize/t/form.t Unicode::Normalize
cpan/Unicode-Normalize/t/func.t Unicode::Normalize
cpan/Unicode-Normalize/t/illegal.t Unicode::Normalize
cpan/Unicode-Normalize/t/norm.t Unicode::Normalize
+cpan/Unicode-Normalize/t/partial1.t Unicode::Normalize
+cpan/Unicode-Normalize/t/partial2.t Unicode::Normalize
cpan/Unicode-Normalize/t/null.t Unicode::Normalize
cpan/Unicode-Normalize/t/proto.t Unicode::Normalize
cpan/Unicode-Normalize/t/split.t Unicode::Normalize
diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl
index e8d674d35e..18e6995319 100755
--- a/Porting/Maintainers.pl
+++ b/Porting/Maintainers.pl
@@ -1501,7 +1501,7 @@ use File::Glob qw(:case);
'Unicode::Normalize' =>
{
'MAINTAINER' => 'sadahiro',
- 'DISTRIBUTION' => 'SADAHIRO/Unicode-Normalize-1.03.tar.gz',
+ 'DISTRIBUTION' => 'SADAHIRO/Unicode-Normalize-1.06-withoutworldwriteables.tar.gz',
'FILES' => q[cpan/Unicode-Normalize],
'EXCLUDED' => [ qw{MANIFEST.N Normalize.pmN disableXS enableXS }],
'UPSTREAM' => 'first-come',
diff --git a/cpan/Unicode-Normalize/Changes b/cpan/Unicode-Normalize/Changes
index e9cb3918a5..433edb4db7 100644
--- a/cpan/Unicode-Normalize/Changes
+++ b/cpan/Unicode-Normalize/Changes
@@ -1,5 +1,18 @@
Revision history for Perl extension Unicode::Normalize.
+1.06 Thu Feb 11 16:19:54 2010
+ - Pure Perl/mkheader: fixed the internal _getHexArray() for perl 5.11.3
+ changes (Bug #53197).
+
+1.05 Mon Sep 28 21:43:17 2009
+ - normalize_partial() and NFX_partial(). { NFX =~ /^NFK?[CD]\z/ }
+ - added partial1.t for NFX_partial().
+ - added partial2.t for normalize_partial().
+
+1.04 Wed Sep 23 22:32:57 2009
+ - doc: splitOnLastStarter() since 0.24 is now documented.
+ - test: some new tests are added to split.t.
+
1.03 Sun Mar 29 12:56:23 2009
- mkheader: check if no composition needs growing the string.
- Makefile.PL: a tweak
@@ -69,7 +82,8 @@ Revision history for Perl extension Unicode::Normalize.
script files, named "enableXS" and "disableXS".
(no longer <perl Makefile.PL xs> and <perl Makefile.PL noxs>.)
* simplified Makefile.PL.
- - added fcdc.t and split.t.
+ - added fcdc.t for FCD() and FCC().
+ - added split.t for splitOnLastStarter(): an undocumented function.
0.23 Sat Jun 28 20:38:10 2003
- bug fix: \0-terminate in compose() in XS.
diff --git a/cpan/Unicode-Normalize/Normalize.pm b/cpan/Unicode-Normalize/Normalize.pm
index ad5ff82a83..00be139070 100644
--- a/cpan/Unicode-Normalize/Normalize.pm
+++ b/cpan/Unicode-Normalize/Normalize.pm
@@ -13,13 +13,9 @@ use Carp;
no warnings 'utf8';
-our $VERSION = '1.03';
+our $VERSION = '1.06';
our $PACKAGE = __PACKAGE__;
-require Exporter;
-require DynaLoader;
-
-our @ISA = qw(Exporter DynaLoader);
our @EXPORT = qw( NFC NFD NFKC NFKD );
our @EXPORT_OK = qw(
normalize decompose reorder compose
@@ -27,8 +23,8 @@ our @EXPORT_OK = qw(
getCanon getCompat getComposite getCombinClass
isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
- FCD checkFCD FCC checkFCC composeContiguous
- splitOnLastStarter
+ FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
+ normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
);
our %EXPORT_TAGS = (
all => [ @EXPORT, @EXPORT_OK ],
@@ -37,12 +33,6 @@ our %EXPORT_TAGS = (
fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
);
-######
-
-bootstrap Unicode::Normalize $VERSION;
-
-######
-
##
## utilites for tests
##
@@ -55,9 +45,18 @@ sub unpack_U {
return unpack('U*', shift(@_).pack('U*'));
}
+require Exporter;
+
+##### The above part is common to XS and PP #####
+
+our @ISA = qw(Exporter DynaLoader);
+require DynaLoader;
+bootstrap Unicode::Normalize $VERSION;
+
+##### The below part is common to XS and PP #####
##
-## normalization forms
+## normalize
##
sub FCD ($) {
@@ -83,9 +82,27 @@ sub normalize($$)
croak($PACKAGE."::normalize: invalid form name: $form");
}
+##
+## partial
+##
+
+sub normalize_partial ($$) {
+ if (exists $formNorm{$_[0]}) {
+ my $n = normalize($_[0], $_[1]);
+ my($p, $u) = splitOnLastStarter($n);
+ $_[1] = $u;
+ return $p;
+ }
+ croak($PACKAGE."::normalize_partial: invalid form name: $_[0]");
+}
+
+sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) }
+sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) }
+sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) }
+sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) }
##
-## quick check
+## check
##
our %formCheck = (
@@ -239,6 +256,82 @@ you can get its NFC/NFKC string, by saying
$NFC_string = compose($NFD_string);
$NFKC_string = compose($NFKD_string);
+=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>
+
+It returns two strings: the first one, C<$processed>, is a part
+before the last starter, and the second one, C<$unprocessed> is
+another part after the first part. A starter is a character having
+a combining class of zero (see UAX #15).
+
+Note that C<$processed> may be empty (when C<$normalized> contains no
+starter or starts with the last starter), and then C<$unprocessed>
+should be equal to the entire C<$normalized>.
+
+When you have a C<$normalized> string and an C<$unnormalized> string
+following it, a simple concatenation is wrong:
+
+ $concat = $normalized . normalize($form, $unnormalized); # wrong!
+
+Instead of it, do like this:
+
+ ($processed, $unprocessed) = splitOnLastStarter($normalized);
+ $concat = $processed . normalize($form, $unprocessed.$unnormalized);
+
+C<splitOnLastStarter()> should be called with a pre-normalized parameter
+C<$normalized>, that is in the same form as C<$form> you want.
+
+If you have an array of C<@string> that should be concatenated and then
+normalized, you can do like this:
+
+ my $result = "";
+ my $unproc = "";
+ foreach my $str (@string) {
+ $unproc .= $str;
+ my $n = normalize($form, $unproc);
+ my($p, $u) = splitOnLastStarter($n);
+ $result .= $p;
+ $unproc = $u;
+ }
+ $result .= $unproc;
+ # instead of normalize($form, join('', @string))
+
+=item C<$processed = normalize_partial($form, $unprocessed)>
+
+A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
+Note that C<$unprocessed> will be modified as a side-effect.
+
+If you have an array of C<@string> that should be concatenated and then
+normalized, you can do like this:
+
+ my $result = "";
+ my $unproc = "";
+ foreach my $str (@string) {
+ $unproc .= $str;
+ $result .= normalize_partial($form, $unproc);
+ }
+ $result .= $unproc;
+ # instead of normalize($form, join('', @string))
+
+=item C<$processed = NFD_partial($unprocessed)>
+
+It does like C<normalize_partial('NFD', $unprocessed)>.
+Note that C<$unprocessed> will be modified as a side-effect.
+
+=item C<$processed = NFC_partial($unprocessed)>
+
+It does like C<normalize_partial('NFC', $unprocessed)>.
+Note that C<$unprocessed> will be modified as a side-effect.
+
+=item C<$processed = NFKD_partial($unprocessed)>
+
+It does like C<normalize_partial('NFKD', $unprocessed)>.
+Note that C<$unprocessed> will be modified as a side-effect.
+
+=item C<$processed = NFKC_partial($unprocessed)>
+
+It does like C<normalize_partial('NFKC', $unprocessed)>.
+Note that C<$unprocessed> will be modified as a side-effect.
+
=back
=head2 Quick Check
@@ -321,15 +414,15 @@ while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
If you want to check exactly, compare the string with its NFC/NFKC/FCC.
if ($string eq NFC($string)) {
- # $string is exactly normalized in NFC;
+ # $string is exactly normalized in NFC;
} else {
- # $string is not normalized in NFC;
+ # $string is not normalized in NFC;
}
if ($string eq NFKC($string)) {
- # $string is exactly normalized in NFKC;
+ # $string is exactly normalized in NFKC;
} else {
- # $string is not normalized in NFKC;
+ # $string is not normalized in NFKC;
}
=head2 Character Data
@@ -454,7 +547,7 @@ normalization implemented by this module depends on your perl's version.
5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0)
5.8.7-5.8.8 4.1.0
5.10.0 5.0.0
- 5.8.9 5.1.0
+ 5.8.9, 5.10.1 5.1.0
=item Correction of decomposition mapping
@@ -482,7 +575,7 @@ lower than 4.1.0.
SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
-Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved.
+Copyright(C) 2001-2010, SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.
diff --git a/cpan/Unicode-Normalize/README b/cpan/Unicode-Normalize/README
index 8a5390cc78..06c9d0c83f 100644
--- a/cpan/Unicode-Normalize/README
+++ b/cpan/Unicode-Normalize/README
@@ -1,4 +1,4 @@
-Unicode/Normalize version 1.03
+Unicode/Normalize version 1.06
===================================
Unicode::Normalize - Unicode Normalization Forms
@@ -83,7 +83,7 @@ COPYRIGHT AND LICENSE
SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
-Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved.
+Copyright(C) 2001-2010, SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
diff --git a/cpan/Unicode-Normalize/mkheader b/cpan/Unicode-Normalize/mkheader
index b3e3c3153c..b6d153c552 100644
--- a/cpan/Unicode-Normalize/mkheader
+++ b/cpan/Unicode-Normalize/mkheader
@@ -32,30 +32,37 @@ BEGIN {
our $PACKAGE = 'Unicode::Normalize, mkheader';
-our $Combin = do "unicore/CombiningClass.pl"
- || do "unicode/CombiningClass.pl"
- || croak "$PACKAGE: CombiningClass.pl not found";
+our $prefix = "UNF_";
+our $structname = "${prefix}complist";
-our $Decomp = do "unicore/Decomposition.pl"
- || do "unicode/Decomposition.pl"
- || croak "$PACKAGE: Decomposition.pl not found";
+sub pack_U {
+ return pack('U*', @_);
+}
+
+# %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify()
+our %Comp1st; # $codepoint => $listname : may be composed with a next char.
+our %CompList; # $listname,$2nd => $codepoint : composite
+
+##### The below part is common to mkheader and PP #####
our %Combin; # $codepoint => $number : combination class
our %Canon; # $codepoint => \@codepoints : canonical decomp.
our %Compat; # $codepoint => \@codepoints : compat. decomp.
-# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
+our %Compos; # $1st,$2nd => $codepoint : composite
our %Exclus; # $codepoint => 1 : composition exclusions
our %Single; # $codepoint => 1 : singletons
our %NonStD; # $codepoint => 1 : non-starter decompositions
-
-our %Comp1st; # $codepoint => $listname : may be composed with a next char.
our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
-our %CompList; # $listname,$2nd => $codepoint : composite
-our $prefix = "UNF_";
-our $structname = "${prefix}complist";
+# from Unicode database
+our $Combin = do "unicore/CombiningClass.pl"
+ || do "unicode/CombiningClass.pl"
+ || croak "$PACKAGE: CombiningClass.pl not found";
+our $Decomp = do "unicore/Decomposition.pl"
+ || do "unicode/Decomposition.pl"
+ || croak "$PACKAGE: Decomposition.pl not found";
-########## definition of Hangul constants ##########
+# definition of Hangul constants
use constant SBase => 0xAC00;
use constant SFinal => 0xD7A3; # SBase -1 + SCount
use constant SCount => 11172; # LCount * NCount
@@ -71,48 +78,26 @@ use constant TFinal => 0x11C2;
use constant TCount => 28;
sub decomposeHangul {
- my $SIndex = $_[0] - SBase;
- my $LIndex = int( $SIndex / NCount);
- my $VIndex = int(($SIndex % NCount) / TCount);
- my $TIndex = $SIndex % TCount;
+ my $sindex = $_[0] - SBase;
+ my $lindex = int( $sindex / NCount);
+ my $vindex = int(($sindex % NCount) / TCount);
+ my $tindex = $sindex % TCount;
my @ret = (
- LBase + $LIndex,
- VBase + $VIndex,
- $TIndex ? (TBase + $TIndex) : (),
+ LBase + $lindex,
+ VBase + $vindex,
+ $tindex ? (TBase + $tindex) : (),
);
- return @ret;
-}
-
-########## length of a character ##########
-
-sub utf8len {
- my $uv = shift;
- return $uv < 0x80 ? 1 :
- $uv < 0x800 ? 2 :
- $uv < 0x10000 ? 3 :
- $uv < 0x110000 ? 4 :
- croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff.";
-}
-
-sub utfelen {
- my $uv = shift;
- return $uv < 0xA0 ? 1 :
- $uv < 0x400 ? 2 :
- $uv < 0x4000 ? 3 :
- $uv < 0x40000 ? 4 :
- $uv < 0x110000 ? 5 :
- croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff.";
+ return wantarray ? @ret : pack_U(@ret);
}
-my $errExpand = "$PACKAGE: Composition to U+%04X (from U+%04X and U+%04X) " .
- "needs growing the string in %s! Quit. Please inform the author...";
-
########## getting full decomposion ##########
{
my($f, $fh);
foreach my $d (@INC) {
$f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
last if open($fh, $f);
+ $f = File::Spec->catfile($d, "unicore", "CompExcl.txt");
+ last if open($fh, $f);
$f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
last if open($fh, $f);
$f = undef;
@@ -128,10 +113,9 @@ my $errExpand = "$PACKAGE: Composition to U+%04X (from U+%04X and U+%04X) " .
close $fh;
}
-##
## converts string "hhhh hhhh hhhh" to a numeric list
-##
-sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
+## (hex digits separated by spaces)
+sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }
while ($Combin =~ /(.+)/g) {
my @tab = split /\t/, $1;
@@ -147,14 +131,10 @@ while ($Decomp =~ /(.+)/g) {
my @tab = split /\t/, $1;
my $compat = $tab[2] =~ s/<[^>]+>//;
my $dec = [ _getHexArray($tab[2]) ]; # decomposition
- my $ini = hex($tab[0]);
+ my $ini = hex($tab[0]); # initial decomposable character
my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
# ($ini .. $end) is the range of decomposable characters.
- my $listname =
- @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS';
- # %04x is bad since it'd place _3046 after _1d157.
-
foreach my $u ($ini .. $end) {
$Compat{$u} = $dec;
@@ -162,20 +142,10 @@ while ($Decomp =~ /(.+)/g) {
$Canon{$u} = $dec;
if (@$dec == 2) {
- if (utf8len($dec->[0]) + utf8len($dec->[1]) < utf8len($u)) {
- croak sprintf $errExpand, $u, $dec->[0], $dec->[1],
- "utf-8";
- }
- if (utfelen($dec->[0]) + utfelen($dec->[1]) < utfelen($u)) {
- croak sprintf $errExpand, $u, $dec->[0], $dec->[1],
- "utf-ebcdic";
- }
-
if ($Combin{ $dec->[0] }) {
$NonStD{$u} = 1;
} else {
- $CompList{ $listname }{ $dec->[1] } = $u;
- $Comp1st{ $dec->[0] } = $listname;
+ $Compos{ $dec->[0] }{ $dec->[1] } = $u;
$Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
}
} elsif (@$dec == 1) {
@@ -222,8 +192,47 @@ foreach my $key (keys %Compat) {
$Compat{$key} = [ getCompatList($key) ];
}
-sub _pack_U {
- return pack('U*', @_);
+##### The above part is common to mkheader and PP #####
+
+sub utf8len {
+ my $uv = shift;
+ return $uv < 0x80 ? 1 :
+ $uv < 0x800 ? 2 :
+ $uv < 0x10000 ? 3 :
+ $uv < 0x110000 ? 4 :
+ croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff.";
+}
+
+sub utfelen {
+ my $uv = shift;
+ return $uv < 0xA0 ? 1 :
+ $uv < 0x400 ? 2 :
+ $uv < 0x4000 ? 3 :
+ $uv < 0x40000 ? 4 :
+ $uv < 0x110000 ? 5 :
+ croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff.";
+}
+
+my $errExpand = "$PACKAGE: Composition to U+%04X (from U+%04X and U+%04X) " .
+ "needs growing the string in %s! Quit. Please inform the author...";
+
+foreach my $comp1st (keys %Compos) {
+ my $listname = sprintf("${structname}_%06x", $comp1st);
+ # %04x is bad since it'd place _3046 after _1d157.
+ $Comp1st{$comp1st} = $listname;
+ my $rh1st = $Compos{$comp1st};
+
+ foreach my $comp2nd (keys %$rh1st) {
+ my $uc = $rh1st->{$comp2nd};
+ $CompList{$listname}{$comp2nd} = $uc;
+
+ if (utf8len($comp1st) + utf8len($comp2nd) < utf8len($uc)) {
+ croak sprintf $errExpand, $uc, $comp1st, $comp2nd, "utf-8";
+ }
+ if (utfelen($comp1st) + utfelen($comp2nd) < utfelen($uc)) {
+ croak sprintf $errExpand, $uc, $comp1st, $comp2nd, "utf-ebcdic";
+ }
+ }
}
sub split_into_char {
@@ -239,7 +248,7 @@ sub split_into_char {
sub _U_stringify {
sprintf '"%s"', join '',
- map sprintf("\\x%02x", $_), split_into_char(_pack_U(@_));
+ map sprintf("\\x%02x", $_), split_into_char(pack_U(@_));
}
foreach my $hash (\%Canon, \%Compat) {
diff --git a/cpan/Unicode-Normalize/t/partial1.t b/cpan/Unicode-Normalize/t/partial1.t
new file mode 100644
index 0000000000..d8158511a9
--- /dev/null
+++ b/cpan/Unicode-Normalize/t/partial1.t
@@ -0,0 +1,104 @@
+
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Normalize " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+BEGIN {
+ unless (5.006001 <= $]) {
+ print "1..0 # skipped: Perl 5.6.1 or later".
+ " needed for this test\n";
+ exit;
+ }
+}
+
+#########################
+
+use Test;
+use strict;
+use warnings;
+BEGIN { plan tests => 26 };
+use Unicode::Normalize qw(:all);
+ok(1); # If we made it this far, we're ok.
+
+sub _pack_U { Unicode::Normalize::pack_U(@_) }
+sub _unpack_U { Unicode::Normalize::unpack_U(@_) }
+
+#########################
+
+sub arraynorm {
+ my $form = shift;
+ my @string = @_;
+ my $result = "";
+ my $unproc = "";
+ foreach my $str (@string) {
+ $unproc .= $str;
+ $result .= $form eq 'NFC' ? NFC_partial ($unproc) :
+ $form eq 'NFD' ? NFD_partial ($unproc) :
+ $form eq 'NFKC' ? NFKC_partial($unproc) :
+ $form eq 'NFKD' ? NFKD_partial($unproc) :
+ undef;
+ }
+ $result .= $unproc;
+ return $result;
+}
+
+my $strD = "\x{3C9}\x{301}\x{1100}\x{1161}\x{11A8}\x{1100}\x{1161}\x{11AA}";
+my $strC = "\x{3CE}\x{AC01}\x{AC03}";
+my @str1 = (substr($strD,0,3), substr($strD,3,4), substr($strD,7));
+my @str2 = (substr($strD,0,1), substr($strD,1,3), substr($strD,4));
+ok($strC eq NFC($strD));
+ok($strD eq join('', @str1));
+ok($strC eq arraynorm('NFC', @str1));
+ok($strD eq join('', @str2));
+ok($strC eq arraynorm('NFC', @str2));
+
+my @strX = ("\x{300}\x{AC00}", "\x{11A8}");
+my $strX = "\x{300}\x{AC01}";
+ok($strX eq NFC(join('', @strX)));
+ok($strX eq arraynorm('NFC', @strX));
+ok($strX eq NFKC(join('', @strX)));
+ok($strX eq arraynorm('NFKC', @strX));
+
+my @strY = ("\x{304B}\x{0308}", "\x{0323}\x{3099}");
+my $strY = ("\x{304C}\x{0323}\x{0308}");
+ok($strY eq NFC(join('', @strY)));
+ok($strY eq arraynorm('NFC', @strY));
+ok($strY eq NFKC(join('', @strY)));
+ok($strY eq arraynorm('NFKC', @strY));
+
+my @strZ = ("\x{304B}\x{0308}", "\x{0323}", "\x{3099}");
+my $strZ = ("\x{304B}\x{3099}\x{0323}\x{0308}");
+ok($strZ eq NFD(join('', @strZ)));
+ok($strZ eq arraynorm('NFD', @strZ));
+ok($strZ eq NFKD(join('', @strZ)));
+ok($strZ eq arraynorm('NFKD', @strZ));
+
+####
+
+# must modify the source
+my $sNFD = "\x{FA19}";
+ok(NFD_partial($sNFD), "");
+ok($sNFD, "\x{795E}");
+
+my $sNFC = "\x{FA1B}";
+ok(NFC_partial($sNFC), "");
+ok($sNFC, "\x{798F}");
+
+my $sNFKD = "\x{FA1E}";
+ok(NFKD_partial($sNFKD), "");
+ok($sNFKD, "\x{7FBD}");
+
+my $sNFKC = "\x{FA26}";
+ok(NFKC_partial($sNFKC), "");
+ok($sNFKC, "\x{90FD}");
diff --git a/cpan/Unicode-Normalize/t/partial2.t b/cpan/Unicode-Normalize/t/partial2.t
new file mode 100644
index 0000000000..4dcdb79289
--- /dev/null
+++ b/cpan/Unicode-Normalize/t/partial2.t
@@ -0,0 +1,100 @@
+
+BEGIN {
+ unless ("A" eq pack('U', 0x41)) {
+ print "1..0 # Unicode::Normalize " .
+ "cannot stringify a Unicode code point\n";
+ exit 0;
+ }
+}
+
+BEGIN {
+ if ($ENV{PERL_CORE}) {
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ }
+}
+
+BEGIN {
+ unless (5.006001 <= $]) {
+ print "1..0 # skipped: Perl 5.6.1 or later".
+ " needed for this test\n";
+ exit;
+ }
+}
+
+#########################
+
+use Test;
+use strict;
+use warnings;
+BEGIN { plan tests => 26 };
+use Unicode::Normalize qw(:all);
+ok(1); # If we made it this far, we're ok.
+
+sub _pack_U { Unicode::Normalize::pack_U(@_) }
+sub _unpack_U { Unicode::Normalize::unpack_U(@_) }
+
+#########################
+
+sub arraynorm {
+ my $form = shift;
+ my @string = @_;
+ my $result = "";
+ my $unproc = "";
+ foreach my $str (@string) {
+ $unproc .= $str;
+ $result .= normalize_partial($form, $unproc);
+ }
+ $result .= $unproc;
+ return $result;
+}
+
+my $strD = "\x{3C9}\x{301}\x{1100}\x{1161}\x{11A8}\x{1100}\x{1161}\x{11AA}";
+my $strC = "\x{3CE}\x{AC01}\x{AC03}";
+my @str1 = (substr($strD,0,3), substr($strD,3,4), substr($strD,7));
+my @str2 = (substr($strD,0,1), substr($strD,1,3), substr($strD,4));
+ok($strC eq NFC($strD));
+ok($strD eq join('', @str1));
+ok($strC eq arraynorm('NFC', @str1));
+ok($strD eq join('', @str2));
+ok($strC eq arraynorm('NFC', @str2));
+
+my @strX = ("\x{300}\x{AC00}", "\x{11A8}");
+my $strX = "\x{300}\x{AC01}";
+ok($strX eq NFC(join('', @strX)));
+ok($strX eq arraynorm('NFC', @strX));
+ok($strX eq NFKC(join('', @strX)));
+ok($strX eq arraynorm('NFKC', @strX));
+
+my @strY = ("\x{304B}\x{0308}", "\x{0323}\x{3099}");
+my $strY = ("\x{304C}\x{0323}\x{0308}");
+ok($strY eq NFC(join('', @strY)));
+ok($strY eq arraynorm('NFC', @strY));
+ok($strY eq NFKC(join('', @strY)));
+ok($strY eq arraynorm('NFKC', @strY));
+
+my @strZ = ("\x{304B}\x{0308}", "\x{0323}", "\x{3099}");
+my $strZ = ("\x{304B}\x{3099}\x{0323}\x{0308}");
+ok($strZ eq NFD(join('', @strZ)));
+ok($strZ eq arraynorm('NFD', @strZ));
+ok($strZ eq NFKD(join('', @strZ)));
+ok($strZ eq arraynorm('NFKD', @strZ));
+
+####
+
+# must modify the source
+my $sNFD = "\x{FA19}";
+ok(normalize_partial('NFD', $sNFD), "");
+ok($sNFD, "\x{795E}");
+
+my $sNFC = "\x{FA1B}";
+ok(normalize_partial('NFC', $sNFC), "");
+ok($sNFC, "\x{798F}");
+
+my $sNFKD = "\x{FA1E}";
+ok(normalize_partial('NFKD', $sNFKD), "");
+ok($sNFKD, "\x{7FBD}");
+
+my $sNFKC = "\x{FA26}";
+ok(normalize_partial('NFKC', $sNFKC), "");
+ok($sNFKC, "\x{90FD}");
diff --git a/cpan/Unicode-Normalize/t/split.t b/cpan/Unicode-Normalize/t/split.t
index 6bf7ff66b1..a8275bff88 100644
--- a/cpan/Unicode-Normalize/t/split.t
+++ b/cpan/Unicode-Normalize/t/split.t
@@ -27,7 +27,7 @@ BEGIN {
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 14 };
+BEGIN { plan tests => 34 };
use Unicode::Normalize qw(:all);
ok(1); # If we made it this far, we're ok.
@@ -40,14 +40,6 @@ our $proc; # before the last starter
our $unproc; # the last starter and after
# If string has no starter, entire string is set to $unproc.
-# When you have $normalized string and $unnormalized string following,
-# a simple concatenation
-# C<$concat = $normalized . normalize($form, $unnormalized)>
-# is wrong. Instead of it, like this:
-#
-# ($processed, $unprocessed) = splitOnLastStarter($normalized);
-# $concat = $processed . normalize($form, $unprocessed.$unnormalized);
-
($proc, $unproc) = splitOnLastStarter("");
ok($proc, "");
ok($unproc, "");
@@ -79,3 +71,61 @@ ok(NFC($ka_grave.$dakuten) eq $ga_grave);
ok(NFC($ka_grave).NFC($dakuten) ne $ga_grave);
ok($concat eq $ga_grave);
+##############
+
+sub arraynorm {
+ my $form = shift;
+ my @string = @_;
+ my $result = "";
+ my $unproc = "";
+ foreach my $str (@string) {
+ $unproc .= $str;
+ my $n = normalize($form, $unproc);
+ my($p, $u) = splitOnLastStarter($n);
+ $result .= $p;
+ $unproc = $u;
+ }
+ $result .= $unproc;
+ return $result;
+}
+
+my $strD = "\x{3C9}\x{301}\x{1100}\x{1161}\x{11A8}\x{1100}\x{1161}\x{11AA}";
+my $strC = "\x{3CE}\x{AC01}\x{AC03}";
+my @str1 = (substr($strD,0,3), substr($strD,3,4), substr($strD,7));
+my @str2 = (substr($strD,0,1), substr($strD,1,3), substr($strD,4));
+ok($strC eq NFC($strD));
+ok($strD eq join('', @str1));
+ok($strC eq arraynorm('NFC', @str1));
+ok($strD eq join('', @str2));
+ok($strC eq arraynorm('NFC', @str2));
+
+my @strX = ("\x{300}\x{AC00}", "\x{11A8}");
+my $strX = "\x{300}\x{AC01}";
+ok($strX eq NFC(join('', @strX)));
+ok($strX eq arraynorm('NFC', @strX));
+ok($strX eq NFKC(join('', @strX)));
+ok($strX eq arraynorm('NFKC', @strX));
+
+my @strY = ("\x{304B}\x{0308}", "\x{0323}\x{3099}");
+my $strY = ("\x{304C}\x{0323}\x{0308}");
+ok($strY eq NFC(join('', @strY)));
+ok($strY eq arraynorm('NFC', @strY));
+ok($strY eq NFKC(join('', @strY)));
+ok($strY eq arraynorm('NFKC', @strY));
+
+my @strZ = ("\x{304B}\x{0308}", "\x{0323}", "\x{3099}");
+my $strZ = ("\x{304B}\x{3099}\x{0323}\x{0308}");
+ok($strZ eq NFD(join('', @strZ)));
+ok($strZ eq arraynorm('NFD', @strZ));
+ok($strZ eq NFKD(join('', @strZ)));
+ok($strZ eq arraynorm('NFKD', @strZ));
+
+##############
+
+# don't modify the source
+
+my $source = "ABC";
+($proc, $unproc) = splitOnLastStarter($source);
+ok($proc, "AB");
+ok($unproc, "C");
+ok($source, "ABC");
diff --git a/cpan/Unicode-Normalize/t/test.t b/cpan/Unicode-Normalize/t/test.t
index e48e16f1b9..2dea2a1402 100644
--- a/cpan/Unicode-Normalize/t/test.t
+++ b/cpan/Unicode-Normalize/t/test.t
@@ -19,7 +19,7 @@ BEGIN {
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 58 };
+BEGIN { plan tests => 70 };
use Unicode::Normalize;
ok(1); # If we made it this far, we're ok.
@@ -123,3 +123,19 @@ my $str21 = _pack_U(0xE0, 0xAC00);
my $str22 = _pack_U(0x61, 0x0300, 0x1100, 0x1161);
ok(NFD $str21 eq $str22);
+## Bug #53197: NFKC("\x{2000}") produces...
+
+ok(NFKC("\x{2002}") eq ' ');
+ok(NFKD("\x{2002}") eq ' ');
+ok(NFKC("\x{2000}") eq ' ');
+ok(NFKD("\x{2000}") eq ' ');
+
+ok(NFKC("\x{210C}") eq 'H');
+ok(NFKD("\x{210C}") eq 'H');
+ok(NFKC("\x{210D}") eq 'H');
+ok(NFKD("\x{210D}") eq 'H');
+
+ok(NFC("\x{F907}") eq "\x{9F9C}");
+ok(NFD("\x{F907}") eq "\x{9F9C}");
+ok(NFKC("\x{F907}") eq "\x{9F9C}");
+ok(NFKD("\x{F907}") eq "\x{9F9C}");