diff options
author | Chris 'BinGOs' Williams <chris@bingosnet.co.uk> | 2010-06-27 19:51:20 +0100 |
---|---|---|
committer | Chris 'BinGOs' Williams <chris@bingosnet.co.uk> | 2010-06-27 19:51:20 +0100 |
commit | a96160d631b54897af2f2540335c2ca854dcea68 (patch) | |
tree | 2cf0106edd8be54b706dbc2ba0df8a0da0cd7129 | |
parent | 6ad8f254c95c6d4523948ded91d651dcc490dee5 (diff) | |
download | perl-a96160d631b54897af2f2540335c2ca854dcea68.tar.gz |
Update Unicode-Normalize to CPAN release 1.06
[DELTA]
1.06 Thu Feb 11 16:19:54 2010
- Pure Perl/mkheader: fixed the internal _getHexArray() for perl 5.11.3
changes (Bug #53197).
1.05 Mon Sep 28 21:43:17 2009
- normalize_partial() and NFX_partial(). { NFX =~ /^NFK?[CD]\z/ }
- added partial1.t for NFX_partial().
- added partial2.t for normalize_partial().
1.04 Wed Sep 23 22:32:57 2009
- doc: splitOnLastStarter() since 0.24 is now documented.
- test: some new tests are added to split.t.
-rw-r--r-- | MANIFEST | 2 | ||||
-rwxr-xr-x | Porting/Maintainers.pl | 2 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/Changes | 16 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/Normalize.pm | 135 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/README | 4 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/mkheader | 143 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/partial1.t | 104 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/partial2.t | 100 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/split.t | 68 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/test.t | 18 |
10 files changed, 490 insertions, 102 deletions
@@ -2512,6 +2512,8 @@ cpan/Unicode-Normalize/t/form.t Unicode::Normalize cpan/Unicode-Normalize/t/func.t Unicode::Normalize cpan/Unicode-Normalize/t/illegal.t Unicode::Normalize cpan/Unicode-Normalize/t/norm.t Unicode::Normalize +cpan/Unicode-Normalize/t/partial1.t Unicode::Normalize +cpan/Unicode-Normalize/t/partial2.t Unicode::Normalize cpan/Unicode-Normalize/t/null.t Unicode::Normalize cpan/Unicode-Normalize/t/proto.t Unicode::Normalize cpan/Unicode-Normalize/t/split.t Unicode::Normalize diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl index e8d674d35e..18e6995319 100755 --- a/Porting/Maintainers.pl +++ b/Porting/Maintainers.pl @@ -1501,7 +1501,7 @@ use File::Glob qw(:case); 'Unicode::Normalize' => { 'MAINTAINER' => 'sadahiro', - 'DISTRIBUTION' => 'SADAHIRO/Unicode-Normalize-1.03.tar.gz', + 'DISTRIBUTION' => 'SADAHIRO/Unicode-Normalize-1.06-withoutworldwriteables.tar.gz', 'FILES' => q[cpan/Unicode-Normalize], 'EXCLUDED' => [ qw{MANIFEST.N Normalize.pmN disableXS enableXS }], 'UPSTREAM' => 'first-come', diff --git a/cpan/Unicode-Normalize/Changes b/cpan/Unicode-Normalize/Changes index e9cb3918a5..433edb4db7 100644 --- a/cpan/Unicode-Normalize/Changes +++ b/cpan/Unicode-Normalize/Changes @@ -1,5 +1,18 @@ Revision history for Perl extension Unicode::Normalize. +1.06 Thu Feb 11 16:19:54 2010 + - Pure Perl/mkheader: fixed the internal _getHexArray() for perl 5.11.3 + changes (Bug #53197). + +1.05 Mon Sep 28 21:43:17 2009 + - normalize_partial() and NFX_partial(). { NFX =~ /^NFK?[CD]\z/ } + - added partial1.t for NFX_partial(). + - added partial2.t for normalize_partial(). + +1.04 Wed Sep 23 22:32:57 2009 + - doc: splitOnLastStarter() since 0.24 is now documented. + - test: some new tests are added to split.t. + 1.03 Sun Mar 29 12:56:23 2009 - mkheader: check if no composition needs growing the string. - Makefile.PL: a tweak @@ -69,7 +82,8 @@ Revision history for Perl extension Unicode::Normalize. script files, named "enableXS" and "disableXS". (no longer <perl Makefile.PL xs> and <perl Makefile.PL noxs>.) * simplified Makefile.PL. - - added fcdc.t and split.t. + - added fcdc.t for FCD() and FCC(). + - added split.t for splitOnLastStarter(): an undocumented function. 0.23 Sat Jun 28 20:38:10 2003 - bug fix: \0-terminate in compose() in XS. diff --git a/cpan/Unicode-Normalize/Normalize.pm b/cpan/Unicode-Normalize/Normalize.pm index ad5ff82a83..00be139070 100644 --- a/cpan/Unicode-Normalize/Normalize.pm +++ b/cpan/Unicode-Normalize/Normalize.pm @@ -13,13 +13,9 @@ use Carp; no warnings 'utf8'; -our $VERSION = '1.03'; +our $VERSION = '1.06'; our $PACKAGE = __PACKAGE__; -require Exporter; -require DynaLoader; - -our @ISA = qw(Exporter DynaLoader); our @EXPORT = qw( NFC NFD NFKC NFKD ); our @EXPORT_OK = qw( normalize decompose reorder compose @@ -27,8 +23,8 @@ our @EXPORT_OK = qw( getCanon getCompat getComposite getCombinClass isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE - FCD checkFCD FCC checkFCC composeContiguous - splitOnLastStarter + FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter + normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial ); our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ], @@ -37,12 +33,6 @@ our %EXPORT_TAGS = ( fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ], ); -###### - -bootstrap Unicode::Normalize $VERSION; - -###### - ## ## utilites for tests ## @@ -55,9 +45,18 @@ sub unpack_U { return unpack('U*', shift(@_).pack('U*')); } +require Exporter; + +##### The above part is common to XS and PP ##### + +our @ISA = qw(Exporter DynaLoader); +require DynaLoader; +bootstrap Unicode::Normalize $VERSION; + +##### The below part is common to XS and PP ##### ## -## normalization forms +## normalize ## sub FCD ($) { @@ -83,9 +82,27 @@ sub normalize($$) croak($PACKAGE."::normalize: invalid form name: $form"); } +## +## partial +## + +sub normalize_partial ($$) { + if (exists $formNorm{$_[0]}) { + my $n = normalize($_[0], $_[1]); + my($p, $u) = splitOnLastStarter($n); + $_[1] = $u; + return $p; + } + croak($PACKAGE."::normalize_partial: invalid form name: $_[0]"); +} + +sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) } +sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) } +sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) } +sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) } ## -## quick check +## check ## our %formCheck = ( @@ -239,6 +256,82 @@ you can get its NFC/NFKC string, by saying $NFC_string = compose($NFD_string); $NFKC_string = compose($NFKD_string); +=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)> + +It returns two strings: the first one, C<$processed>, is a part +before the last starter, and the second one, C<$unprocessed> is +another part after the first part. A starter is a character having +a combining class of zero (see UAX #15). + +Note that C<$processed> may be empty (when C<$normalized> contains no +starter or starts with the last starter), and then C<$unprocessed> +should be equal to the entire C<$normalized>. + +When you have a C<$normalized> string and an C<$unnormalized> string +following it, a simple concatenation is wrong: + + $concat = $normalized . normalize($form, $unnormalized); # wrong! + +Instead of it, do like this: + + ($processed, $unprocessed) = splitOnLastStarter($normalized); + $concat = $processed . normalize($form, $unprocessed.$unnormalized); + +C<splitOnLastStarter()> should be called with a pre-normalized parameter +C<$normalized>, that is in the same form as C<$form> you want. + +If you have an array of C<@string> that should be concatenated and then +normalized, you can do like this: + + my $result = ""; + my $unproc = ""; + foreach my $str (@string) { + $unproc .= $str; + my $n = normalize($form, $unproc); + my($p, $u) = splitOnLastStarter($n); + $result .= $p; + $unproc = $u; + } + $result .= $unproc; + # instead of normalize($form, join('', @string)) + +=item C<$processed = normalize_partial($form, $unprocessed)> + +A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>. +Note that C<$unprocessed> will be modified as a side-effect. + +If you have an array of C<@string> that should be concatenated and then +normalized, you can do like this: + + my $result = ""; + my $unproc = ""; + foreach my $str (@string) { + $unproc .= $str; + $result .= normalize_partial($form, $unproc); + } + $result .= $unproc; + # instead of normalize($form, join('', @string)) + +=item C<$processed = NFD_partial($unprocessed)> + +It does like C<normalize_partial('NFD', $unprocessed)>. +Note that C<$unprocessed> will be modified as a side-effect. + +=item C<$processed = NFC_partial($unprocessed)> + +It does like C<normalize_partial('NFC', $unprocessed)>. +Note that C<$unprocessed> will be modified as a side-effect. + +=item C<$processed = NFKD_partial($unprocessed)> + +It does like C<normalize_partial('NFKD', $unprocessed)>. +Note that C<$unprocessed> will be modified as a side-effect. + +=item C<$processed = NFKC_partial($unprocessed)> + +It does like C<normalize_partial('NFKC', $unprocessed)>. +Note that C<$unprocessed> will be modified as a side-effect. + =back =head2 Quick Check @@ -321,15 +414,15 @@ while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. If you want to check exactly, compare the string with its NFC/NFKC/FCC. if ($string eq NFC($string)) { - # $string is exactly normalized in NFC; + # $string is exactly normalized in NFC; } else { - # $string is not normalized in NFC; + # $string is not normalized in NFC; } if ($string eq NFKC($string)) { - # $string is exactly normalized in NFKC; + # $string is exactly normalized in NFKC; } else { - # $string is not normalized in NFKC; + # $string is not normalized in NFKC; } =head2 Character Data @@ -454,7 +547,7 @@ normalization implemented by this module depends on your perl's version. 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0) 5.8.7-5.8.8 4.1.0 5.10.0 5.0.0 - 5.8.9 5.1.0 + 5.8.9, 5.10.1 5.1.0 =item Correction of decomposition mapping @@ -482,7 +575,7 @@ lower than 4.1.0. SADAHIRO Tomoyuki <SADAHIRO@cpan.org> -Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved. +Copyright(C) 2001-2010, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. diff --git a/cpan/Unicode-Normalize/README b/cpan/Unicode-Normalize/README index 8a5390cc78..06c9d0c83f 100644 --- a/cpan/Unicode-Normalize/README +++ b/cpan/Unicode-Normalize/README @@ -1,4 +1,4 @@ -Unicode/Normalize version 1.03 +Unicode/Normalize version 1.06 =================================== Unicode::Normalize - Unicode Normalization Forms @@ -83,7 +83,7 @@ COPYRIGHT AND LICENSE SADAHIRO Tomoyuki <SADAHIRO@cpan.org> -Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved. +Copyright(C) 2001-2010, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. diff --git a/cpan/Unicode-Normalize/mkheader b/cpan/Unicode-Normalize/mkheader index b3e3c3153c..b6d153c552 100644 --- a/cpan/Unicode-Normalize/mkheader +++ b/cpan/Unicode-Normalize/mkheader @@ -32,30 +32,37 @@ BEGIN { our $PACKAGE = 'Unicode::Normalize, mkheader'; -our $Combin = do "unicore/CombiningClass.pl" - || do "unicode/CombiningClass.pl" - || croak "$PACKAGE: CombiningClass.pl not found"; +our $prefix = "UNF_"; +our $structname = "${prefix}complist"; -our $Decomp = do "unicore/Decomposition.pl" - || do "unicode/Decomposition.pl" - || croak "$PACKAGE: Decomposition.pl not found"; +sub pack_U { + return pack('U*', @_); +} + +# %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify() +our %Comp1st; # $codepoint => $listname : may be composed with a next char. +our %CompList; # $listname,$2nd => $codepoint : composite + +##### The below part is common to mkheader and PP ##### our %Combin; # $codepoint => $number : combination class our %Canon; # $codepoint => \@codepoints : canonical decomp. our %Compat; # $codepoint => \@codepoints : compat. decomp. -# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat +our %Compos; # $1st,$2nd => $codepoint : composite our %Exclus; # $codepoint => 1 : composition exclusions our %Single; # $codepoint => 1 : singletons our %NonStD; # $codepoint => 1 : non-starter decompositions - -our %Comp1st; # $codepoint => $listname : may be composed with a next char. our %Comp2nd; # $codepoint => 1 : may be composed with a prev char. -our %CompList; # $listname,$2nd => $codepoint : composite -our $prefix = "UNF_"; -our $structname = "${prefix}complist"; +# from Unicode database +our $Combin = do "unicore/CombiningClass.pl" + || do "unicode/CombiningClass.pl" + || croak "$PACKAGE: CombiningClass.pl not found"; +our $Decomp = do "unicore/Decomposition.pl" + || do "unicode/Decomposition.pl" + || croak "$PACKAGE: Decomposition.pl not found"; -########## definition of Hangul constants ########## +# definition of Hangul constants use constant SBase => 0xAC00; use constant SFinal => 0xD7A3; # SBase -1 + SCount use constant SCount => 11172; # LCount * NCount @@ -71,48 +78,26 @@ use constant TFinal => 0x11C2; use constant TCount => 28; sub decomposeHangul { - my $SIndex = $_[0] - SBase; - my $LIndex = int( $SIndex / NCount); - my $VIndex = int(($SIndex % NCount) / TCount); - my $TIndex = $SIndex % TCount; + my $sindex = $_[0] - SBase; + my $lindex = int( $sindex / NCount); + my $vindex = int(($sindex % NCount) / TCount); + my $tindex = $sindex % TCount; my @ret = ( - LBase + $LIndex, - VBase + $VIndex, - $TIndex ? (TBase + $TIndex) : (), + LBase + $lindex, + VBase + $vindex, + $tindex ? (TBase + $tindex) : (), ); - return @ret; -} - -########## length of a character ########## - -sub utf8len { - my $uv = shift; - return $uv < 0x80 ? 1 : - $uv < 0x800 ? 2 : - $uv < 0x10000 ? 3 : - $uv < 0x110000 ? 4 : - croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff."; -} - -sub utfelen { - my $uv = shift; - return $uv < 0xA0 ? 1 : - $uv < 0x400 ? 2 : - $uv < 0x4000 ? 3 : - $uv < 0x40000 ? 4 : - $uv < 0x110000 ? 5 : - croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff."; + return wantarray ? @ret : pack_U(@ret); } -my $errExpand = "$PACKAGE: Composition to U+%04X (from U+%04X and U+%04X) " . - "needs growing the string in %s! Quit. Please inform the author..."; - ########## getting full decomposion ########## { my($f, $fh); foreach my $d (@INC) { $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt"); last if open($fh, $f); + $f = File::Spec->catfile($d, "unicore", "CompExcl.txt"); + last if open($fh, $f); $f = File::Spec->catfile($d, "unicode", "CompExcl.txt"); last if open($fh, $f); $f = undef; @@ -128,10 +113,9 @@ my $errExpand = "$PACKAGE: Composition to U+%04X (from U+%04X and U+%04X) " . close $fh; } -## ## converts string "hhhh hhhh hhhh" to a numeric list -## -sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g } +## (hex digits separated by spaces) +sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g } while ($Combin =~ /(.+)/g) { my @tab = split /\t/, $1; @@ -147,14 +131,10 @@ while ($Decomp =~ /(.+)/g) { my @tab = split /\t/, $1; my $compat = $tab[2] =~ s/<[^>]+>//; my $dec = [ _getHexArray($tab[2]) ]; # decomposition - my $ini = hex($tab[0]); + my $ini = hex($tab[0]); # initial decomposable character my $end = $tab[1] eq '' ? $ini : hex($tab[1]); # ($ini .. $end) is the range of decomposable characters. - my $listname = - @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS'; - # %04x is bad since it'd place _3046 after _1d157. - foreach my $u ($ini .. $end) { $Compat{$u} = $dec; @@ -162,20 +142,10 @@ while ($Decomp =~ /(.+)/g) { $Canon{$u} = $dec; if (@$dec == 2) { - if (utf8len($dec->[0]) + utf8len($dec->[1]) < utf8len($u)) { - croak sprintf $errExpand, $u, $dec->[0], $dec->[1], - "utf-8"; - } - if (utfelen($dec->[0]) + utfelen($dec->[1]) < utfelen($u)) { - croak sprintf $errExpand, $u, $dec->[0], $dec->[1], - "utf-ebcdic"; - } - if ($Combin{ $dec->[0] }) { $NonStD{$u} = 1; } else { - $CompList{ $listname }{ $dec->[1] } = $u; - $Comp1st{ $dec->[0] } = $listname; + $Compos{ $dec->[0] }{ $dec->[1] } = $u; $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; } } elsif (@$dec == 1) { @@ -222,8 +192,47 @@ foreach my $key (keys %Compat) { $Compat{$key} = [ getCompatList($key) ]; } -sub _pack_U { - return pack('U*', @_); +##### The above part is common to mkheader and PP ##### + +sub utf8len { + my $uv = shift; + return $uv < 0x80 ? 1 : + $uv < 0x800 ? 2 : + $uv < 0x10000 ? 3 : + $uv < 0x110000 ? 4 : + croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff."; +} + +sub utfelen { + my $uv = shift; + return $uv < 0xA0 ? 1 : + $uv < 0x400 ? 2 : + $uv < 0x4000 ? 3 : + $uv < 0x40000 ? 4 : + $uv < 0x110000 ? 5 : + croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff."; +} + +my $errExpand = "$PACKAGE: Composition to U+%04X (from U+%04X and U+%04X) " . + "needs growing the string in %s! Quit. Please inform the author..."; + +foreach my $comp1st (keys %Compos) { + my $listname = sprintf("${structname}_%06x", $comp1st); + # %04x is bad since it'd place _3046 after _1d157. + $Comp1st{$comp1st} = $listname; + my $rh1st = $Compos{$comp1st}; + + foreach my $comp2nd (keys %$rh1st) { + my $uc = $rh1st->{$comp2nd}; + $CompList{$listname}{$comp2nd} = $uc; + + if (utf8len($comp1st) + utf8len($comp2nd) < utf8len($uc)) { + croak sprintf $errExpand, $uc, $comp1st, $comp2nd, "utf-8"; + } + if (utfelen($comp1st) + utfelen($comp2nd) < utfelen($uc)) { + croak sprintf $errExpand, $uc, $comp1st, $comp2nd, "utf-ebcdic"; + } + } } sub split_into_char { @@ -239,7 +248,7 @@ sub split_into_char { sub _U_stringify { sprintf '"%s"', join '', - map sprintf("\\x%02x", $_), split_into_char(_pack_U(@_)); + map sprintf("\\x%02x", $_), split_into_char(pack_U(@_)); } foreach my $hash (\%Canon, \%Compat) { diff --git a/cpan/Unicode-Normalize/t/partial1.t b/cpan/Unicode-Normalize/t/partial1.t new file mode 100644 index 0000000000..d8158511a9 --- /dev/null +++ b/cpan/Unicode-Normalize/t/partial1.t @@ -0,0 +1,104 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +BEGIN { + unless (5.006001 <= $]) { + print "1..0 # skipped: Perl 5.6.1 or later". + " needed for this test\n"; + exit; + } +} + +######################### + +use Test; +use strict; +use warnings; +BEGIN { plan tests => 26 }; +use Unicode::Normalize qw(:all); +ok(1); # If we made it this far, we're ok. + +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub _unpack_U { Unicode::Normalize::unpack_U(@_) } + +######################### + +sub arraynorm { + my $form = shift; + my @string = @_; + my $result = ""; + my $unproc = ""; + foreach my $str (@string) { + $unproc .= $str; + $result .= $form eq 'NFC' ? NFC_partial ($unproc) : + $form eq 'NFD' ? NFD_partial ($unproc) : + $form eq 'NFKC' ? NFKC_partial($unproc) : + $form eq 'NFKD' ? NFKD_partial($unproc) : + undef; + } + $result .= $unproc; + return $result; +} + +my $strD = "\x{3C9}\x{301}\x{1100}\x{1161}\x{11A8}\x{1100}\x{1161}\x{11AA}"; +my $strC = "\x{3CE}\x{AC01}\x{AC03}"; +my @str1 = (substr($strD,0,3), substr($strD,3,4), substr($strD,7)); +my @str2 = (substr($strD,0,1), substr($strD,1,3), substr($strD,4)); +ok($strC eq NFC($strD)); +ok($strD eq join('', @str1)); +ok($strC eq arraynorm('NFC', @str1)); +ok($strD eq join('', @str2)); +ok($strC eq arraynorm('NFC', @str2)); + +my @strX = ("\x{300}\x{AC00}", "\x{11A8}"); +my $strX = "\x{300}\x{AC01}"; +ok($strX eq NFC(join('', @strX))); +ok($strX eq arraynorm('NFC', @strX)); +ok($strX eq NFKC(join('', @strX))); +ok($strX eq arraynorm('NFKC', @strX)); + +my @strY = ("\x{304B}\x{0308}", "\x{0323}\x{3099}"); +my $strY = ("\x{304C}\x{0323}\x{0308}"); +ok($strY eq NFC(join('', @strY))); +ok($strY eq arraynorm('NFC', @strY)); +ok($strY eq NFKC(join('', @strY))); +ok($strY eq arraynorm('NFKC', @strY)); + +my @strZ = ("\x{304B}\x{0308}", "\x{0323}", "\x{3099}"); +my $strZ = ("\x{304B}\x{3099}\x{0323}\x{0308}"); +ok($strZ eq NFD(join('', @strZ))); +ok($strZ eq arraynorm('NFD', @strZ)); +ok($strZ eq NFKD(join('', @strZ))); +ok($strZ eq arraynorm('NFKD', @strZ)); + +#### + +# must modify the source +my $sNFD = "\x{FA19}"; +ok(NFD_partial($sNFD), ""); +ok($sNFD, "\x{795E}"); + +my $sNFC = "\x{FA1B}"; +ok(NFC_partial($sNFC), ""); +ok($sNFC, "\x{798F}"); + +my $sNFKD = "\x{FA1E}"; +ok(NFKD_partial($sNFKD), ""); +ok($sNFKD, "\x{7FBD}"); + +my $sNFKC = "\x{FA26}"; +ok(NFKC_partial($sNFKC), ""); +ok($sNFKC, "\x{90FD}"); diff --git a/cpan/Unicode-Normalize/t/partial2.t b/cpan/Unicode-Normalize/t/partial2.t new file mode 100644 index 0000000000..4dcdb79289 --- /dev/null +++ b/cpan/Unicode-Normalize/t/partial2.t @@ -0,0 +1,100 @@ + +BEGIN { + unless ("A" eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize " . + "cannot stringify a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +BEGIN { + unless (5.006001 <= $]) { + print "1..0 # skipped: Perl 5.6.1 or later". + " needed for this test\n"; + exit; + } +} + +######################### + +use Test; +use strict; +use warnings; +BEGIN { plan tests => 26 }; +use Unicode::Normalize qw(:all); +ok(1); # If we made it this far, we're ok. + +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub _unpack_U { Unicode::Normalize::unpack_U(@_) } + +######################### + +sub arraynorm { + my $form = shift; + my @string = @_; + my $result = ""; + my $unproc = ""; + foreach my $str (@string) { + $unproc .= $str; + $result .= normalize_partial($form, $unproc); + } + $result .= $unproc; + return $result; +} + +my $strD = "\x{3C9}\x{301}\x{1100}\x{1161}\x{11A8}\x{1100}\x{1161}\x{11AA}"; +my $strC = "\x{3CE}\x{AC01}\x{AC03}"; +my @str1 = (substr($strD,0,3), substr($strD,3,4), substr($strD,7)); +my @str2 = (substr($strD,0,1), substr($strD,1,3), substr($strD,4)); +ok($strC eq NFC($strD)); +ok($strD eq join('', @str1)); +ok($strC eq arraynorm('NFC', @str1)); +ok($strD eq join('', @str2)); +ok($strC eq arraynorm('NFC', @str2)); + +my @strX = ("\x{300}\x{AC00}", "\x{11A8}"); +my $strX = "\x{300}\x{AC01}"; +ok($strX eq NFC(join('', @strX))); +ok($strX eq arraynorm('NFC', @strX)); +ok($strX eq NFKC(join('', @strX))); +ok($strX eq arraynorm('NFKC', @strX)); + +my @strY = ("\x{304B}\x{0308}", "\x{0323}\x{3099}"); +my $strY = ("\x{304C}\x{0323}\x{0308}"); +ok($strY eq NFC(join('', @strY))); +ok($strY eq arraynorm('NFC', @strY)); +ok($strY eq NFKC(join('', @strY))); +ok($strY eq arraynorm('NFKC', @strY)); + +my @strZ = ("\x{304B}\x{0308}", "\x{0323}", "\x{3099}"); +my $strZ = ("\x{304B}\x{3099}\x{0323}\x{0308}"); +ok($strZ eq NFD(join('', @strZ))); +ok($strZ eq arraynorm('NFD', @strZ)); +ok($strZ eq NFKD(join('', @strZ))); +ok($strZ eq arraynorm('NFKD', @strZ)); + +#### + +# must modify the source +my $sNFD = "\x{FA19}"; +ok(normalize_partial('NFD', $sNFD), ""); +ok($sNFD, "\x{795E}"); + +my $sNFC = "\x{FA1B}"; +ok(normalize_partial('NFC', $sNFC), ""); +ok($sNFC, "\x{798F}"); + +my $sNFKD = "\x{FA1E}"; +ok(normalize_partial('NFKD', $sNFKD), ""); +ok($sNFKD, "\x{7FBD}"); + +my $sNFKC = "\x{FA26}"; +ok(normalize_partial('NFKC', $sNFKC), ""); +ok($sNFKC, "\x{90FD}"); diff --git a/cpan/Unicode-Normalize/t/split.t b/cpan/Unicode-Normalize/t/split.t index 6bf7ff66b1..a8275bff88 100644 --- a/cpan/Unicode-Normalize/t/split.t +++ b/cpan/Unicode-Normalize/t/split.t @@ -27,7 +27,7 @@ BEGIN { use Test; use strict; use warnings; -BEGIN { plan tests => 14 }; +BEGIN { plan tests => 34 }; use Unicode::Normalize qw(:all); ok(1); # If we made it this far, we're ok. @@ -40,14 +40,6 @@ our $proc; # before the last starter our $unproc; # the last starter and after # If string has no starter, entire string is set to $unproc. -# When you have $normalized string and $unnormalized string following, -# a simple concatenation -# C<$concat = $normalized . normalize($form, $unnormalized)> -# is wrong. Instead of it, like this: -# -# ($processed, $unprocessed) = splitOnLastStarter($normalized); -# $concat = $processed . normalize($form, $unprocessed.$unnormalized); - ($proc, $unproc) = splitOnLastStarter(""); ok($proc, ""); ok($unproc, ""); @@ -79,3 +71,61 @@ ok(NFC($ka_grave.$dakuten) eq $ga_grave); ok(NFC($ka_grave).NFC($dakuten) ne $ga_grave); ok($concat eq $ga_grave); +############## + +sub arraynorm { + my $form = shift; + my @string = @_; + my $result = ""; + my $unproc = ""; + foreach my $str (@string) { + $unproc .= $str; + my $n = normalize($form, $unproc); + my($p, $u) = splitOnLastStarter($n); + $result .= $p; + $unproc = $u; + } + $result .= $unproc; + return $result; +} + +my $strD = "\x{3C9}\x{301}\x{1100}\x{1161}\x{11A8}\x{1100}\x{1161}\x{11AA}"; +my $strC = "\x{3CE}\x{AC01}\x{AC03}"; +my @str1 = (substr($strD,0,3), substr($strD,3,4), substr($strD,7)); +my @str2 = (substr($strD,0,1), substr($strD,1,3), substr($strD,4)); +ok($strC eq NFC($strD)); +ok($strD eq join('', @str1)); +ok($strC eq arraynorm('NFC', @str1)); +ok($strD eq join('', @str2)); +ok($strC eq arraynorm('NFC', @str2)); + +my @strX = ("\x{300}\x{AC00}", "\x{11A8}"); +my $strX = "\x{300}\x{AC01}"; +ok($strX eq NFC(join('', @strX))); +ok($strX eq arraynorm('NFC', @strX)); +ok($strX eq NFKC(join('', @strX))); +ok($strX eq arraynorm('NFKC', @strX)); + +my @strY = ("\x{304B}\x{0308}", "\x{0323}\x{3099}"); +my $strY = ("\x{304C}\x{0323}\x{0308}"); +ok($strY eq NFC(join('', @strY))); +ok($strY eq arraynorm('NFC', @strY)); +ok($strY eq NFKC(join('', @strY))); +ok($strY eq arraynorm('NFKC', @strY)); + +my @strZ = ("\x{304B}\x{0308}", "\x{0323}", "\x{3099}"); +my $strZ = ("\x{304B}\x{3099}\x{0323}\x{0308}"); +ok($strZ eq NFD(join('', @strZ))); +ok($strZ eq arraynorm('NFD', @strZ)); +ok($strZ eq NFKD(join('', @strZ))); +ok($strZ eq arraynorm('NFKD', @strZ)); + +############## + +# don't modify the source + +my $source = "ABC"; +($proc, $unproc) = splitOnLastStarter($source); +ok($proc, "AB"); +ok($unproc, "C"); +ok($source, "ABC"); diff --git a/cpan/Unicode-Normalize/t/test.t b/cpan/Unicode-Normalize/t/test.t index e48e16f1b9..2dea2a1402 100644 --- a/cpan/Unicode-Normalize/t/test.t +++ b/cpan/Unicode-Normalize/t/test.t @@ -19,7 +19,7 @@ BEGIN { use Test; use strict; use warnings; -BEGIN { plan tests => 58 }; +BEGIN { plan tests => 70 }; use Unicode::Normalize; ok(1); # If we made it this far, we're ok. @@ -123,3 +123,19 @@ my $str21 = _pack_U(0xE0, 0xAC00); my $str22 = _pack_U(0x61, 0x0300, 0x1100, 0x1161); ok(NFD $str21 eq $str22); +## Bug #53197: NFKC("\x{2000}") produces... + +ok(NFKC("\x{2002}") eq ' '); +ok(NFKD("\x{2002}") eq ' '); +ok(NFKC("\x{2000}") eq ' '); +ok(NFKD("\x{2000}") eq ' '); + +ok(NFKC("\x{210C}") eq 'H'); +ok(NFKD("\x{210C}") eq 'H'); +ok(NFKC("\x{210D}") eq 'H'); +ok(NFKD("\x{210D}") eq 'H'); + +ok(NFC("\x{F907}") eq "\x{9F9C}"); +ok(NFD("\x{F907}") eq "\x{9F9C}"); +ok(NFKC("\x{F907}") eq "\x{9F9C}"); +ok(NFKD("\x{F907}") eq "\x{9F9C}"); |