diff options
author | Karl Williamson <khw@cpan.org> | 2016-11-11 14:52:39 +0100 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-11-11 17:51:30 +0100 |
commit | 3baae3fab5dea1469c2d040f5380bc2009bdeecb (patch) | |
tree | f20f6c9da2a60da215c786c01db84d23516914f5 /cpan/Unicode-Normalize | |
parent | af25b33d388e2824ad52b31c8f5c7bc722f02dd6 (diff) | |
download | perl-3baae3fab5dea1469c2d040f5380bc2009bdeecb.tar.gz |
Move Unicode-Normalize to dist/
p5p has taken over the maintenance of this module, so it should be in
dist/
Diffstat (limited to 'cpan/Unicode-Normalize')
-rw-r--r-- | cpan/Unicode-Normalize/.gitignore | 1 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/Makefile.PL | 55 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/Normalize.pm | 635 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/Normalize.xs | 925 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/mkheader | 419 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/fcdc.t | 138 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/form.t | 84 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/func.t | 386 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/illegal.t | 85 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/norm.t | 145 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/null.t | 100 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/partial1.t | 120 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/partial2.t | 116 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/proto.t | 99 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/split.t | 147 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/test.t | 168 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/t/tie.t | 82 |
17 files changed, 0 insertions, 3705 deletions
diff --git a/cpan/Unicode-Normalize/.gitignore b/cpan/Unicode-Normalize/.gitignore deleted file mode 100644 index 424c745c12..0000000000 --- a/cpan/Unicode-Normalize/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.h diff --git a/cpan/Unicode-Normalize/Makefile.PL b/cpan/Unicode-Normalize/Makefile.PL deleted file mode 100644 index 44a4b8ded8..0000000000 --- a/cpan/Unicode-Normalize/Makefile.PL +++ /dev/null @@ -1,55 +0,0 @@ -require 5.006001; -use ExtUtils::MakeMaker; - -my $clean = {}; - -my $mm_ver = ExtUtils::MakeMaker->VERSION; - -if (-f "Normalize.xs") { - print STDERR "Making header files for XS...\n"; - - do 'mkheader' or die $@ || "mkheader: $!"; - - $clean = { FILES => 'unfcan.h unfcmb.h unfcmp.h unfcpt.h unfexc.h' }; -} - -WriteMakefile( - ($mm_ver < 6.58) - ? ('AUTHOR' => 'SADAHIRO Tomoyuki <SADAHIRO@cpan.org>, Karl Williamson <khw@cpan.org>') - : ('AUTHOR' => [ - 'SADAHIRO Tomoyuki <SADAHIRO@cpan.org>', - 'Karl Williamson <khw@cpan.org>', - ]), - 'ABSTRACT' => 'Unicode Normalization Forms', - 'INSTALLDIRS' => ($] >= 5.007002 && $] < 5.011) ? 'perl' : 'site', - # see perl5110delta, @INC reorganization - 'LICENSE' => 'perl', - 'NAME' => 'Unicode::Normalize', - 'VERSION_FROM' => 'Normalize.pm', # finds $VERSION - 'clean' => $clean, - 'depend' => { 'Normalize.o' => '$(H_FILES)' }, - 'PREREQ_PM' => { - Carp => 0, - constant => 0, - DynaLoader => 0, - Exporter => 0, - File::Spec => 0, - strict => 0, - warnings => 0, - SelectSaver => 0, - }, - ($mm_ver < 6.48 ? () : MIN_PERL_VERSION => 5.6.0), - ($mm_ver < 6.46 ? () : (META_MERGE => { - 'meta-spec' => { version => 2 }, - resources => { - repository => { - url => 'https://github.com/khwilliamson/Unicode-Normalize.git', - web => 'https://github.com/khwilliamson/Unicode-Normalize', - type => 'git', - }, - bugtracker => { - web => 'https://rt.cpan.org/Public/Dist/Display.html?Name=Unicode-Normalize', - }, - }, - })), -); diff --git a/cpan/Unicode-Normalize/Normalize.pm b/cpan/Unicode-Normalize/Normalize.pm deleted file mode 100644 index ff6c0f0dc4..0000000000 --- a/cpan/Unicode-Normalize/Normalize.pm +++ /dev/null @@ -1,635 +0,0 @@ -package Unicode::Normalize; - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - die "Unicode::Normalize cannot stringify a Unicode code point\n"; - } - unless (0x41 == unpack('U', 'A')) { - die "Unicode::Normalize cannot get Unicode code point\n"; - } -} - -use 5.006; -use strict; -use warnings; -use Carp; - -no warnings 'utf8'; - -our $VERSION = '1.25'; -our $PACKAGE = __PACKAGE__; - -our @EXPORT = qw( NFC NFD NFKC NFKD ); -our @EXPORT_OK = qw( - normalize decompose reorder compose - checkNFD checkNFKD checkNFC checkNFKC check - getCanon getCompat getComposite getCombinClass - isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex - isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE - FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter - normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial -); -our %EXPORT_TAGS = ( - all => [ @EXPORT, @EXPORT_OK ], - normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ], - check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ], - fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ], -); - -## -## utilities for tests -## - -sub pack_U { - return pack('U*', @_); -} - -sub unpack_U { - - # The empty pack returns an empty UTF-8 string, so the effect is to force - # the shifted parameter into being UTF-8. This allows this to work on - # Perl 5.6, where there is no utf8::upgrade(). - return unpack('U*', shift(@_).pack('U*')); -} - -require Exporter; - -##### The above part is common to XS and PP ##### - -our @ISA = qw(Exporter DynaLoader); -require DynaLoader; -bootstrap Unicode::Normalize $VERSION; - -##### The below part is common to XS and PP ##### - -## -## normalize -## - -sub FCD ($) { - my $str = shift; - return checkFCD($str) ? $str : NFD($str); -} - -our %formNorm = ( - NFC => \&NFC, C => \&NFC, - NFD => \&NFD, D => \&NFD, - NFKC => \&NFKC, KC => \&NFKC, - NFKD => \&NFKD, KD => \&NFKD, - FCD => \&FCD, FCC => \&FCC, -); - -sub normalize($$) -{ - my $form = shift; - my $str = shift; - if (exists $formNorm{$form}) { - return $formNorm{$form}->($str); - } - croak($PACKAGE."::normalize: invalid form name: $form"); -} - -## -## partial -## - -sub normalize_partial ($$) { - if (exists $formNorm{$_[0]}) { - my $n = normalize($_[0], $_[1]); - my($p, $u) = splitOnLastStarter($n); - $_[1] = $u; - return $p; - } - croak($PACKAGE."::normalize_partial: invalid form name: $_[0]"); -} - -sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) } -sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) } -sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) } -sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) } - -## -## check -## - -our %formCheck = ( - NFC => \&checkNFC, C => \&checkNFC, - NFD => \&checkNFD, D => \&checkNFD, - NFKC => \&checkNFKC, KC => \&checkNFKC, - NFKD => \&checkNFKD, KD => \&checkNFKD, - FCD => \&checkFCD, FCC => \&checkFCC, -); - -sub check($$) -{ - my $form = shift; - my $str = shift; - if (exists $formCheck{$form}) { - return $formCheck{$form}->($str); - } - croak($PACKAGE."::check: invalid form name: $form"); -} - -1; -__END__ - -=head1 NAME - -Unicode::Normalize - Unicode Normalization Forms - -=head1 SYNOPSIS - -(1) using function names exported by default: - - use Unicode::Normalize; - - $NFD_string = NFD($string); # Normalization Form D - $NFC_string = NFC($string); # Normalization Form C - $NFKD_string = NFKD($string); # Normalization Form KD - $NFKC_string = NFKC($string); # Normalization Form KC - -(2) using function names exported on request: - - use Unicode::Normalize 'normalize'; - - $NFD_string = normalize('D', $string); # Normalization Form D - $NFC_string = normalize('C', $string); # Normalization Form C - $NFKD_string = normalize('KD', $string); # Normalization Form KD - $NFKC_string = normalize('KC', $string); # Normalization Form KC - -=head1 DESCRIPTION - -Parameters: - -C<$string> is used as a string under character semantics (see F<perlunicode>). - -C<$code_point> should be an unsigned integer representing a Unicode code point. - -Note: Between XSUB and pure Perl, there is an incompatibility -about the interpretation of C<$code_point> as a decimal number. -XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not. -Do not use a floating point nor a negative sign in C<$code_point>. - -=head2 Normalization Forms - -=over 4 - -=item C<$NFD_string = NFD($string)> - -It returns the Normalization Form D (formed by canonical decomposition). - -=item C<$NFC_string = NFC($string)> - -It returns the Normalization Form C (formed by canonical decomposition -followed by canonical composition). - -=item C<$NFKD_string = NFKD($string)> - -It returns the Normalization Form KD (formed by compatibility decomposition). - -=item C<$NFKC_string = NFKC($string)> - -It returns the Normalization Form KC (formed by compatibility decomposition -followed by B<canonical> composition). - -=item C<$FCD_string = FCD($string)> - -If the given string is in FCD ("Fast C or D" form; cf. UTN #5), -it returns the string without modification; otherwise it returns an FCD string. - -Note: FCD is not always unique, then plural forms may be equivalent -each other. C<FCD()> will return one of these equivalent forms. - -=item C<$FCC_string = FCC($string)> - -It returns the FCC form ("Fast C Contiguous"; cf. UTN #5). - -Note: FCC is unique, as well as four normalization forms (NF*). - -=item C<$normalized_string = normalize($form_name, $string)> - -It returns the normalization form of C<$form_name>. - -As C<$form_name>, one of the following names must be given. - - 'C' or 'NFC' for Normalization Form C (UAX #15) - 'D' or 'NFD' for Normalization Form D (UAX #15) - 'KC' or 'NFKC' for Normalization Form KC (UAX #15) - 'KD' or 'NFKD' for Normalization Form KD (UAX #15) - - 'FCD' for "Fast C or D" Form (UTN #5) - 'FCC' for "Fast C Contiguous" (UTN #5) - -=back - -=head2 Decomposition and Composition - -=over 4 - -=item C<$decomposed_string = decompose($string [, $useCompatMapping])> - -It returns the concatenation of the decomposition of each character -in the string. - -If the second parameter (a boolean) is omitted or false, -the decomposition is canonical decomposition; -if the second parameter (a boolean) is true, -the decomposition is compatibility decomposition. - -The string returned is not always in NFD/NFKD. Reordering may be required. - - $NFD_string = reorder(decompose($string)); # eq. to NFD() - $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD() - -=item C<$reordered_string = reorder($string)> - -It returns the result of reordering the combining characters -according to Canonical Ordering Behavior. - -For example, when you have a list of NFD/NFKD strings, -you can get the concatenated NFD/NFKD string from them, by saying - - $concat_NFD = reorder(join '', @NFD_strings); - $concat_NFKD = reorder(join '', @NFKD_strings); - -=item C<$composed_string = compose($string)> - -It returns the result of canonical composition -without applying any decomposition. - -For example, when you have a NFD/NFKD string, -you can get its NFC/NFKC string, by saying - - $NFC_string = compose($NFD_string); - $NFKC_string = compose($NFKD_string); - -=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)> - -It returns two strings: the first one, C<$processed>, is a part -before the last starter, and the second one, C<$unprocessed> is -another part after the first part. A starter is a character having -a combining class of zero (see UAX #15). - -Note that C<$processed> may be empty (when C<$normalized> contains no -starter or starts with the last starter), and then C<$unprocessed> -should be equal to the entire C<$normalized>. - -When you have a C<$normalized> string and an C<$unnormalized> string -following it, a simple concatenation is wrong: - - $concat = $normalized . normalize($form, $unnormalized); # wrong! - -Instead of it, do like this: - - ($processed, $unprocessed) = splitOnLastStarter($normalized); - $concat = $processed . normalize($form, $unprocessed.$unnormalized); - -C<splitOnLastStarter()> should be called with a pre-normalized parameter -C<$normalized>, that is in the same form as C<$form> you want. - -If you have an array of C<@string> that should be concatenated and then -normalized, you can do like this: - - my $result = ""; - my $unproc = ""; - foreach my $str (@string) { - $unproc .= $str; - my $n = normalize($form, $unproc); - my($p, $u) = splitOnLastStarter($n); - $result .= $p; - $unproc = $u; - } - $result .= $unproc; - # instead of normalize($form, join('', @string)) - -=item C<$processed = normalize_partial($form, $unprocessed)> - -A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>. -Note that C<$unprocessed> will be modified as a side-effect. - -If you have an array of C<@string> that should be concatenated and then -normalized, you can do like this: - - my $result = ""; - my $unproc = ""; - foreach my $str (@string) { - $unproc .= $str; - $result .= normalize_partial($form, $unproc); - } - $result .= $unproc; - # instead of normalize($form, join('', @string)) - -=item C<$processed = NFD_partial($unprocessed)> - -It does like C<normalize_partial('NFD', $unprocessed)>. -Note that C<$unprocessed> will be modified as a side-effect. - -=item C<$processed = NFC_partial($unprocessed)> - -It does like C<normalize_partial('NFC', $unprocessed)>. -Note that C<$unprocessed> will be modified as a side-effect. - -=item C<$processed = NFKD_partial($unprocessed)> - -It does like C<normalize_partial('NFKD', $unprocessed)>. -Note that C<$unprocessed> will be modified as a side-effect. - -=item C<$processed = NFKC_partial($unprocessed)> - -It does like C<normalize_partial('NFKC', $unprocessed)>. -Note that C<$unprocessed> will be modified as a side-effect. - -=back - -=head2 Quick Check - -(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>) - -The following functions check whether the string is in that normalization form. - -The result returned will be one of the following: - - YES The string is in that normalization form. - NO The string is not in that normalization form. - MAYBE Dubious. Maybe yes, maybe no. - -=over 4 - -=item C<$result = checkNFD($string)> - -It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. - -=item C<$result = checkNFC($string)> - -It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; -C<undef> if C<MAYBE>. - -=item C<$result = checkNFKD($string)> - -It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. - -=item C<$result = checkNFKC($string)> - -It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; -C<undef> if C<MAYBE>. - -=item C<$result = checkFCD($string)> - -It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. - -=item C<$result = checkFCC($string)> - -It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; -C<undef> if C<MAYBE>. - -Note: If a string is not in FCD, it must not be in FCC. -So C<checkFCC($not_FCD_string)> should return C<NO>. - -=item C<$result = check($form_name, $string)> - -It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; -C<undef> if C<MAYBE>. - -As C<$form_name>, one of the following names must be given. - - 'C' or 'NFC' for Normalization Form C (UAX #15) - 'D' or 'NFD' for Normalization Form D (UAX #15) - 'KC' or 'NFKC' for Normalization Form KC (UAX #15) - 'KD' or 'NFKD' for Normalization Form KD (UAX #15) - - 'FCD' for "Fast C or D" Form (UTN #5) - 'FCC' for "Fast C Contiguous" (UTN #5) - -=back - -B<Note> - -In the cases of NFD, NFKD, and FCD, the answer must be -either C<YES> or C<NO>. The answer C<MAYBE> may be returned -in the cases of NFC, NFKC, and FCC. - -A C<MAYBE> string should contain at least one combining character -or the like. For example, C<COMBINING ACUTE ACCENT> has -the MAYBE_NFC/MAYBE_NFKC property. - -Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")> -and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>. -C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC -(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), -while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. - -If you want to check exactly, compare the string with its NFC/NFKC/FCC. - - if ($string eq NFC($string)) { - # $string is exactly normalized in NFC; - } else { - # $string is not normalized in NFC; - } - - if ($string eq NFKC($string)) { - # $string is exactly normalized in NFKC; - } else { - # $string is not normalized in NFKC; - } - -=head2 Character Data - -These functions are interface of character data used internally. -If you want only to get Unicode normalization forms, you don't need -call them yourself. - -=over 4 - -=item C<$canonical_decomposition = getCanon($code_point)> - -If the character is canonically decomposable (including Hangul Syllables), -it returns the (full) canonical decomposition as a string. -Otherwise it returns C<undef>. - -B<Note:> According to the Unicode standard, the canonical decomposition -of the character that is not canonically decomposable is same as -the character itself. - -=item C<$compatibility_decomposition = getCompat($code_point)> - -If the character is compatibility decomposable (including Hangul Syllables), -it returns the (full) compatibility decomposition as a string. -Otherwise it returns C<undef>. - -B<Note:> According to the Unicode standard, the compatibility decomposition -of the character that is not compatibility decomposable is same as -the character itself. - -=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)> - -If two characters here and next (as code points) are composable -(including Hangul Jamo/Syllables and Composition Exclusions), -it returns the code point of the composite. - -If they are not composable, it returns C<undef>. - -=item C<$combining_class = getCombinClass($code_point)> - -It returns the combining class (as an integer) of the character. - -=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)> - -It returns a boolean whether the character of the specified codepoint -may be composed with the previous one in a certain composition -(including Hangul Compositions, but excluding -Composition Exclusions and Non-Starter Decompositions). - -=item C<$is_exclusion = isExclusion($code_point)> - -It returns a boolean whether the code point is a composition exclusion. - -=item C<$is_singleton = isSingleton($code_point)> - -It returns a boolean whether the code point is a singleton - -=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)> - -It returns a boolean whether the code point has Non-Starter Decomposition. - -=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)> - -It returns a boolean of the derived property Comp_Ex -(Full_Composition_Exclusion). This property is generated from -Composition Exclusions + Singletons + Non-Starter Decompositions. - -=item C<$NFD_is_NO = isNFD_NO($code_point)> - -It returns a boolean of the derived property NFD_NO -(NFD_Quick_Check=No). - -=item C<$NFC_is_NO = isNFC_NO($code_point)> - -It returns a boolean of the derived property NFC_NO -(NFC_Quick_Check=No). - -=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)> - -It returns a boolean of the derived property NFC_MAYBE -(NFC_Quick_Check=Maybe). - -=item C<$NFKD_is_NO = isNFKD_NO($code_point)> - -It returns a boolean of the derived property NFKD_NO -(NFKD_Quick_Check=No). - -=item C<$NFKC_is_NO = isNFKC_NO($code_point)> - -It returns a boolean of the derived property NFKC_NO -(NFKC_Quick_Check=No). - -=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)> - -It returns a boolean of the derived property NFKC_MAYBE -(NFKC_Quick_Check=Maybe). - -=back - -=head1 EXPORT - -C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. - -C<normalize> and other some functions: on request. - -=head1 CAVEATS - -=over 4 - -=item Perl's version vs. Unicode version - -Since this module refers to perl core's Unicode database in the directory -F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of -normalization implemented by this module depends on what has been -compiled into your perl. The following table lists the default Unicode -version that comes with various perl versions. (It is possible to change -the Unicode version in any perl version to be any earlier Unicode version, -so one could cause Unicode 3.2 to be used in any perl version starting with -5.8.0. See C<$Config{privlib}>/F<unicore/README.perl>. - - perl's version implemented Unicode version - 5.6.1 3.0.1 - 5.7.2 3.1.0 - 5.7.3 3.1.1 (normalization is same as 3.1.0) - 5.8.0 3.2.0 - 5.8.1-5.8.3 4.0.0 - 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0) - 5.8.7-5.8.8 4.1.0 - 5.10.0 5.0.0 - 5.8.9, 5.10.1 5.1.0 - 5.12.x 5.2.0 - 5.14.x 6.0.0 - 5.16.x 6.1.0 - 5.18.x 6.2.0 - 5.20.x 6.3.0 - 5.22.x 7.0.0 - -=item Correction of decomposition mapping - -In older Unicode versions, a small number of characters (all of which are -CJK compatibility ideographs as far as they have been found) may have -an erroneous decomposition mapping (see F<NormalizationCorrections.txt>). -Anyhow, this module will neither refer to F<NormalizationCorrections.txt> -nor provide any specific version of normalization. Therefore this module -running on an older perl with an older Unicode database may use -the erroneous decomposition mapping blindly conforming to the Unicode database. - -=item Revised definition of canonical composition - -In Unicode 4.1.0, the definition D2 of canonical composition (which -affects NFC and NFKC) has been changed (see Public Review Issue #29 -and recent UAX #15). This module has used the newer definition -since the version 0.07 (Oct 31, 2001). -This module will not support the normalization according to the older -definition, even if the Unicode version implemented by perl is -lower than 4.1.0. - -=back - -=head1 AUTHOR - -SADAHIRO Tomoyuki <SADAHIRO@cpan.org> - -Currently maintained by <perl5-porters@perl.org> - -Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved. - -=head1 LICENSE - -This module is free software; you can redistribute it -and/or modify it under the same terms as Perl itself. - -=head1 SEE ALSO - -=over 4 - -=item http://www.unicode.org/reports/tr15/ - -Unicode Normalization Forms - UAX #15 - -=item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt - -Composition Exclusion Table - -=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt - -Derived Normalization Properties - -=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt - -Normalization Corrections - -=item http://www.unicode.org/review/pr-29.html - -Public Review Issue #29: Normalization Issue - -=item http://www.unicode.org/notes/tn5/ - -Canonical Equivalence in Applications - UTN #5 - -=back - -=cut diff --git a/cpan/Unicode-Normalize/Normalize.xs b/cpan/Unicode-Normalize/Normalize.xs deleted file mode 100644 index 4acff7fe49..0000000000 --- a/cpan/Unicode-Normalize/Normalize.xs +++ /dev/null @@ -1,925 +0,0 @@ - -#define PERL_NO_GET_CONTEXT /* we want efficiency */ - -/* private functions which need pTHX_ and aTHX_ - pv_cat_decompHangul - sv_2pvunicode - pv_utf8_decompose - pv_utf8_reorder - pv_utf8_compose -*/ - -#include "EXTERN.h" -#include "perl.h" -#include "XSUB.h" - -/* These 5 files are prepared by mkheader */ -#include "unfcmb.h" -#include "unfcan.h" -#include "unfcpt.h" -#include "unfcmp.h" -#include "unfexc.h" - -/* The generated normalization tables since v5.20 are in native character set - * terms. Prior to that, they were in Unicode terms. So we use 'uvchr' for - * later perls, and redefine that to be 'uvuni' for earlier ones */ -#if PERL_VERSION < 20 -# undef uvchr_to_utf8 -# ifdef uvuni_to_utf8 -# define uvchr_to_utf8 uvuni_to_utf8 -# else /* Perl 5.6.1 */ -# define uvchr_to_utf8 uv_to_utf8 -# endif - -# undef utf8n_to_uvchr -# ifdef utf8n_to_uvuni -# define utf8n_to_uvchr utf8n_to_uvuni -# else /* Perl 5.6.1 */ -# define utf8n_to_uvchr utf8_to_uv -# endif -#endif - -/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ -#ifndef UTF8_ALLOW_BOM -#define UTF8_ALLOW_BOM (0) -#endif /* UTF8_ALLOW_BOM */ - -#ifndef UTF8_ALLOW_SURROGATE -#define UTF8_ALLOW_SURROGATE (0) -#endif /* UTF8_ALLOW_SURROGATE */ - -#ifndef UTF8_ALLOW_FE_FF -#define UTF8_ALLOW_FE_FF (0) -#endif /* UTF8_ALLOW_FE_FF */ - -#ifndef UTF8_ALLOW_FFFF -#define UTF8_ALLOW_FFFF (0) -#endif /* UTF8_ALLOW_FFFF */ - -#ifndef PERL_UNUSED_VAR -# define PERL_UNUSED_VAR(x) ((void)sizeof(x)) -#endif - -#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF) - -/* check if the string buffer is enough before uvchr_to_utf8(). */ -/* dstart, d, and dlen should be defined outside before. */ -#define Renew_d_if_not_enough_to(need) STRLEN curlen = d - dstart; \ - if (dlen < curlen + (need)) { \ - dlen += (need); \ - Renew(dstart, dlen+1, U8); \ - d = dstart + curlen; \ - } - -/* if utf8n_to_uvchr() sets retlen to 0 (if broken?) */ -#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" - -/* utf8_hop() hops back before start. Maybe broken UTF-8 */ -#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" - -/* At present, char > 0x10ffff are unaffected without complaint, right? */ -#define VALID_UTF_MAX (0x10ffff) -#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) - -/* size of array for combining characters */ -/* enough as an initial value? */ -#define CC_SEQ_SIZE (10) -#define CC_SEQ_STEP (5) - -/* HANGUL begin */ -#define Hangul_SBase 0xAC00 -#define Hangul_SFinal 0xD7A3 -#define Hangul_SCount 11172 - -#define Hangul_NCount 588 - -#define Hangul_LBase 0x1100 -#define Hangul_LFinal 0x1112 -#define Hangul_LCount 19 - -#define Hangul_VBase 0x1161 -#define Hangul_VFinal 0x1175 -#define Hangul_VCount 21 - -#define Hangul_TBase 0x11A7 -#define Hangul_TFinal 0x11C2 -#define Hangul_TCount 28 - -#define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) -#define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) -#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) -#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) -#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) -#define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) -/* HANGUL end */ - -/* this is used for canonical ordering of combining characters (c.c.). */ -typedef struct { - U8 cc; /* combining class */ - UV uv; /* codepoint */ - STRLEN pos; /* position */ -} UNF_cc; - -static int compare_cc(const void *a, const void *b) -{ - int ret_cc; - ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; - if (ret_cc) - return ret_cc; - - return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) - - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); -} - -static U8* dec_canonical(UV uv) -{ - U8 ***plane, **row; - if (OVER_UTF_MAX(uv)) - return NULL; - plane = (U8***)UNF_canon[uv >> 16]; - if (! plane) - return NULL; - row = plane[(uv >> 8) & 0xff]; - return row ? row[uv & 0xff] : NULL; -} - -static U8* dec_compat(UV uv) -{ - U8 ***plane, **row; - if (OVER_UTF_MAX(uv)) - return NULL; - plane = (U8***)UNF_compat[uv >> 16]; - if (! plane) - return NULL; - row = plane[(uv >> 8) & 0xff]; - return row ? row[uv & 0xff] : NULL; -} - -static UV composite_uv(UV uv, UV uv2) -{ - UNF_complist ***plane, **row, *cell, *i; - - if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) - return 0; - - if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { - UV lindex = uv - Hangul_LBase; - UV vindex = uv2 - Hangul_VBase; - return(Hangul_SBase + (lindex * Hangul_VCount + vindex) * - Hangul_TCount); - } - if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { - UV tindex = uv2 - Hangul_TBase; - return(uv + tindex); - } - plane = UNF_compos[uv >> 16]; - if (! plane) - return 0; - row = plane[(uv >> 8) & 0xff]; - if (! row) - return 0; - cell = row[uv & 0xff]; - if (! cell) - return 0; - for (i = cell; i->nextchar; i++) { - if (uv2 == i->nextchar) - return i->composite; - } - return 0; -} - -static U8 getCombinClass(UV uv) -{ - U8 **plane, *row; - if (OVER_UTF_MAX(uv)) - return 0; - plane = (U8**)UNF_combin[uv >> 16]; - if (! plane) - return 0; - row = plane[(uv >> 8) & 0xff]; - return row ? row[uv & 0xff] : 0; -} - -static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv) -{ - UV sindex = uv - Hangul_SBase; - UV lindex = sindex / Hangul_NCount; - UV vindex = (sindex % Hangul_NCount) / Hangul_TCount; - UV tindex = sindex % Hangul_TCount; - - if (! Hangul_IsS(uv)) - return d; - - d = uvchr_to_utf8(d, (lindex + Hangul_LBase)); - d = uvchr_to_utf8(d, (vindex + Hangul_VBase)); - if (tindex) - d = uvchr_to_utf8(d, (tindex + Hangul_TBase)); - return d; -} - -static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp) -{ - char *s; - STRLEN len; - s = SvPV(sv,len); - if (!SvUTF8(sv)) { - SV* tmpsv = sv_2mortal(newSVpvn(s, len)); - if (!SvPOK(tmpsv)) - s = SvPV_force(tmpsv,len); - sv_utf8_upgrade(tmpsv); - s = SvPV(tmpsv,len); - } - if (lp) - *lp = len; - return s; -} - -static -U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) -{ - U8* p = s; - U8* e = s + slen; - U8* dstart = *dp; - U8* d = dstart; - - while (p < e) { - STRLEN retlen; - UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); - if (!retlen) - croak(ErrRetlenIsZero, "decompose"); - p += retlen; - - if (Hangul_IsS(uv)) { - Renew_d_if_not_enough_to(UTF8_MAXLEN * 3) - d = pv_cat_decompHangul(aTHX_ d, uv); - } - else { - U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv); - - if (r) { - STRLEN len = (STRLEN)strlen((char *)r); - Renew_d_if_not_enough_to(len) - while (len--) - *d++ = *r++; - } - else { - Renew_d_if_not_enough_to(UTF8_MAXLEN) - d = uvchr_to_utf8(d, uv); - } - } - } - *dp = dstart; - return d; -} - -static -U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen) -{ - U8* p = s; - U8* e = s + slen; - U8* dstart = *dp; - U8* d = dstart; - - UNF_cc seq_ary[CC_SEQ_SIZE]; - UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ - UNF_cc* seq_ext = NULL; /* extend if need */ - STRLEN seq_max = CC_SEQ_SIZE; - STRLEN cc_pos = 0; - - while (p < e) { - U8 curCC; - STRLEN retlen; - UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); - if (!retlen) - croak(ErrRetlenIsZero, "reorder"); - p += retlen; - - curCC = getCombinClass(uv); - - if (curCC != 0) { - if (seq_max < cc_pos + 1) { /* extend if need */ - seq_max = cc_pos + CC_SEQ_STEP; /* new size */ - if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ - STRLEN i; - New(0, seq_ext, seq_max, UNF_cc); - for (i = 0; i < cc_pos; i++) - seq_ext[i] = seq_ary[i]; - } - else { - Renew(seq_ext, seq_max, UNF_cc); - } - seq_ptr = seq_ext; /* use seq_ext from now */ - } - - seq_ptr[cc_pos].cc = curCC; - seq_ptr[cc_pos].uv = uv; - seq_ptr[cc_pos].pos = cc_pos; - ++cc_pos; - - if (p < e) - continue; - } - - /* output */ - if (cc_pos) { - STRLEN i; - - if (cc_pos > 1) /* reordered if there are two c.c.'s */ - qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); - - for (i = 0; i < cc_pos; i++) { - Renew_d_if_not_enough_to(UTF8_MAXLEN) - d = uvchr_to_utf8(d, seq_ptr[i].uv); - } - cc_pos = 0; - } - - if (curCC == 0) { - Renew_d_if_not_enough_to(UTF8_MAXLEN) - d = uvchr_to_utf8(d, uv); - } - } - if (seq_ext) - Safefree(seq_ext); - *dp = dstart; - return d; -} - -static -U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig) -{ - U8* p = s; - U8* e = s + slen; - U8* dstart = *dp; - U8* d = dstart; - - UV uvS = 0; /* code point of the starter */ - bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ - U8 preCC = 0; - - UV seq_ary[CC_SEQ_SIZE]; - UV* seq_ptr = seq_ary; /* use array at the beginning */ - UV* seq_ext = NULL; /* extend if need */ - STRLEN seq_max = CC_SEQ_SIZE; - STRLEN cc_pos = 0; - - while (p < e) { - U8 curCC; - STRLEN retlen; - UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); - if (!retlen) - croak(ErrRetlenIsZero, "compose"); - p += retlen; - - curCC = getCombinClass(uv); - - if (!valid_uvS) { - if (curCC == 0) { - uvS = uv; /* the first Starter is found */ - valid_uvS = TRUE; - if (p < e) - continue; - } - else { - Renew_d_if_not_enough_to(UTF8_MAXLEN) - d = uvchr_to_utf8(d, uv); - continue; - } - } - else { - bool composed; - - /* blocked */ - if ((iscontig && cc_pos) || /* discontiguous combination */ - (curCC != 0 && preCC == curCC) || /* blocked by same CC */ - (preCC > curCC)) /* blocked by higher CC: revised D2 */ - composed = FALSE; - - /* not blocked: - iscontig && cc_pos == 0 -- contiguous combination - curCC == 0 && preCC == 0 -- starter + starter - curCC != 0 && preCC < curCC -- lower CC */ - else { - /* try composition */ - UV uvComp = composite_uv(uvS, uv); - - if (uvComp && !isExclusion(uvComp)) { - uvS = uvComp; - composed = TRUE; - - /* preCC should not be changed to curCC */ - /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */ - if (p < e) - continue; - } - else - composed = FALSE; - } - - if (!composed) { - preCC = curCC; - if (curCC != 0 || !(p < e)) { - if (seq_max < cc_pos + 1) { /* extend if need */ - seq_max = cc_pos + CC_SEQ_STEP; /* new size */ - if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ - New(0, seq_ext, seq_max, UV); - Copy(seq_ary, seq_ext, cc_pos, UV); - } - else { - Renew(seq_ext, seq_max, UV); - } - seq_ptr = seq_ext; /* use seq_ext from now */ - } - seq_ptr[cc_pos] = uv; - ++cc_pos; - } - if (curCC != 0 && p < e) - continue; - } - } - - /* output */ - { - Renew_d_if_not_enough_to(UTF8_MAXLEN) - d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */ - } - - if (cc_pos) { - STRLEN i; - - for (i = 0; i < cc_pos; i++) { - Renew_d_if_not_enough_to(UTF8_MAXLEN) - d = uvchr_to_utf8(d, seq_ptr[i]); - } - cc_pos = 0; - } - - uvS = uv; - } - if (seq_ext) - Safefree(seq_ext); - *dp = dstart; - return d; -} - -MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize - -SV* -decompose(src, compat = &PL_sv_no) - SV * src - SV * compat - PROTOTYPE: $;$ - PREINIT: - SV* dst; - U8 *s, *d, *dend; - STRLEN slen, dlen; - CODE: - s = (U8*)sv_2pvunicode(aTHX_ src,&slen); - dst = newSVpvn("", 0); - dlen = slen; - New(0, d, dlen+1, U8); - dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat)); - sv_setpvn(dst, (char *)d, dend - d); - SvUTF8_on(dst); - Safefree(d); - RETVAL = dst; - OUTPUT: - RETVAL - - -SV* -reorder(src) - SV * src - PROTOTYPE: $ - PREINIT: - SV* dst; - U8 *s, *d, *dend; - STRLEN slen, dlen; - CODE: - s = (U8*)sv_2pvunicode(aTHX_ src,&slen); - dst = newSVpvn("", 0); - dlen = slen; - New(0, d, dlen+1, U8); - dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen); - sv_setpvn(dst, (char *)d, dend - d); - SvUTF8_on(dst); - Safefree(d); - RETVAL = dst; - OUTPUT: - RETVAL - - -SV* -compose(src) - SV * src - PROTOTYPE: $ - ALIAS: - composeContiguous = 1 - PREINIT: - SV* dst; - U8 *s, *d, *dend; - STRLEN slen, dlen; - CODE: - s = (U8*)sv_2pvunicode(aTHX_ src,&slen); - dst = newSVpvn("", 0); - dlen = slen; - New(0, d, dlen+1, U8); - dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix); - sv_setpvn(dst, (char *)d, dend - d); - SvUTF8_on(dst); - Safefree(d); - RETVAL = dst; - OUTPUT: - RETVAL - - -SV* -NFD(src) - SV * src - PROTOTYPE: $ - ALIAS: - NFKD = 1 - PREINIT: - SV *dst; - U8 *s, *t, *tend, *d, *dend; - STRLEN slen, tlen, dlen; - CODE: - s = (U8*)sv_2pvunicode(aTHX_ src,&slen); - - /* decompose */ - tlen = slen; - New(0, t, tlen+1, U8); - tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1)); - *tend = '\0'; - tlen = tend - t; /* no longer know real size of t */ - - /* reorder */ - dlen = tlen; - New(0, d, dlen+1, U8); - dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen); - *dend = '\0'; - dlen = dend - d; /* no longer know real size of d */ - - /* return */ - dst = newSVpvn("", 0); - sv_setpvn(dst, (char *)d, dlen); - SvUTF8_on(dst); - - Safefree(t); - Safefree(d); - RETVAL = dst; - OUTPUT: - RETVAL - - -SV* -NFC(src) - SV * src - PROTOTYPE: $ - ALIAS: - NFKC = 1 - FCC = 2 - PREINIT: - SV *dst; - U8 *s, *t, *tend, *u, *uend, *d, *dend; - STRLEN slen, tlen, ulen, dlen; - CODE: - s = (U8*)sv_2pvunicode(aTHX_ src,&slen); - - /* decompose */ - tlen = slen; - New(0, t, tlen+1, U8); - tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1)); - *tend = '\0'; - tlen = tend - t; /* no longer know real size of t */ - - /* reorder */ - ulen = tlen; - New(0, u, ulen+1, U8); - uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen); - *uend = '\0'; - ulen = uend - u; /* no longer know real size of u */ - - /* compose */ - dlen = ulen; - New(0, d, dlen+1, U8); - dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2)); - *dend = '\0'; - dlen = dend - d; /* no longer know real size of d */ - - /* return */ - dst = newSVpvn("", 0); - sv_setpvn(dst, (char *)d, dlen); - SvUTF8_on(dst); - - Safefree(t); - Safefree(u); - Safefree(d); - RETVAL = dst; - OUTPUT: - RETVAL - - -SV* -checkNFD(src) - SV * src - PROTOTYPE: $ - ALIAS: - checkNFKD = 1 - PREINIT: - STRLEN srclen, retlen; - U8 *s, *e, *p, curCC, preCC; - bool result = TRUE; - CODE: - s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); - e = s + srclen; - - preCC = 0; - for (p = s; p < e; p += retlen) { - UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); - if (!retlen) - croak(ErrRetlenIsZero, "checkNFD or -NFKD"); - - curCC = getCombinClass(uv); - if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ - result = FALSE; - break; - } - if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) { - result = FALSE; - break; - } - preCC = curCC; - } - RETVAL = boolSV(result); - OUTPUT: - RETVAL - - -SV* -checkNFC(src) - SV * src - PROTOTYPE: $ - ALIAS: - checkNFKC = 1 - PREINIT: - STRLEN srclen, retlen; - U8 *s, *e, *p, curCC, preCC; - bool result = TRUE; - bool isMAYBE = FALSE; - CODE: - s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); - e = s + srclen; - - preCC = 0; - for (p = s; p < e; p += retlen) { - UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); - if (!retlen) - croak(ErrRetlenIsZero, "checkNFC or -NFKC"); - - curCC = getCombinClass(uv); - if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ - result = FALSE; - break; - } - - /* get NFC/NFKC property */ - if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */ - ; /* YES */ - else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { - result = FALSE; - break; - } - else if (isComp2nd(uv)) - isMAYBE = TRUE; - else if (ix) { - char *canon, *compat; - /* NFKC_NO when having compatibility mapping. */ - canon = (char *) dec_canonical(uv); - compat = (char *) dec_compat(uv); - if (compat && !(canon && strEQ(canon, compat))) { - result = FALSE; - break; - } - } /* end of get NFC/NFKC property */ - - preCC = curCC; - } - if (isMAYBE && result) /* NO precedes MAYBE */ - XSRETURN_UNDEF; - RETVAL = boolSV(result); - OUTPUT: - RETVAL - - -SV* -checkFCD(src) - SV * src - PROTOTYPE: $ - ALIAS: - checkFCC = 1 - PREINIT: - STRLEN srclen, retlen; - U8 *s, *e, *p, curCC, preCC; - bool result = TRUE; - bool isMAYBE = FALSE; - CODE: - s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); - e = s + srclen; - preCC = 0; - for (p = s; p < e; p += retlen) { - U8 *sCan; - UV uvLead; - STRLEN canlen = 0; - UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); - if (!retlen) - croak(ErrRetlenIsZero, "checkFCD or -FCC"); - - sCan = (U8*) dec_canonical(uv); - - if (sCan) { - STRLEN canret; - canlen = (STRLEN)strlen((char *) sCan); - uvLead = utf8n_to_uvchr(sCan, canlen, &canret, AllowAnyUTF); - if (!canret) - croak(ErrRetlenIsZero, "checkFCD or -FCC"); - } - else { - uvLead = uv; - } - - curCC = getCombinClass(uvLead); - - if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */ - result = FALSE; - break; - } - - if (ix) { - if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { - result = FALSE; - break; - } - else if (isComp2nd(uv)) - isMAYBE = TRUE; - } - - if (sCan) { - STRLEN canret; - UV uvTrail; - U8* eCan = sCan + canlen; - U8* pCan = utf8_hop(eCan, -1); - if (pCan < sCan) - croak(ErrHopBeforeStart); - uvTrail = utf8n_to_uvchr(pCan, eCan - pCan, &canret, AllowAnyUTF); - if (!canret) - croak(ErrRetlenIsZero, "checkFCD or -FCC"); - preCC = getCombinClass(uvTrail); - } - else { - preCC = curCC; - } - } - if (isMAYBE && result) /* NO precedes MAYBE */ - XSRETURN_UNDEF; - RETVAL = boolSV(result); - OUTPUT: - RETVAL - - -U8 -getCombinClass(uv) - UV uv - PROTOTYPE: $ - -bool -isExclusion(uv) - UV uv - PROTOTYPE: $ - -bool -isSingleton(uv) - UV uv - PROTOTYPE: $ - -bool -isNonStDecomp(uv) - UV uv - PROTOTYPE: $ - -bool -isComp2nd(uv) - UV uv - PROTOTYPE: $ - ALIAS: - isNFC_MAYBE = 1 - isNFKC_MAYBE = 2 - INIT: - PERL_UNUSED_VAR(ix); - -SV* -isNFD_NO(uv) - UV uv - PROTOTYPE: $ - ALIAS: - isNFKD_NO = 1 - PREINIT: - bool result = FALSE; - CODE: - if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) - result = TRUE; /* NFD_NO or NFKD_NO */ - RETVAL = boolSV(result); - OUTPUT: - RETVAL - - -SV* -isComp_Ex(uv) - UV uv - PROTOTYPE: $ - ALIAS: - isNFC_NO = 0 - isNFKC_NO = 1 - PREINIT: - bool result = FALSE; - CODE: - if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) - result = TRUE; /* NFC_NO or NFKC_NO */ - else if (ix) { - char *canon, *compat; - canon = (char *) dec_canonical(uv); - compat = (char *) dec_compat(uv); - if (compat && (!canon || strNE(canon, compat))) - result = TRUE; /* NFC_NO or NFKC_NO */ - } - RETVAL = boolSV(result); - OUTPUT: - RETVAL - -SV* -getComposite(uv, uv2) - UV uv - UV uv2 - PROTOTYPE: $$ - PREINIT: - UV composite; - CODE: - composite = composite_uv(uv, uv2); - RETVAL = composite ? newSVuv(composite) : &PL_sv_undef; - OUTPUT: - RETVAL - - - -SV* -getCanon(uv) - UV uv - PROTOTYPE: $ - ALIAS: - getCompat = 1 - CODE: - if (Hangul_IsS(uv)) { - U8 tmp[3 * UTF8_MAXLEN + 1]; - U8 *t = tmp; - U8 *e = pv_cat_decompHangul(aTHX_ t, uv); - RETVAL = newSVpvn((char *)t, e - t); - } else { - U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv); - if (!rstr) - XSRETURN_UNDEF; - RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); - } - SvUTF8_on(RETVAL); - OUTPUT: - RETVAL - - -void -splitOnLastStarter(src) - SV * src - PREINIT: - SV *svp; - STRLEN srclen; - U8 *s, *e, *p; - PPCODE: - s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); - e = s + srclen; - p = e; - while (s < p) { - UV uv; - p = utf8_hop(p, -1); - if (p < s) - croak(ErrHopBeforeStart); - uv = utf8n_to_uvchr(p, e - p, NULL, AllowAnyUTF); - if (getCombinClass(uv) == 0) /* Last Starter found */ - break; - } - - svp = sv_2mortal(newSVpvn((char*)s, p - s)); - SvUTF8_on(svp); - XPUSHs(svp); - - svp = sv_2mortal(newSVpvn((char*)p, e - p)); - SvUTF8_on(svp); - XPUSHs(svp); - diff --git a/cpan/Unicode-Normalize/mkheader b/cpan/Unicode-Normalize/mkheader deleted file mode 100644 index 8d4c1b8e8d..0000000000 --- a/cpan/Unicode-Normalize/mkheader +++ /dev/null @@ -1,419 +0,0 @@ -#!perl -# -# This auxiliary script makes five header files -# used for building XSUB of Unicode::Normalize. -# -# Usage: -# <do 'mkheader'> in perl, or <perl mkheader> in command line -# -# Input files: -# unicore/CombiningClass.pl (or unicode/CombiningClass.pl) -# unicore/Decomposition.pl (or unicode/Decomposition.pl) -# -# Output files: -# unfcan.h -# unfcpt.h -# unfcmb.h -# unfcmp.h -# unfexc.h -# -use 5.006; -use strict; -use warnings; -use Carp; -use File::Spec; -use SelectSaver; - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - die "Unicode::Normalize cannot stringify a Unicode code point\n"; - } - unless (0x41 == unpack('U', 'A')) { - die "Unicode::Normalize cannot get Unicode code point\n"; - } -} - -our $PACKAGE = 'Unicode::Normalize, mkheader'; - -our $prefix = "UNF_"; -our $structname = "${prefix}complist"; - -# Starting in v5.20, the tables in lib/unicore are built using the platform's -# native character set for code points 0-255. -*pack_U = ($] ge 5.020) - ? sub { return pack('W*', @_).pack('U*'); } # The empty pack returns - # an empty UTF-8 string, - # so the effect is to - # force the return into - # being UTF-8. - : sub { return pack('U*', @_); }; - -# %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify() -our %Comp1st; # $codepoint => $listname : may be composed with a next char. -our %CompList; # $listname,$2nd => $codepoint : composite - -##### The below part is common to mkheader and PP ##### - -our %Combin; # $codepoint => $number : combination class -our %Canon; # $codepoint => \@codepoints : canonical decomp. -our %Compat; # $codepoint => \@codepoints : compat. decomp. -our %Compos; # $1st,$2nd => $codepoint : composite -our %Exclus; # $codepoint => 1 : composition exclusions -our %Single; # $codepoint => 1 : singletons -our %NonStD; # $codepoint => 1 : non-starter decompositions -our %Comp2nd; # $codepoint => 1 : may be composed with a prev char. - -# from core Unicode database -our $Combin = do "unicore/CombiningClass.pl" - || do "unicode/CombiningClass.pl" - || croak "$PACKAGE: CombiningClass.pl not found"; -our $Decomp = do "unicore/Decomposition.pl" - || do "unicode/Decomposition.pl" - || croak "$PACKAGE: Decomposition.pl not found"; - -# CompositionExclusions.txt since Unicode 3.2.0. If this ever changes, it -# would be better to get the values from Unicode::UCD rather than hard-code -# them here, as that will protect from having to make fixes for future -# changes. -our @CompEx = qw( - 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36 - 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76 - 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D - FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B - FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C - FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB - 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0 -); - -# definition of Hangul constants -use constant SBase => 0xAC00; -use constant SFinal => 0xD7A3; # SBase -1 + SCount -use constant SCount => 11172; # LCount * NCount -use constant NCount => 588; # VCount * TCount -use constant LBase => 0x1100; -use constant LFinal => 0x1112; -use constant LCount => 19; -use constant VBase => 0x1161; -use constant VFinal => 0x1175; -use constant VCount => 21; -use constant TBase => 0x11A7; -use constant TFinal => 0x11C2; -use constant TCount => 28; - -sub decomposeHangul { - my $sindex = $_[0] - SBase; - my $lindex = int( $sindex / NCount); - my $vindex = int(($sindex % NCount) / TCount); - my $tindex = $sindex % TCount; - my @ret = ( - LBase + $lindex, - VBase + $vindex, - $tindex ? (TBase + $tindex) : (), - ); - return wantarray ? @ret : pack_U(@ret); -} - -########## getting full decomposition ########## - -## converts string "hhhh hhhh hhhh" to a numeric list -## (hex digits separated by spaces) -sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g } - -while ($Combin =~ /(.+)/g) { - my @tab = split /\t/, $1; - my $ini = hex $tab[0]; - if ($tab[1] eq '') { - $Combin{$ini} = $tab[2]; - } else { - $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]); - } -} - -while ($Decomp =~ /(.+)/g) { - my @tab = split /\t/, $1; - my $compat = $tab[2] =~ s/<[^>]+>//; - my $dec = [ _getHexArray($tab[2]) ]; # decomposition - my $ini = hex($tab[0]); # initial decomposable character - my $end = $tab[1] eq '' ? $ini : hex($tab[1]); - # ($ini .. $end) is the range of decomposable characters. - - foreach my $u ($ini .. $end) { - $Compat{$u} = $dec; - $Canon{$u} = $dec if ! $compat; - } -} - -for my $s (@CompEx) { - my $u = hex $s; - next if !$Canon{$u}; # not assigned - next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2 - $Exclus{$u} = 1; -} - -foreach my $u (keys %Canon) { - my $dec = $Canon{$u}; - - if (@$dec == 2) { - if ($Combin{ $dec->[0] }) { - $NonStD{$u} = 1; - } else { - $Compos{ $dec->[0] }{ $dec->[1] } = $u; - $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; - } - } elsif (@$dec == 1) { - $Single{$u} = 1; - } else { - my $h = sprintf '%04X', $u; - croak("Weird Canonical Decomposition of U+$h"); - } -} - -# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo -foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { - $Comp2nd{$j} = 1; -} - -sub getCanonList { - my @src = @_; - my @dec = map { - (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) - : $Canon{$_} ? @{ $Canon{$_} } : $_ - } @src; - return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); - # condition @src == @dec is not ok. -} - -sub getCompatList { - my @src = @_; - my @dec = map { - (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) - : $Compat{$_} ? @{ $Compat{$_} } : $_ - } @src; - return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); - # condition @src == @dec is not ok. -} - -# exhaustive decomposition -foreach my $key (keys %Canon) { - $Canon{$key} = [ getCanonList($key) ]; -} - -# exhaustive decomposition -foreach my $key (keys %Compat) { - $Compat{$key} = [ getCompatList($key) ]; -} - -##### The above part is common to mkheader and PP ##### - -foreach my $comp1st (keys %Compos) { - my $listname = sprintf("${structname}_%06x", $comp1st); - # %04x is bad since it'd place _3046 after _1d157. - $Comp1st{$comp1st} = $listname; - my $rh1st = $Compos{$comp1st}; - - foreach my $comp2nd (keys %$rh1st) { - my $uc = $rh1st->{$comp2nd}; - $CompList{$listname}{$comp2nd} = $uc; - } -} - -sub split_into_char { - use bytes; - my $uni = shift; - my $len = length($uni); - my @ary; - for(my $i = 0; $i < $len; ++$i) { - push @ary, ord(substr($uni,$i,1)); - } - return @ary; -} - -sub _U_stringify { - sprintf '"%s"', join '', - map sprintf("\\x%02x", $_), split_into_char(pack_U(@_)); -} - -foreach my $hash (\%Canon, \%Compat) { - foreach my $key (keys %$hash) { - $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); - } -} - -########## writing header files ########## - -my @boolfunc = ( - { - name => "Exclusion", - type => "bool", - hash => \%Exclus, - }, - { - name => "Singleton", - type => "bool", - hash => \%Single, - }, - { - name => "NonStDecomp", - type => "bool", - hash => \%NonStD, - }, - { - name => "Comp2nd", - type => "bool", - hash => \%Comp2nd, - }, -); - -my $orig_fh = SelectSaver->new; -{ - -my $file = "unfexc.h"; -open FH, ">$file" or croak "$PACKAGE: $file can't be made"; -binmode FH; select FH; - - print << 'EOF'; -/* - * This file is auto-generated by mkheader. - * Any changes here will be lost! - */ -EOF - -foreach my $tbl (@boolfunc) { - my @temp = sort {$a <=> $b} keys %{$tbl->{hash}}; - my $type = $tbl->{type}; - my $name = $tbl->{name}; - print "$type is$name (UV uv)\n{\nreturn\n\t"; - - while (@temp) { - my $cur = shift @temp; - if (@temp && $cur + 1 == $temp[0]) { - print "($cur <= uv && uv <= "; - while (@temp && $cur + 1 == $temp[0]) { - $cur = shift @temp; - } - print "$cur)"; - print "\n\t|| " if @temp; - } else { - print "uv == $cur"; - print "\n\t|| " if @temp; - } - } - print "\n\t? TRUE : FALSE;\n}\n\n"; -} - -close FH; - -#################################### - -my $compinit = - "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; - -foreach my $i (sort keys %CompList) { - $compinit .= "$structname $i [] = {\n"; - $compinit .= join ",\n", - map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), - sort {$a <=> $b } keys %{ $CompList{$i} }; - $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel -} - -my @tripletable = ( - { - file => "unfcmb", - name => "combin", - type => "STDCHAR", - hash => \%Combin, - null => 0, - }, - { - file => "unfcan", - name => "canon", - type => "char*", - hash => \%Canon, - null => "NULL", - }, - { - file => "unfcpt", - name => "compat", - type => "char*", - hash => \%Compat, - null => "NULL", - }, - { - file => "unfcmp", - name => "compos", - type => "$structname *", - hash => \%Comp1st, - null => "NULL", - init => $compinit, - }, -); - -foreach my $tbl (@tripletable) { - my $file = "$tbl->{file}.h"; - my $head = "${prefix}$tbl->{name}"; - my $type = $tbl->{type}; - my $hash = $tbl->{hash}; - my $null = $tbl->{null}; - my $init = $tbl->{init}; - - open FH, ">$file" or croak "$PACKAGE: $file can't be made"; - binmode FH; select FH; - my %val; - - print FH << 'EOF'; -/* - * This file is auto-generated by mkheader. - * Any changes here will be lost! - */ -EOF - - print $init if defined $init; - - foreach my $uv (keys %$hash) { - croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) - unless $uv <= 0x10FFFF; - my @c = unpack 'CCCC', pack 'N', $uv; - $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; - } - - foreach my $p (sort { $a <=> $b } keys %val) { - next if ! $val{ $p }; - for (my $r = 0; $r < 256; $r++) { - next if ! $val{ $p }{ $r }; - printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r; - for (my $c = 0; $c < 256; $c++) { - print "\t", defined $val{$p}{$r}{$c} - ? "($type)".$val{$p}{$r}{$c} - : $null; - print ',' if $c != 255; - print "\n" if $c % 8 == 7; - } - print "};\n\n"; - } - } - foreach my $p (sort { $a <=> $b } keys %val) { - next if ! $val{ $p }; - printf "static $type* ${head}_%02x [256] = {\n", $p; - for (my $r = 0; $r < 256; $r++) { - print $val{ $p }{ $r } - ? sprintf("${head}_%02x_%02x", $p, $r) - : "NULL"; - print ',' if $r != 255; - print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; - } - print "};\n\n"; - } - print "static $type** $head [] = {\n"; - for (my $p = 0; $p <= 0x10; $p++) { - print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; - print ',' if $p != 0x10; - print "\n"; - } - print "};\n\n"; - close FH; -} - -} # End of block for SelectSaver - -1; -__END__ diff --git a/cpan/Unicode-Normalize/t/fcdc.t b/cpan/Unicode-Normalize/t/fcdc.t deleted file mode 100644 index d2ef28b9e9..0000000000 --- a/cpan/Unicode-Normalize/t/fcdc.t +++ /dev/null @@ -1,138 +0,0 @@ - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; - exit 0; - } - unless (0x41 == unpack('U', 'A')) { - print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; - exit 0; - } -} - -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); - } -} - -######################### - -use strict; -use warnings; -BEGIN { $| = 1; print "1..70\n"; } -my $count = 0; -sub ok ($;$) { - my $p = my $r = shift; - if (@_) { - my $x = shift; - $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; - } - print $p ? "ok" : "not ok", ' ', ++$count, "\n"; -} - -use Unicode::Normalize qw(:all); - -ok(1); - -sub _pack_U { Unicode::Normalize::pack_U(@_) } -sub hexU { _pack_U map hex, split ' ', shift } -sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" } - -######################### - -ok(FCD(''), ""); -ok(FCC(''), ""); -ok(FCD('A'), "A"); -ok(FCC('A'), "A"); - -ok(normalize('FCD', ""), ""); -ok(normalize('FCC', ""), ""); -ok(normalize('FCC', "A"), "A"); -ok(normalize('FCD', "A"), "A"); - -# 9 - -# if checkFCD is YES, the return value from FCD should be same as the original -ok(FCD(hexU("00C5")), hexU("00C5")); # A with ring above -ok(FCD(hexU("0041 030A")), hexU("0041 030A")); # A+ring -ok(FCD(hexU("0041 0327 030A")), hexU("0041 0327 030A")); # A+cedilla+ring -ok(FCD(hexU("AC01 1100 1161")), hexU("AC01 1100 1161")); # hangul -ok(FCD(hexU("212B F900")), hexU("212B F900")); # compat - -ok(normalize('FCD', hexU("00C5")), hexU("00C5")); -ok(normalize('FCD', hexU("0041 030A")), hexU("0041 030A")); -ok(normalize('FCD', hexU("0041 0327 030A")), hexU("0041 0327 030A")); -ok(normalize('FCD', hexU("AC01 1100 1161")), hexU("AC01 1100 1161")); -ok(normalize('FCD', hexU("212B F900")), hexU("212B F900")); - -# 19 - -# if checkFCD is MAYBE or NO, FCD returns NFD (this behavior isn't documented) -ok(FCD(hexU("00C5 0327")), hexU("0041 0327 030A")); -ok(FCD(hexU("0041 030A 0327")), hexU("0041 0327 030A")); -ok(FCD(hexU("00C5 0327")), NFD(hexU("00C5 0327"))); -ok(FCD(hexU("0041 030A 0327")), NFD(hexU("0041 030A 0327"))); - -ok(normalize('FCD', hexU("00C5 0327")), hexU("0041 0327 030A")); -ok(normalize('FCD', hexU("0041 030A 0327")), hexU("0041 0327 030A")); -ok(normalize('FCD', hexU("00C5 0327")), NFD(hexU("00C5 0327"))); -ok(normalize('FCD', hexU("0041 030A 0327")), NFD(hexU("0041 030A 0327"))); - -# 27 - -ok(answer(checkFCD('')), 'YES'); -ok(answer(checkFCD('A')), 'YES'); -ok(answer(checkFCD("\x{030A}")), 'YES'); # 030A;COMBINING RING ABOVE -ok(answer(checkFCD("\x{0327}")), 'YES'); # 0327;COMBINING CEDILLA -ok(answer(checkFCD(_pack_U(0x00C5))), 'YES'); # A with ring above -ok(answer(checkFCD(hexU("0041 030A"))), 'YES'); # A+ring -ok(answer(checkFCD(hexU("0041 0327 030A"))), 'YES'); # A+cedilla+ring -ok(answer(checkFCD(hexU("0041 030A 0327"))), 'NO'); # A+ring+cedilla -ok(answer(checkFCD(hexU("00C5 0327"))), 'NO'); # A-ring+cedilla -ok(answer(checkNFC(hexU("00C5 0327"))), 'MAYBE'); # NFC: A-ring+cedilla -ok(answer(check("FCD", hexU("00C5 0327"))), 'NO'); -ok(answer(check("NFC", hexU("00C5 0327"))), 'MAYBE'); -ok(answer(checkFCD("\x{AC01}\x{1100}\x{1161}")), 'YES'); # hangul -ok(answer(checkFCD("\x{212B}\x{F900}")), 'YES'); # compat - -ok(answer(checkFCD(hexU("1EA7 05AE 0315 0062"))), "NO"); -ok(answer(checkFCC(hexU("1EA7 05AE 0315 0062"))), "NO"); -ok(answer(check('FCD', hexU("1EA7 05AE 0315 0062"))), "NO"); -ok(answer(check('FCC', hexU("1EA7 05AE 0315 0062"))), "NO"); - -# 45 - -ok(FCC(hexU("00C5 0327")), hexU("0041 0327 030A")); -ok(FCC(hexU("0045 0304 0300")), "\x{1E14}"); -ok(FCC("\x{1100}\x{1161}\x{1100}\x{1173}\x{11AF}"), "\x{AC00}\x{AE00}"); -ok(normalize('FCC', hexU("00C5 0327")), hexU("0041 0327 030A")); -ok(normalize('FCC', hexU("0045 0304 0300")), "\x{1E14}"); -ok(normalize('FCC', hexU("1100 1161 1100 1173 11AF")), "\x{AC00}\x{AE00}"); - -ok(FCC("\x{0B47}\x{0300}\x{0B3E}"), "\x{0B47}\x{0300}\x{0B3E}"); -ok(FCC("\x{1100}\x{0300}\x{1161}"), "\x{1100}\x{0300}\x{1161}"); -ok(FCC("\x{0B47}\x{0B3E}\x{0300}"), "\x{0B4B}\x{0300}"); -ok(FCC("\x{1100}\x{1161}\x{0300}"), "\x{AC00}\x{0300}"); -ok(FCC("\x{0B47}\x{300}\x{0B3E}\x{327}"), "\x{0B47}\x{300}\x{0B3E}\x{327}"); -ok(FCC("\x{1100}\x{300}\x{1161}\x{327}"), "\x{1100}\x{300}\x{1161}\x{327}"); - -# 57 - -ok(answer(checkFCC('')), 'YES'); -ok(answer(checkFCC('A')), 'YES'); -ok(answer(checkFCC("\x{030A}")), 'MAYBE'); # 030A;COMBINING RING ABOVE -ok(answer(checkFCC("\x{0327}")), 'MAYBE'); # 0327;COMBINING CEDILLA -ok(answer(checkFCC(hexU("00C5"))), 'YES'); # A with ring above -ok(answer(checkFCC(hexU("0041 030A"))), 'MAYBE'); # A+ring -ok(answer(checkFCC(hexU("0041 0327 030A"))), 'MAYBE'); # A+cedilla+ring -ok(answer(checkFCC(hexU("0041 030A 0327"))), 'NO'); # A+ring+cedilla -ok(answer(checkFCC(hexU("00C5 0327"))), 'NO'); # A-ring+cedilla -ok(answer(checkFCC("\x{AC01}\x{1100}\x{1161}")), 'MAYBE'); # hangul -ok(answer(checkFCC("\x{212B}\x{F900}")), 'NO'); # compat -ok(answer(checkFCC("\x{212B}\x{0327}")), 'NO'); # compat -ok(answer(checkFCC("\x{0327}\x{212B}")), 'NO'); # compat - -# 70 - diff --git a/cpan/Unicode-Normalize/t/form.t b/cpan/Unicode-Normalize/t/form.t deleted file mode 100644 index 6bbfb082ca..0000000000 --- a/cpan/Unicode-Normalize/t/form.t +++ /dev/null @@ -1,84 +0,0 @@ - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; - exit 0; - } - unless (0x41 == unpack('U', 'A')) { - print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; - exit 0; - } -} - -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); - } -} - -######################### - -use strict; -use warnings; -BEGIN { $| = 1; print "1..37\n"; } -my $count = 0; -sub ok ($;$) { - my $p = my $r = shift; - if (@_) { - my $x = shift; - $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; - } - print $p ? "ok" : "not ok", ' ', ++$count, "\n"; -} - -use Unicode::Normalize qw(:all); - -ok(1); - -sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" } - -######################### - -ok(NFD ("\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{FF76}"); -ok(NFC ("\x{304C}\x{FF76}"), "\x{304C}\x{FF76}"); -ok(NFKD("\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{30AB}"); -ok(NFKC("\x{304C}\x{FF76}"), "\x{304C}\x{30AB}"); - -ok(answer(checkNFD ("\x{304C}")), "NO"); -ok(answer(checkNFC ("\x{304C}")), "YES"); -ok(answer(checkNFKD("\x{304C}")), "NO"); -ok(answer(checkNFKC("\x{304C}")), "YES"); -ok(answer(checkNFD ("\x{FF76}")), "YES"); -ok(answer(checkNFC ("\x{FF76}")), "YES"); -ok(answer(checkNFKD("\x{FF76}")), "NO"); -ok(answer(checkNFKC("\x{FF76}")), "NO"); - -ok(normalize('D', "\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{FF76}"); -ok(normalize('C', "\x{304C}\x{FF76}"), "\x{304C}\x{FF76}"); -ok(normalize('KD',"\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{30AB}"); -ok(normalize('KC',"\x{304C}\x{FF76}"), "\x{304C}\x{30AB}"); - -ok(answer(check('D', "\x{304C}")), "NO"); -ok(answer(check('C', "\x{304C}")), "YES"); -ok(answer(check('KD',"\x{304C}")), "NO"); -ok(answer(check('KC',"\x{304C}")), "YES"); -ok(answer(check('D' ,"\x{FF76}")), "YES"); -ok(answer(check('C' ,"\x{FF76}")), "YES"); -ok(answer(check('KD',"\x{FF76}")), "NO"); -ok(answer(check('KC',"\x{FF76}")), "NO"); - -ok(normalize('NFD', "\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{FF76}"); -ok(normalize('NFC', "\x{304C}\x{FF76}"), "\x{304C}\x{FF76}"); -ok(normalize('NFKD',"\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{30AB}"); -ok(normalize('NFKC',"\x{304C}\x{FF76}"), "\x{304C}\x{30AB}"); - -ok(answer(check('NFD', "\x{304C}")), "NO"); -ok(answer(check('NFC', "\x{304C}")), "YES"); -ok(answer(check('NFKD',"\x{304C}")), "NO"); -ok(answer(check('NFKC',"\x{304C}")), "YES"); -ok(answer(check('NFD' ,"\x{FF76}")), "YES"); -ok(answer(check('NFC' ,"\x{FF76}")), "YES"); -ok(answer(check('NFKD',"\x{FF76}")), "NO"); -ok(answer(check('NFKC',"\x{FF76}")), "NO"); - diff --git a/cpan/Unicode-Normalize/t/func.t b/cpan/Unicode-Normalize/t/func.t deleted file mode 100644 index 2bd6e504a3..0000000000 --- a/cpan/Unicode-Normalize/t/func.t +++ /dev/null @@ -1,386 +0,0 @@ - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; - exit 0; - } - unless (0x41 == unpack('U', 'A')) { - print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; - exit 0; - } -} - -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); - } -} - -######################### - -use strict; -use warnings; -BEGIN { $| = 1; print "1..217\n"; } -my $count = 0; -sub ok ($;$) { - my $p = my $r = shift; - if (@_) { - my $x = shift; - $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; - } - print $p ? "ok" : "not ok", ' ', ++$count, "\n"; -} - -use Unicode::Normalize qw(:all); - -ok(1); - -sub _pack_U { Unicode::Normalize::pack_U(@_) } -sub hexU { _pack_U map hex, split ' ', shift } - -# This won't work on EBCDIC platforms prior to v5.8.0, which is when this -# translation function was defined -*to_native = (defined &utf8::unicode_to_native) - ? \&utf8::unicode_to_native - : sub { return shift }; - -######################### - -ok(getCombinClass( to_native(0)), 0); -ok(getCombinClass(to_native(41)), 0); -ok(getCombinClass(to_native(65)), 0); -ok(getCombinClass( 768), 230); -ok(getCombinClass(1809), 36); - -ok(getCanon(to_native( 0)), undef); -ok(getCanon(to_native(0x29)), undef); -ok(getCanon(to_native(0x41)), undef); -ok(getCanon(to_native(0x00C0)), _pack_U(0x0041, 0x0300)); -ok(getCanon(to_native(0x00EF)), _pack_U(0x0069, 0x0308)); -ok(getCanon(0x304C), _pack_U(0x304B, 0x3099)); -ok(getCanon(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301)); -ok(getCanon(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)); -ok(getCanon(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)); -ok(getCanon(0xAC00), _pack_U(0x1100, 0x1161)); -ok(getCanon(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF)); -ok(getCanon(0x212C), undef); -ok(getCanon(0x3243), undef); -ok(getCanon(0xFA2D), _pack_U(0x9DB4)); - -# 20 - -ok(getCompat(to_native( 0)), undef); -ok(getCompat(to_native(0x29)), undef); -ok(getCompat(to_native(0x41)), undef); -ok(getCompat(to_native(0x00C0)), _pack_U(0x0041, 0x0300)); -ok(getCompat(to_native(0x00EF)), _pack_U(0x0069, 0x0308)); -ok(getCompat(0x304C), _pack_U(0x304B, 0x3099)); -ok(getCompat(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301)); -ok(getCompat(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)); -ok(getCompat(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)); -ok(getCompat(0x212C), _pack_U(0x0042)); -ok(getCompat(0x3243), _pack_U(0x0028, 0x81F3, 0x0029)); -ok(getCompat(0xAC00), _pack_U(0x1100, 0x1161)); -ok(getCompat(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF)); -ok(getCompat(0xFA2D), _pack_U(0x9DB4)); - -# 34 - -ok(getComposite(to_native( 0), to_native( 0)), undef); -ok(getComposite(to_native( 0), to_native(0x29)), undef); -ok(getComposite(to_native(0x29), to_native( 0)), undef); -ok(getComposite(to_native(0x29), to_native(0x29)), undef); -ok(getComposite(to_native( 0), to_native(0x41)), undef); -ok(getComposite(to_native(0x41), to_native( 0)), undef); -ok(getComposite(to_native(0x41), to_native(0x41)), undef); -ok(getComposite(to_native(12), to_native(0x0300)), undef); -ok(getComposite(to_native(0x0055), 0xFF00), undef); -ok(getComposite(to_native(0x0041), 0x0300), to_native(0x00C0)); -ok(getComposite(to_native(0x0055), 0x0300), to_native(0x00D9)); -ok(getComposite(0x0112, 0x0300), 0x1E14); -ok(getComposite(0x1100, 0x1161), 0xAC00); -ok(getComposite(0x1100, 0x1173), 0xADF8); -ok(getComposite(0x1100, 0x11AF), undef); -ok(getComposite(0x1173, 0x11AF), undef); -ok(getComposite(0xAC00, 0x11A7), undef); -ok(getComposite(0xAC00, 0x11A8), 0xAC01); -ok(getComposite(0xADF8, 0x11AF), 0xAE00); - -# 53 - -sub uprops { - my $uv = shift; - my $r = ""; - $r .= isExclusion($uv) ? 'X' : 'x'; - $r .= isSingleton($uv) ? 'S' : 's'; - $r .= isNonStDecomp($uv) ? 'N' : 'n'; # Non-Starter Decomposition - $r .= isComp_Ex($uv) ? 'F' : 'f'; # Full exclusion (X + S + N) - $r .= isComp2nd($uv) ? 'B' : 'b'; # B = M = Y - $r .= isNFD_NO($uv) ? 'D' : 'd'; - $r .= isNFC_MAYBE($uv) ? 'M' : 'm'; # Maybe - $r .= isNFC_NO($uv) ? 'C' : 'c'; - $r .= isNFKD_NO($uv) ? 'K' : 'k'; - $r .= isNFKC_MAYBE($uv) ? 'Y' : 'y'; # maYbe - $r .= isNFKC_NO($uv) ? 'G' : 'g'; - return $r; -} - -ok(uprops(to_native(0x0000)), 'xsnfbdmckyg'); # NULL -ok(uprops(to_native(0x0029)), 'xsnfbdmckyg'); # RIGHT PARENTHESIS -ok(uprops(to_native(0x0041)), 'xsnfbdmckyg'); # LATIN CAPITAL LETTER A -ok(uprops(to_native(0x00A0)), 'xsnfbdmcKyG'); # NO-BREAK SPACE -ok(uprops(to_native(0x00C0)), 'xsnfbDmcKyg'); # LATIN CAPITAL LETTER A WITH GRAVE -ok(uprops(0x0300), 'xsnfBdMckYg'); # COMBINING GRAVE ACCENT -ok(uprops(0x0344), 'xsNFbDmCKyG'); # COMBINING GREEK DIALYTIKA TONOS -ok(uprops(0x0387), 'xSnFbDmCKyG'); # GREEK ANO TELEIA -ok(uprops(0x0958), 'XsnFbDmCKyG'); # DEVANAGARI LETTER QA -ok(uprops(0x0F43), 'XsnFbDmCKyG'); # TIBETAN LETTER GHA -ok(uprops(0x1100), 'xsnfbdmckyg'); # HANGUL CHOSEONG KIYEOK -ok(uprops(0x1161), 'xsnfBdMckYg'); # HANGUL JUNGSEONG A -ok(uprops(0x11AF), 'xsnfBdMckYg'); # HANGUL JONGSEONG RIEUL -ok(uprops(0x212B), 'xSnFbDmCKyG'); # ANGSTROM SIGN -ok(uprops(0xAC00), 'xsnfbDmcKyg'); # HANGUL SYLLABLE GA -ok(uprops(0xF900), 'xSnFbDmCKyG'); # CJK COMPATIBILITY IDEOGRAPH-F900 -ok(uprops(0xFB4E), 'XsnFbDmCKyG'); # HEBREW LETTER PE WITH RAFE -ok(uprops(0xFF71), 'xsnfbdmcKyG'); # HALFWIDTH KATAKANA LETTER A - -# 71 - -ok(decompose(""), ""); -ok(decompose("A"), "A"); -ok(decompose("", 1), ""); -ok(decompose("A", 1), "A"); - -ok(decompose(hexU("1E14 AC01")), hexU("0045 0304 0300 1100 1161 11A8")); -ok(decompose(hexU("AC00 AE00")), hexU("1100 1161 1100 1173 11AF")); -ok(decompose(hexU("304C FF76")), hexU("304B 3099 FF76")); - -ok(decompose(hexU("1E14 AC01"), 1), hexU("0045 0304 0300 1100 1161 11A8")); -ok(decompose(hexU("AC00 AE00"), 1), hexU("1100 1161 1100 1173 11AF")); -ok(decompose(hexU("304C FF76"), 1), hexU("304B 3099 30AB")); - -# don't modify the source -my $sDec = "\x{FA19}"; -ok(decompose($sDec), "\x{795E}"); -ok($sDec, "\x{FA19}"); - -# 83 - -ok(reorder(""), ""); -ok(reorder("A"), "A"); -ok(reorder(hexU("0041 0300 0315 0313 031b 0061")), - hexU("0041 031b 0300 0313 0315 0061")); -ok(reorder(hexU("00C1 0300 0315 0313 031b 0061 309A 3099")), - hexU("00C1 031b 0300 0313 0315 0061 309A 3099")); - -# don't modify the source -my $sReord = "\x{3000}\x{300}\x{31b}"; -ok(reorder($sReord), "\x{3000}\x{31b}\x{300}"); -ok($sReord, "\x{3000}\x{300}\x{31b}"); - -# 89 - -ok(compose(""), ""); -ok(compose("A"), "A"); -ok(compose(hexU("0061 0300")), hexU("00E0")); -ok(compose(hexU("0061 0300 031B")), hexU("00E0 031B")); -ok(compose(hexU("0061 0300 0315")), hexU("00E0 0315")); -ok(compose(hexU("0061 0300 0313")), hexU("00E0 0313")); -ok(compose(hexU("0061 031B 0300")), hexU("00E0 031B")); -ok(compose(hexU("0061 0315 0300")), hexU("0061 0315 0300")); -ok(compose(hexU("0061 0313 0300")), hexU("0061 0313 0300")); - -# don't modify the source -my $sCom = "\x{304B}\x{3099}"; -ok(compose($sCom), "\x{304C}"); -ok($sCom, "\x{304B}\x{3099}"); - -# 100 - -ok(composeContiguous(""), ""); -ok(composeContiguous("A"), "A"); -ok(composeContiguous(hexU("0061 0300")), hexU("00E0")); -ok(composeContiguous(hexU("0061 0300 031B")), hexU("00E0 031B")); -ok(composeContiguous(hexU("0061 0300 0315")), hexU("00E0 0315")); -ok(composeContiguous(hexU("0061 0300 0313")), hexU("00E0 0313")); -ok(composeContiguous(hexU("0061 031B 0300")), hexU("0061 031B 0300")); -ok(composeContiguous(hexU("0061 0315 0300")), hexU("0061 0315 0300")); -ok(composeContiguous(hexU("0061 0313 0300")), hexU("0061 0313 0300")); - -# don't modify the source -my $sCtg = "\x{30DB}\x{309A}"; -ok(composeContiguous($sCtg), "\x{30DD}"); -ok($sCtg, "\x{30DB}\x{309A}"); - -# 111 - -sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" } - -ok(answer(checkNFD("")), "YES"); -ok(answer(checkNFC("")), "YES"); -ok(answer(checkNFKD("")), "YES"); -ok(answer(checkNFKC("")), "YES"); -ok(answer(check("NFD", "")), "YES"); -ok(answer(check("NFC", "")), "YES"); -ok(answer(check("NFKD","")), "YES"); -ok(answer(check("NFKC","")), "YES"); - -# U+0000 to U+007F are prenormalized in all the normalization forms. -ok(answer(checkNFD("AZaz\t12!#`")), "YES"); -ok(answer(checkNFC("AZaz\t12!#`")), "YES"); -ok(answer(checkNFKD("AZaz\t12!#`")), "YES"); -ok(answer(checkNFKC("AZaz\t12!#`")), "YES"); -ok(answer(check("D", "AZaz\t12!#`")), "YES"); -ok(answer(check("C", "AZaz\t12!#`")), "YES"); -ok(answer(check("KD","AZaz\t12!#`")), "YES"); -ok(answer(check("KC","AZaz\t12!#`")), "YES"); - -ok(answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))), "YES"); -ok(answer(checkNFD(hexU("20 C1 1100 1173 11AF"))), "NO"); -ok(answer(checkNFC(hexU("20 C1 1173 11AF"))), "MAYBE"); -ok(answer(checkNFC(hexU("20 C1 AE00 1100"))), "YES"); -ok(answer(checkNFC(hexU("20 C1 AE00 1100 0300"))), "MAYBE"); -ok(answer(checkNFC(hexU("212B 1100 0300"))), "NO"); -ok(answer(checkNFC(hexU("1100 0300 212B"))), "NO"); -ok(answer(checkNFC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring -ok(answer(checkNFC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla -ok(answer(checkNFC(hexU("20 C1 FF71 2025"))),"YES"); -ok(answer(check("NFC", hexU("20 C1 212B 300"))), "NO"); -ok(answer(checkNFKD(hexU("20 C1 FF71 2025"))), "NO"); -ok(answer(checkNFKC(hexU("20 C1 AE00 2025"))), "NO"); -ok(answer(checkNFKC(hexU("212B 1100 0300"))), "NO"); -ok(answer(checkNFKC(hexU("1100 0300 212B"))), "NO"); -ok(answer(checkNFKC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring -ok(answer(checkNFKC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla -ok(answer(check("NFKC", hexU("20 C1 212B 300"))), "NO"); - -# 145 - -"012ABC" =~ /(\d+)(\w+)/; -ok("012" eq NFC $1 && "ABC" eq NFC $2); - -ok(normalize('C', $1), "012"); -ok(normalize('C', $2), "ABC"); - -ok(normalize('NFC', $1), "012"); -ok(normalize('NFC', $2), "ABC"); - # s/^NF// in normalize() must not prevent using $1, $&, etc. - -# 150 - -# a string with initial zero should be treated like a number - -# LATIN CAPITAL LETTER A WITH GRAVE -ok(getCombinClass(sprintf("0%d", to_native(192))), 0); -ok(getCanon (sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300)); -ok(getCompat(sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300)); -my $lead_zero = sprintf "0%d", to_native(65); -ok(getComposite($lead_zero, "0768"), to_native(192)); -ok(isNFD_NO (sprintf("0%d", to_native(192)))); -ok(isNFKD_NO(sprintf("0%d", to_native(192)))); - -# DEVANAGARI LETTER QA -ok(isExclusion("02392")); -ok(isComp_Ex ("02392")); -ok(isNFC_NO ("02392")); -ok(isNFKC_NO ("02392")); -ok(isNFD_NO ("02392")); -ok(isNFKD_NO ("02392")); - -# ANGSTROM SIGN -ok(isSingleton("08491")); -ok(isComp_Ex ("08491")); -ok(isNFC_NO ("08491")); -ok(isNFKC_NO ("08491")); -ok(isNFD_NO ("08491")); -ok(isNFKD_NO ("08491")); - -# COMBINING GREEK DIALYTIKA TONOS -ok(isNonStDecomp("0836")); -ok(isComp_Ex ("0836")); -ok(isNFC_NO ("0836")); -ok(isNFKC_NO ("0836")); -ok(isNFD_NO ("0836")); -ok(isNFKD_NO ("0836")); - -# COMBINING GRAVE ACCENT -ok(getCombinClass("0768"), 230); -ok(isComp2nd ("0768")); -ok(isNFC_MAYBE ("0768")); -ok(isNFKC_MAYBE("0768")); - -# HANGUL SYLLABLE GA -ok(getCombinClass("044032"), 0); -ok(getCanon("044032"), _pack_U(0x1100, 0x1161)); -ok(getCompat("044032"), _pack_U(0x1100, 0x1161)); -ok(getComposite("04352", "04449"), 0xAC00); - -# 182 - -# string with 22 combining characters: (0x300..0x315) -my $str_cc22 = _pack_U(0x3041, 0x300..0x315, 0x3042); -ok(decompose($str_cc22), $str_cc22); -ok(reorder($str_cc22), $str_cc22); -ok(compose($str_cc22), $str_cc22); -ok(composeContiguous($str_cc22), $str_cc22); -ok(NFD($str_cc22), $str_cc22); -ok(NFC($str_cc22), $str_cc22); -ok(NFKD($str_cc22), $str_cc22); -ok(NFKC($str_cc22), $str_cc22); -ok(FCD($str_cc22), $str_cc22); -ok(FCC($str_cc22), $str_cc22); - -# 192 - -# string with 40 combining characters of the same class: (0x300..0x313)x2 -my $str_cc40 = _pack_U(0x3041, 0x300..0x313, 0x300..0x313, 0x3042); -ok(decompose($str_cc40), $str_cc40); -ok(reorder($str_cc40), $str_cc40); -ok(compose($str_cc40), $str_cc40); -ok(composeContiguous($str_cc40), $str_cc40); -ok(NFD($str_cc40), $str_cc40); -ok(NFC($str_cc40), $str_cc40); -ok(NFKD($str_cc40), $str_cc40); -ok(NFKC($str_cc40), $str_cc40); -ok(FCD($str_cc40), $str_cc40); -ok(FCC($str_cc40), $str_cc40); - -# 202 - -my $precomp = hexU("304C 304E 3050 3052 3054"); -my $combseq = hexU("304B 3099 304D 3099 304F 3099 3051 3099 3053 3099"); -ok(decompose($precomp x 5), $combseq x 5); -ok(decompose($precomp x 10), $combseq x 10); -ok(decompose($precomp x 20), $combseq x 20); - -my $hangsyl = hexU("AC00 B098 B2E4 B77C B9C8"); -my $jamoseq = hexU("1100 1161 1102 1161 1103 1161 1105 1161 1106 1161"); -ok(decompose($hangsyl x 5), $jamoseq x 5); -ok(decompose($hangsyl x 10), $jamoseq x 10); -ok(decompose($hangsyl x 20), $jamoseq x 20); - -my $notcomp = hexU("304B 304D 304F 3051 3053"); -ok(decompose($precomp . $notcomp), $combseq . $notcomp); -ok(decompose($precomp . $notcomp x 5), $combseq . $notcomp x 5); -ok(decompose($precomp . $notcomp x10), $combseq . $notcomp x10); - -# 211 - -my $preUnicode3_1 = !defined getCanon(0x1D15E); -my $preUnicode3_2 = !defined getCanon(0x2ADC); - -# HEBREW LETTER YOD WITH HIRIQ -ok($preUnicode3_1 xor isExclusion(0xFB1D)); -ok($preUnicode3_1 xor isComp_Ex (0xFB1D)); - -# MUSICAL SYMBOL HALF NOTE -ok($preUnicode3_1 xor isExclusion(0x1D15E)); -ok($preUnicode3_1 xor isComp_Ex (0x1D15E)); - -# FORKING -ok($preUnicode3_2 xor isExclusion(0x2ADC)); -ok($preUnicode3_2 xor isComp_Ex (0x2ADC)); - -# 217 - diff --git a/cpan/Unicode-Normalize/t/illegal.t b/cpan/Unicode-Normalize/t/illegal.t deleted file mode 100644 index ccf2b4aae6..0000000000 --- a/cpan/Unicode-Normalize/t/illegal.t +++ /dev/null @@ -1,85 +0,0 @@ - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; - exit 0; - } - unless (0x41 == unpack('U', 'A')) { - print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; - exit 0; - } -} - -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); - } -} - -BEGIN { - unless (5.006001 <= $]) { - print "1..0 # skipped: Perl 5.6.1 or later". - " needed for this test\n"; - exit; - } -} - -######################### - -BEGIN { - use Unicode::Normalize qw(:all); - - unless (exists &Unicode::Normalize::bootstrap or 5.008 <= $]) { - print "1..0 # skipped: XSUB, or Perl 5.8.0 or later". - " needed for this test\n"; - print $@; - exit; - } -} - -use strict; -use warnings; - -BEGIN { $| = 1; print "1..113\n"; } -my $count = 0; -sub ok ($;$) { - my $p = my $r = shift; - if (@_) { - my $x = shift; - $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; - } - print $p ? "ok" : "not ok", ' ', ++$count, "\n"; -} - -ok(1); - -######################### - -no warnings qw(utf8); - -for my $u (0xD800, 0xDFFF, 0xFDD0, 0xFDEF, 0xFEFF, 0xFFFE, 0xFFFF, - 0x1FFFF, 0x10FFFF, 0x110000, 0x3FFFFFFF) -{ - my $c = chr $u; - ok($c eq NFD($c)); # 1 - ok($c eq NFC($c)); # 2 - ok($c eq NFKD($c)); # 3 - ok($c eq NFKC($c)); # 4 - ok($c eq FCD($c)); # 5 - ok($c eq FCC($c)); # 6 - ok($c eq decompose($c)); # 7 - ok($c eq decompose($c,1)); # 8 - ok($c eq reorder($c)); # 9 - ok($c eq compose($c)); # 10 -} - -our $proc; # before the last starter -our $unproc; # the last starter and after - -sub _pack_U { Unicode::Normalize::pack_U(@_) } - -($proc, $unproc) = splitOnLastStarter(_pack_U(0x41, 0x300, 0x327, 0xFFFF)); -ok($proc eq _pack_U(0x41, 0x300, 0x327)); -ok($unproc eq "\x{FFFF}"); - diff --git a/cpan/Unicode-Normalize/t/norm.t b/cpan/Unicode-Normalize/t/norm.t deleted file mode 100644 index d3cec3aea1..0000000000 --- a/cpan/Unicode-Normalize/t/norm.t +++ /dev/null @@ -1,145 +0,0 @@ - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; - exit 0; - } - unless (0x41 == unpack('U', 'A')) { - print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; - exit 0; - } -} - -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); - } -} - -######################### - -use strict; -use warnings; -BEGIN { $| = 1; print "1..64\n"; } -my $count = 0; -sub ok ($;$) { - my $p = my $r = shift; - if (@_) { - my $x = shift; - $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; - } - print $p ? "ok" : "not ok", ' ', ++$count, "\n"; -} - -use Unicode::Normalize qw(normalize); - -ok(1); - -sub _pack_U { Unicode::Normalize::pack_U(@_) } -sub _unpack_U { Unicode::Normalize::unpack_U(@_) } - -######################### - -ok(normalize('D', ""), ""); -ok(normalize('C', ""), ""); -ok(normalize('KD',""), ""); -ok(normalize('KC',""), ""); - -ok(normalize('D', "A"), "A"); -ok(normalize('C', "A"), "A"); -ok(normalize('KD',"A"), "A"); -ok(normalize('KC',"A"), "A"); - -ok(normalize('NFD', ""), ""); -ok(normalize('NFC', ""), ""); -ok(normalize('NFKD',""), ""); -ok(normalize('NFKC',""), ""); - -ok(normalize('NFD', "A"), "A"); -ok(normalize('NFC', "A"), "A"); -ok(normalize('NFKD',"A"), "A"); -ok(normalize('NFKC',"A"), "A"); - -# 17 - -# don't modify the source -my $sNFD = "\x{FA19}"; -ok(normalize('NFD', $sNFD), "\x{795E}"); -ok($sNFD, "\x{FA19}"); - -my $sNFC = "\x{FA1B}"; -ok(normalize('NFC', $sNFC), "\x{798F}"); -ok($sNFC, "\x{FA1B}"); - -my $sNFKD = "\x{FA1E}"; -ok(normalize('NFKD', $sNFKD), "\x{7FBD}"); -ok($sNFKD, "\x{FA1E}"); - -my $sNFKC = "\x{FA26}"; -ok(normalize('NFKC', $sNFKC), "\x{90FD}"); -ok($sNFKC, "\x{FA26}"); - -# 25 - -sub hexNFC { - join " ", map sprintf("%04X", $_), - _unpack_U normalize 'C', _pack_U map hex, split ' ', shift; -} -sub hexNFD { - join " ", map sprintf("%04X", $_), - _unpack_U normalize 'D', _pack_U map hex, split ' ', shift; -} - -ok(hexNFD("1E14 AC01"), "0045 0304 0300 1100 1161 11A8"); -ok(hexNFD("AC00 AE00"), "1100 1161 1100 1173 11AF"); - -ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062"); -ok(hexNFC("00E0 05AE 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); -ok(hexNFC("0061 05AE 0300 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); -ok(hexNFC("0045 0304 0300 AC00 11A8"), "1E14 AC01"); -ok(hexNFC("1100 1161 1100 1173 11AF"), "AC00 AE00"); -ok(hexNFC("1100 0300 1161 1173 11AF"), "1100 0300 1161 1173 11AF"); - -ok(hexNFD("0061 0315 0300 05AE 05C4 0062"), "0061 05AE 0300 05C4 0315 0062"); -ok(hexNFD("00E0 05AE 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062"); -ok(hexNFD("0061 05AE 0300 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062"); -ok(hexNFC("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062"); -ok(hexNFC("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); -ok(hexNFD("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062"); -ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); -ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000"); -ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000"); - -ok(hexNFC("AC00 11A7"), "AC00 11A7"); -ok(hexNFC("AC00 11A8"), "AC01"); -ok(hexNFC("AC00 11A9"), "AC02"); -ok(hexNFC("AC00 11C2"), "AC1B"); -ok(hexNFC("AC00 11C3"), "AC00 11C3"); - -# 47 - -# Test Cases from Public Review Issue #29: Normalization Issue -# cf. http://www.unicode.org/review/pr-29.html -ok(hexNFC("0B47 0300 0B3E"), "0B47 0300 0B3E"); -ok(hexNFC("1100 0300 1161"), "1100 0300 1161"); -ok(hexNFC("0B47 0B3E 0300"), "0B4B 0300"); -ok(hexNFC("1100 1161 0300"), "AC00 0300"); -ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327"); -ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327"); - -ok(hexNFC("0300 0041"), "0300 0041"); -ok(hexNFC("0300 0301 0041"), "0300 0301 0041"); -ok(hexNFC("0301 0300 0041"), "0301 0300 0041"); -ok(hexNFC("0000 0300 0000 0301"), "0000 0300 0000 0301"); -ok(hexNFC("0000 0301 0000 0300"), "0000 0301 0000 0300"); - -ok(hexNFC("0327 0061 0300"), "0327 00E0"); -ok(hexNFC("0301 0061 0300"), "0301 00E0"); -ok(hexNFC("0315 0061 0300"), "0315 00E0"); -ok(hexNFC("0000 0327 0061 0300"), "0000 0327 00E0"); -ok(hexNFC("0000 0301 0061 0300"), "0000 0301 00E0"); -ok(hexNFC("0000 0315 0061 0300"), "0000 0315 00E0"); - -# 64 - diff --git a/cpan/Unicode-Normalize/t/null.t b/cpan/Unicode-Normalize/t/null.t deleted file mode 100644 index 9a0008708e..0000000000 --- a/cpan/Unicode-Normalize/t/null.t +++ /dev/null @@ -1,100 +0,0 @@ - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; - exit 0; - } - unless (0x41 == unpack('U', 'A')) { - print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; - exit 0; - } -} - -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); - } -} - -######################### - -use strict; -use warnings; - -use Unicode::Normalize qw(:all); -print "1..24\n"; - -print "ok 1\n"; - -# if $_ is not NULL-terminated, test may fail. - -$_ = compose('abc'); -print /c$/ ? "ok" : "not ok", " 2\n"; - -$_ = decompose('abc'); -print /c$/ ? "ok" : "not ok", " 3\n"; - -$_ = reorder('abc'); -print /c$/ ? "ok" : "not ok", " 4\n"; - -$_ = NFD('abc'); -print /c$/ ? "ok" : "not ok", " 5\n"; - -$_ = NFC('abc'); -print /c$/ ? "ok" : "not ok", " 6\n"; - -$_ = NFKD('abc'); -print /c$/ ? "ok" : "not ok", " 7\n"; - -$_ = NFKC('abc'); -print /c$/ ? "ok" : "not ok", " 8\n"; - -$_ = FCC('abc'); -print /c$/ ? "ok" : "not ok", " 9\n"; - -$_ = decompose("\x{304C}abc"); -print /c$/ ? "ok" : "not ok", " 10\n"; - -$_ = decompose("\x{304B}\x{3099}abc"); -print /c$/ ? "ok" : "not ok", " 11\n"; - -$_ = reorder("\x{304C}abc"); -print /c$/ ? "ok" : "not ok", " 12\n"; - -$_ = reorder("\x{304B}\x{3099}abc"); -print /c$/ ? "ok" : "not ok", " 13\n"; - -$_ = compose("\x{304C}abc"); -print /c$/ ? "ok" : "not ok", " 14\n"; - -$_ = compose("\x{304B}\x{3099}abc"); -print /c$/ ? "ok" : "not ok", " 15\n"; - -$_ = NFD("\x{304C}abc"); -print /c$/ ? "ok" : "not ok", " 16\n"; - -$_ = NFC("\x{304C}abc"); -print /c$/ ? "ok" : "not ok", " 17\n"; - -$_ = NFKD("\x{304C}abc"); -print /c$/ ? "ok" : "not ok", " 18\n"; - -$_ = NFKC("\x{304C}abc"); -print /c$/ ? "ok" : "not ok", " 19\n"; - -$_ = FCC("\x{304C}abc"); -print /c$/ ? "ok" : "not ok", " 20\n"; - -$_ = getCanon(0x100); -print s/.$// ? "ok" : "not ok", " 21\n"; - -$_ = getCompat(0x100); -print s/.$// ? "ok" : "not ok", " 22\n"; - -$_ = getCanon(0xAC00); -print s/.$// ? "ok" : "not ok", " 23\n"; - -$_ = getCompat(0xAC00); -print s/.$// ? "ok" : "not ok", " 24\n"; - diff --git a/cpan/Unicode-Normalize/t/partial1.t b/cpan/Unicode-Normalize/t/partial1.t deleted file mode 100644 index 3e44a63dc0..0000000000 --- a/cpan/Unicode-Normalize/t/partial1.t +++ /dev/null @@ -1,120 +0,0 @@ - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; - exit 0; - } - unless (0x41 == unpack('U', 'A')) { - print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; - exit 0; - } -} - -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); - } -} - -BEGIN { - unless (5.006001 <= $]) { - print "1..0 # skipped: Perl 5.6.1 or later". - " needed for this test\n"; - exit; - } -} - -######################### - -use strict; -use warnings; -BEGIN { $| = 1; print "1..26\n"; } -my $count = 0; -sub ok ($;$) { - my $p = my $r = shift; - if (@_) { - my $x = shift; - $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; - } - print $p ? "ok" : "not ok", ' ', ++$count, "\n"; -} - -use Unicode::Normalize qw(:all); - -ok(1); - -sub _pack_U { Unicode::Normalize::pack_U(@_) } -sub _unpack_U { Unicode::Normalize::unpack_U(@_) } - -######################### - -sub arraynorm { - my $form = shift; - my @string = @_; - my $result = ""; - my $unproc = ""; - foreach my $str (@string) { - $unproc .= $str; - $result .= $form eq 'NFC' ? NFC_partial ($unproc) : - $form eq 'NFD' ? NFD_partial ($unproc) : - $form eq 'NFKC' ? NFKC_partial($unproc) : - $form eq 'NFKD' ? NFKD_partial($unproc) : - undef; - } - $result .= $unproc; - return $result; -} - -my $strD = "\x{3C9}\x{301}\x{1100}\x{1161}\x{11A8}\x{1100}\x{1161}\x{11AA}"; -my $strC = "\x{3CE}\x{AC01}\x{AC03}"; -my @str1 = (substr($strD,0,3), substr($strD,3,4), substr($strD,7)); -my @str2 = (substr($strD,0,1), substr($strD,1,3), substr($strD,4)); -ok($strC eq NFC($strD)); -ok($strD eq join('', @str1)); -ok($strC eq arraynorm('NFC', @str1)); -ok($strD eq join('', @str2)); -ok($strC eq arraynorm('NFC', @str2)); - -my @strX = ("\x{300}\x{AC00}", "\x{11A8}"); -my $strX = "\x{300}\x{AC01}"; -ok($strX eq NFC(join('', @strX))); -ok($strX eq arraynorm('NFC', @strX)); -ok($strX eq NFKC(join('', @strX))); -ok($strX eq arraynorm('NFKC', @strX)); - -my @strY = ("\x{304B}\x{0308}", "\x{0323}\x{3099}"); -my $strY = ("\x{304C}\x{0323}\x{0308}"); -ok($strY eq NFC(join('', @strY))); -ok($strY eq arraynorm('NFC', @strY)); -ok($strY eq NFKC(join('', @strY))); -ok($strY eq arraynorm('NFKC', @strY)); - -my @strZ = ("\x{304B}\x{0308}", "\x{0323}", "\x{3099}"); -my $strZ = ("\x{304B}\x{3099}\x{0323}\x{0308}"); -ok($strZ eq NFD(join('', @strZ))); -ok($strZ eq arraynorm('NFD', @strZ)); -ok($strZ eq NFKD(join('', @strZ))); -ok($strZ eq arraynorm('NFKD', @strZ)); - -# 18 - -# must modify the source -my $sNFD = "\x{FA19}"; -ok(NFD_partial($sNFD), ""); -ok($sNFD, "\x{795E}"); - -my $sNFC = "\x{FA1B}"; -ok(NFC_partial($sNFC), ""); -ok($sNFC, "\x{798F}"); - -my $sNFKD = "\x{FA1E}"; -ok(NFKD_partial($sNFKD), ""); -ok($sNFKD, "\x{7FBD}"); - -my $sNFKC = "\x{FA26}"; -ok(NFKC_partial($sNFKC), ""); -ok($sNFKC, "\x{90FD}"); - -# 26 - diff --git a/cpan/Unicode-Normalize/t/partial2.t b/cpan/Unicode-Normalize/t/partial2.t deleted file mode 100644 index 7f19e9365b..0000000000 --- a/cpan/Unicode-Normalize/t/partial2.t +++ /dev/null @@ -1,116 +0,0 @@ - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; - exit 0; - } - unless (0x41 == unpack('U', 'A')) { - print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; - exit 0; - } -} - -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); - } -} - -BEGIN { - unless (5.006001 <= $]) { - print "1..0 # skipped: Perl 5.6.1 or later". - " needed for this test\n"; - exit; - } -} - -######################### - -use strict; -use warnings; -BEGIN { $| = 1; print "1..26\n"; } -my $count = 0; -sub ok ($;$) { - my $p = my $r = shift; - if (@_) { - my $x = shift; - $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; - } - print $p ? "ok" : "not ok", ' ', ++$count, "\n"; -} - -use Unicode::Normalize qw(:all); - -ok(1); - -sub _pack_U { Unicode::Normalize::pack_U(@_) } -sub _unpack_U { Unicode::Normalize::unpack_U(@_) } - -######################### - -sub arraynorm { - my $form = shift; - my @string = @_; - my $result = ""; - my $unproc = ""; - foreach my $str (@string) { - $unproc .= $str; - $result .= normalize_partial($form, $unproc); - } - $result .= $unproc; - return $result; -} - -my $strD = "\x{3C9}\x{301}\x{1100}\x{1161}\x{11A8}\x{1100}\x{1161}\x{11AA}"; -my $strC = "\x{3CE}\x{AC01}\x{AC03}"; -my @str1 = (substr($strD,0,3), substr($strD,3,4), substr($strD,7)); -my @str2 = (substr($strD,0,1), substr($strD,1,3), substr($strD,4)); -ok($strC eq NFC($strD)); -ok($strD eq join('', @str1)); -ok($strC eq arraynorm('NFC', @str1)); -ok($strD eq join('', @str2)); -ok($strC eq arraynorm('NFC', @str2)); - -my @strX = ("\x{300}\x{AC00}", "\x{11A8}"); -my $strX = "\x{300}\x{AC01}"; -ok($strX eq NFC(join('', @strX))); -ok($strX eq arraynorm('NFC', @strX)); -ok($strX eq NFKC(join('', @strX))); -ok($strX eq arraynorm('NFKC', @strX)); - -my @strY = ("\x{304B}\x{0308}", "\x{0323}\x{3099}"); -my $strY = ("\x{304C}\x{0323}\x{0308}"); -ok($strY eq NFC(join('', @strY))); -ok($strY eq arraynorm('NFC', @strY)); -ok($strY eq NFKC(join('', @strY))); -ok($strY eq arraynorm('NFKC', @strY)); - -my @strZ = ("\x{304B}\x{0308}", "\x{0323}", "\x{3099}"); -my $strZ = ("\x{304B}\x{3099}\x{0323}\x{0308}"); -ok($strZ eq NFD(join('', @strZ))); -ok($strZ eq arraynorm('NFD', @strZ)); -ok($strZ eq NFKD(join('', @strZ))); -ok($strZ eq arraynorm('NFKD', @strZ)); - -# 18 - -# must modify the source -my $sNFD = "\x{FA19}"; -ok(normalize_partial('NFD', $sNFD), ""); -ok($sNFD, "\x{795E}"); - -my $sNFC = "\x{FA1B}"; -ok(normalize_partial('NFC', $sNFC), ""); -ok($sNFC, "\x{798F}"); - -my $sNFKD = "\x{FA1E}"; -ok(normalize_partial('NFKD', $sNFKD), ""); -ok($sNFKD, "\x{7FBD}"); - -my $sNFKC = "\x{FA26}"; -ok(normalize_partial('NFKC', $sNFKC), ""); -ok($sNFKC, "\x{90FD}"); - -# 26 - diff --git a/cpan/Unicode-Normalize/t/proto.t b/cpan/Unicode-Normalize/t/proto.t deleted file mode 100644 index 38c6985759..0000000000 --- a/cpan/Unicode-Normalize/t/proto.t +++ /dev/null @@ -1,99 +0,0 @@ - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; - exit 0; - } - unless (0x41 == unpack('U', 'A')) { - print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; - exit 0; - } -} - -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); - } -} - -######################### - -use strict; -use warnings; -BEGIN { $| = 1; print "1..48\n"; } -my $count = 0; -sub ok ($;$) { - my $p = my $r = shift; - if (@_) { - my $x = shift; - $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; - } - print $p ? "ok" : "not ok", ' ', ++$count, "\n"; -} - -use Unicode::Normalize qw(:all); - -ok(1); - -######################### - -# unary op. RING-CEDILLA -ok( "\x{30A}\x{327}" ne "\x{327}\x{30A}"); -ok(NFD "\x{30A}\x{327}" eq "\x{327}\x{30A}"); -ok(NFC "\x{30A}\x{327}" eq "\x{327}\x{30A}"); -ok(NFKD "\x{30A}\x{327}" eq "\x{327}\x{30A}"); -ok(NFKC "\x{30A}\x{327}" eq "\x{327}\x{30A}"); -ok(FCD "\x{30A}\x{327}" eq "\x{327}\x{30A}"); -ok(FCC "\x{30A}\x{327}" eq "\x{327}\x{30A}"); -ok(reorder "\x{30A}\x{327}" eq "\x{327}\x{30A}"); - -# 9 - -ok(prototype \&normalize,'$$'); -ok(prototype \&NFD, '$'); -ok(prototype \&NFC, '$'); -ok(prototype \&NFKD, '$'); -ok(prototype \&NFKC, '$'); -ok(prototype \&FCD, '$'); -ok(prototype \&FCC, '$'); - -ok(prototype \&check, '$$'); -ok(prototype \&checkNFD, '$'); -ok(prototype \&checkNFC, '$'); -ok(prototype \&checkNFKD,'$'); -ok(prototype \&checkNFKC,'$'); -ok(prototype \&checkFCD, '$'); -ok(prototype \&checkFCC, '$'); - -ok(prototype \&decompose, '$;$'); -ok(prototype \&reorder, '$'); -ok(prototype \&compose, '$'); -ok(prototype \&composeContiguous, '$'); - -# 27 - -ok(prototype \&getCanon, '$'); -ok(prototype \&getCompat, '$'); -ok(prototype \&getComposite, '$$'); -ok(prototype \&getCombinClass,'$'); -ok(prototype \&isExclusion, '$'); -ok(prototype \&isSingleton, '$'); -ok(prototype \&isNonStDecomp, '$'); -ok(prototype \&isComp2nd, '$'); -ok(prototype \&isComp_Ex, '$'); -ok(prototype \&isNFD_NO, '$'); -ok(prototype \&isNFC_NO, '$'); -ok(prototype \&isNFC_MAYBE, '$'); -ok(prototype \&isNFKD_NO, '$'); -ok(prototype \&isNFKC_NO, '$'); -ok(prototype \&isNFKC_MAYBE, '$'); -ok(prototype \&splitOnLastStarter, undef); -ok(prototype \&normalize_partial, '$$'); -ok(prototype \&NFD_partial, '$'); -ok(prototype \&NFC_partial, '$'); -ok(prototype \&NFKD_partial, '$'); -ok(prototype \&NFKC_partial, '$'); - -# 48 - diff --git a/cpan/Unicode-Normalize/t/split.t b/cpan/Unicode-Normalize/t/split.t deleted file mode 100644 index a92957c208..0000000000 --- a/cpan/Unicode-Normalize/t/split.t +++ /dev/null @@ -1,147 +0,0 @@ - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; - exit 0; - } - unless (0x41 == unpack('U', 'A')) { - print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; - exit 0; - } -} - -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); - } -} - -BEGIN { - unless (5.006001 <= $]) { - print "1..0 # skipped: Perl 5.6.1 or later". - " needed for this test\n"; - exit; - } -} - -######################### - -use strict; -use warnings; -BEGIN { $| = 1; print "1..34\n"; } -my $count = 0; -sub ok ($;$) { - my $p = my $r = shift; - if (@_) { - my $x = shift; - $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; - } - print $p ? "ok" : "not ok", ' ', ++$count, "\n"; -} - -use Unicode::Normalize qw(:all); - -ok(1); - -sub _pack_U { Unicode::Normalize::pack_U(@_) } -sub _unpack_U { Unicode::Normalize::unpack_U(@_) } - -######################### - -our $proc; # before the last starter -our $unproc; # the last starter and after -# If string has no starter, entire string is set to $unproc. - -($proc, $unproc) = splitOnLastStarter(""); -ok($proc, ""); -ok($unproc, ""); - -($proc, $unproc) = splitOnLastStarter("A"); -ok($proc, ""); -ok($unproc, "A"); - -($proc, $unproc) = splitOnLastStarter(_pack_U(0x41, 0x300, 0x327, 0x42)); -ok($proc, _pack_U(0x41, 0x300, 0x327)); -ok($unproc, "B"); - -($proc, $unproc) = splitOnLastStarter(_pack_U(0x4E00, 0x41, 0x301)); -ok($proc, _pack_U(0x4E00)); -ok($unproc, _pack_U(0x41, 0x301)); - -($proc, $unproc) = splitOnLastStarter(_pack_U(0x302, 0x301, 0x300)); -ok($proc, ""); -ok($unproc, _pack_U(0x302, 0x301, 0x300)); - -our $ka_grave = _pack_U(0x41, 0, 0x42, 0x304B, 0x300); -our $dakuten = _pack_U(0x3099); -our $ga_grave = _pack_U(0x41, 0, 0x42, 0x304C, 0x300); - -our ($p, $u) = splitOnLastStarter($ka_grave); -our $concat = $p . NFC($u.$dakuten); - -ok(NFC($ka_grave.$dakuten) eq $ga_grave); -ok(NFC($ka_grave).NFC($dakuten) ne $ga_grave); -ok($concat eq $ga_grave); - -# 14 - -sub arraynorm { - my $form = shift; - my @string = @_; - my $result = ""; - my $unproc = ""; - foreach my $str (@string) { - $unproc .= $str; - my $n = normalize($form, $unproc); - my($p, $u) = splitOnLastStarter($n); - $result .= $p; - $unproc = $u; - } - $result .= $unproc; - return $result; -} - -my $strD = "\x{3C9}\x{301}\x{1100}\x{1161}\x{11A8}\x{1100}\x{1161}\x{11AA}"; -my $strC = "\x{3CE}\x{AC01}\x{AC03}"; -my @str1 = (substr($strD,0,3), substr($strD,3,4), substr($strD,7)); -my @str2 = (substr($strD,0,1), substr($strD,1,3), substr($strD,4)); -ok($strC eq NFC($strD)); -ok($strD eq join('', @str1)); -ok($strC eq arraynorm('NFC', @str1)); -ok($strD eq join('', @str2)); -ok($strC eq arraynorm('NFC', @str2)); - -my @strX = ("\x{300}\x{AC00}", "\x{11A8}"); -my $strX = "\x{300}\x{AC01}"; -ok($strX eq NFC(join('', @strX))); -ok($strX eq arraynorm('NFC', @strX)); -ok($strX eq NFKC(join('', @strX))); -ok($strX eq arraynorm('NFKC', @strX)); - -my @strY = ("\x{304B}\x{0308}", "\x{0323}\x{3099}"); -my $strY = ("\x{304C}\x{0323}\x{0308}"); -ok($strY eq NFC(join('', @strY))); -ok($strY eq arraynorm('NFC', @strY)); -ok($strY eq NFKC(join('', @strY))); -ok($strY eq arraynorm('NFKC', @strY)); - -my @strZ = ("\x{304B}\x{0308}", "\x{0323}", "\x{3099}"); -my $strZ = ("\x{304B}\x{3099}\x{0323}\x{0308}"); -ok($strZ eq NFD(join('', @strZ))); -ok($strZ eq arraynorm('NFD', @strZ)); -ok($strZ eq NFKD(join('', @strZ))); -ok($strZ eq arraynorm('NFKD', @strZ)); - -# 31 - -# don't modify the source - -my $source = "ABC"; -($proc, $unproc) = splitOnLastStarter($source); -ok($proc, "AB"); -ok($unproc, "C"); -ok($source, "ABC"); - -# 34 - diff --git a/cpan/Unicode-Normalize/t/test.t b/cpan/Unicode-Normalize/t/test.t deleted file mode 100644 index cb4b6ea637..0000000000 --- a/cpan/Unicode-Normalize/t/test.t +++ /dev/null @@ -1,168 +0,0 @@ - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; - exit 0; - } - unless (0x41 == unpack('U', 'A')) { - print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; - exit 0; - } -} - -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); - } -} - -######################### - -use strict; -use warnings; -BEGIN { $| = 1; print "1..72\n"; } -my $count = 0; -sub ok ($;$) { - my $p = my $r = shift; - if (@_) { - my $x = shift; - $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; - } - print $p ? "ok" : "not ok", ' ', ++$count, "\n"; -} - -use Unicode::Normalize; - -ok(1); - -sub _pack_U { Unicode::Normalize::pack_U(@_) } -sub _unpack_U { Unicode::Normalize::unpack_U(@_) } - -######################### - -ok(NFD(""), ""); -ok(NFC(""), ""); -ok(NFKD(""), ""); -ok(NFKC(""), ""); - -ok(NFD("A"), "A"); -ok(NFC("A"), "A"); -ok(NFKD("A"), "A"); -ok(NFKC("A"), "A"); - -# 9 - -# don't modify the source -my $sNFD = "\x{FA19}"; -ok(NFD($sNFD), "\x{795E}"); -ok($sNFD, "\x{FA19}"); - -my $sNFC = "\x{FA1B}"; -ok(NFC($sNFC), "\x{798F}"); -ok($sNFC, "\x{FA1B}"); - -my $sNFKD = "\x{FA1E}"; -ok(NFKD($sNFKD), "\x{7FBD}"); -ok($sNFKD, "\x{FA1E}"); - -my $sNFKC = "\x{FA26}"; -ok(NFKC($sNFKC), "\x{90FD}"); -ok($sNFKC, "\x{FA26}"); - -# 17 - -sub hexNFC { - join " ", map sprintf("%04X", $_), - _unpack_U NFC _pack_U map hex, split ' ', shift; -} -sub hexNFD { - join " ", map sprintf("%04X", $_), - _unpack_U NFD _pack_U map hex, split ' ', shift; -} - -ok(hexNFD("1E14 AC01"), "0045 0304 0300 1100 1161 11A8"); -ok(hexNFD("AC00 AE00"), "1100 1161 1100 1173 11AF"); - -ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062"); -ok(hexNFC("00E0 05AE 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); -ok(hexNFC("0061 05AE 0300 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); -ok(hexNFC("0045 0304 0300 AC00 11A8"), "1E14 AC01"); -ok(hexNFC("1100 1161 1100 1173 11AF"), "AC00 AE00"); -ok(hexNFC("1100 0300 1161 1173 11AF"), "1100 0300 1161 1173 11AF"); - -ok(hexNFD("0061 0315 0300 05AE 05C4 0062"), "0061 05AE 0300 05C4 0315 0062"); -ok(hexNFD("00E0 05AE 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062"); -ok(hexNFD("0061 05AE 0300 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062"); -ok(hexNFC("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062"); -ok(hexNFC("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); -ok(hexNFD("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062"); -ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); -ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000"); -ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000"); - -ok(hexNFC("AC00 11A7"), "AC00 11A7"); -ok(hexNFC("AC00 11A8"), "AC01"); -ok(hexNFC("AC00 11A9"), "AC02"); -ok(hexNFC("AC00 11C2"), "AC1B"); -ok(hexNFC("AC00 11C3"), "AC00 11C3"); - -# 39 - -# Test Cases from Public Review Issue #29: Normalization Issue -# cf. http://www.unicode.org/review/pr-29.html -ok(hexNFC("0B47 0300 0B3E"), "0B47 0300 0B3E"); -ok(hexNFC("1100 0300 1161"), "1100 0300 1161"); -ok(hexNFC("0B47 0B3E 0300"), "0B4B 0300"); -ok(hexNFC("1100 1161 0300"), "AC00 0300"); -ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327"); -ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327"); - -ok(hexNFC("0300 0041"), "0300 0041"); -ok(hexNFC("0300 0301 0041"), "0300 0301 0041"); -ok(hexNFC("0301 0300 0041"), "0301 0300 0041"); -ok(hexNFC("0000 0300 0000 0301"), "0000 0300 0000 0301"); -ok(hexNFC("0000 0301 0000 0300"), "0000 0301 0000 0300"); - -ok(hexNFC("0327 0061 0300"), "0327 00E0"); -ok(hexNFC("0301 0061 0300"), "0301 00E0"); -ok(hexNFC("0315 0061 0300"), "0315 00E0"); -ok(hexNFC("0000 0327 0061 0300"), "0000 0327 00E0"); -ok(hexNFC("0000 0301 0061 0300"), "0000 0301 00E0"); -ok(hexNFC("0000 0315 0061 0300"), "0000 0315 00E0"); - -# 56 - -# NFC() and NFKC() should be unary. -my $str11 = _pack_U(0x41, 0x0302, 0x0301, 0x62); -my $str12 = _pack_U(0x1EA4, 0x62); -ok(NFC $str11 eq $str12); -ok(NFKC $str11 eq $str12); - -# NFD() and NFKD() should be unary. -my $str21 = _pack_U(0xE0, 0xAC00); -my $str22 = _pack_U(0x61, 0x0300, 0x1100, 0x1161); -ok(NFD $str21 eq $str22); -ok(NFKD $str21 eq $str22); - -# 60 - -## Bug #53197: NFKC("\x{2000}") produces... - -ok(NFKC("\x{2002}") eq ' '); -ok(NFKD("\x{2002}") eq ' '); -ok(NFKC("\x{2000}") eq ' '); -ok(NFKD("\x{2000}") eq ' '); - -ok(NFKC("\x{210C}") eq 'H'); -ok(NFKD("\x{210C}") eq 'H'); -ok(NFKC("\x{210D}") eq 'H'); -ok(NFKD("\x{210D}") eq 'H'); - -ok(NFC("\x{F907}") eq "\x{9F9C}"); -ok(NFD("\x{F907}") eq "\x{9F9C}"); -ok(NFKC("\x{F907}") eq "\x{9F9C}"); -ok(NFKD("\x{F907}") eq "\x{9F9C}"); - -# 72 - diff --git a/cpan/Unicode-Normalize/t/tie.t b/cpan/Unicode-Normalize/t/tie.t deleted file mode 100644 index 4fdd121e07..0000000000 --- a/cpan/Unicode-Normalize/t/tie.t +++ /dev/null @@ -1,82 +0,0 @@ - -BEGIN { - unless ('A' eq pack('U', 0x41)) { - print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; - exit 0; - } - unless (0x41 == unpack('U', 'A')) { - print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; - exit 0; - } -} - -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); - } -} - -######################### - -BEGIN { - use Unicode::Normalize qw(:all); - - unless (exists &Unicode::Normalize::bootstrap or 5.008 <= $]) { - print "1..0 # skipped: XSUB, or Perl 5.8.0 or later". - " needed for this test\n"; - print $@; - exit; - } -} - -use strict; -use warnings; -BEGIN { $| = 1; print "1..17\n"; } -my $count = 0; -sub ok ($;$) { - my $p = my $r = shift; - if (@_) { - my $x = shift; - $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; - } - print $p ? "ok" : "not ok", ' ', ++$count, "\n"; -} - -ok(1); - -package tiescalar; -sub TIESCALAR { - my ($class, $instance) = @_; - return bless \$instance => $class; -} -sub FETCH { return ${$_[0]}++ } -sub STORE { return ${$_[0]} = $_[1] } -sub DESTROY { undef ${$_[0]} } - -######################### - -package main; - -tie my $tie1, 'tiescalar', "123"; -ok(NFD($tie1), 123); -ok(NFC($tie1), 124); -ok(NFKD($tie1), 125); -ok(NFKC($tie1), 126); -ok(FCD($tie1), 127); -ok(FCC($tie1), 128); - -tie my $tie2, 'tiescalar', "256"; -ok(normalize('NFD', $tie2), 256); -ok(normalize('NFC', $tie2), 257); -ok(normalize('NFKD', $tie2), 258); -ok(normalize('NFKC', $tie2), 259); -ok(normalize('FCD', $tie2), 260); -ok(normalize('FCC', $tie2), 261); - -tie my $tie3, 'tiescalar', "315"; -ok(decompose($tie3), 315); -ok(reorder($tie3), 316); -ok(compose($tie3), 317); -ok(composeContiguous($tie3), 318); - |