diff options
author | Karl Williamson <khw@cpan.org> | 2016-11-11 14:52:39 +0100 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-11-11 17:51:30 +0100 |
commit | 3baae3fab5dea1469c2d040f5380bc2009bdeecb (patch) | |
tree | f20f6c9da2a60da215c786c01db84d23516914f5 /dist/Unicode-Normalize | |
parent | af25b33d388e2824ad52b31c8f5c7bc722f02dd6 (diff) | |
download | perl-3baae3fab5dea1469c2d040f5380bc2009bdeecb.tar.gz |
Move Unicode-Normalize to dist/
p5p has taken over the maintenance of this module, so it should be in
dist/
Diffstat (limited to 'dist/Unicode-Normalize')
-rw-r--r-- | dist/Unicode-Normalize/.gitignore | 1 | ||||
-rw-r--r-- | dist/Unicode-Normalize/Makefile.PL | 55 | ||||
-rw-r--r-- | dist/Unicode-Normalize/Normalize.pm | 635 | ||||
-rw-r--r-- | dist/Unicode-Normalize/Normalize.xs | 925 | ||||
-rw-r--r-- | dist/Unicode-Normalize/mkheader | 419 | ||||
-rw-r--r-- | dist/Unicode-Normalize/t/fcdc.t | 138 | ||||
-rw-r--r-- | dist/Unicode-Normalize/t/form.t | 84 | ||||
-rw-r--r-- | dist/Unicode-Normalize/t/func.t | 386 | ||||
-rw-r--r-- | dist/Unicode-Normalize/t/illegal.t | 85 | ||||
-rw-r--r-- | dist/Unicode-Normalize/t/norm.t | 145 | ||||
-rw-r--r-- | dist/Unicode-Normalize/t/null.t | 100 | ||||
-rw-r--r-- | dist/Unicode-Normalize/t/partial1.t | 120 | ||||
-rw-r--r-- | dist/Unicode-Normalize/t/partial2.t | 116 | ||||
-rw-r--r-- | dist/Unicode-Normalize/t/proto.t | 99 | ||||
-rw-r--r-- | dist/Unicode-Normalize/t/split.t | 147 | ||||
-rw-r--r-- | dist/Unicode-Normalize/t/test.t | 168 | ||||
-rw-r--r-- | dist/Unicode-Normalize/t/tie.t | 82 |
17 files changed, 3705 insertions, 0 deletions
diff --git a/dist/Unicode-Normalize/.gitignore b/dist/Unicode-Normalize/.gitignore new file mode 100644 index 0000000000..424c745c12 --- /dev/null +++ b/dist/Unicode-Normalize/.gitignore @@ -0,0 +1 @@ +*.h diff --git a/dist/Unicode-Normalize/Makefile.PL b/dist/Unicode-Normalize/Makefile.PL new file mode 100644 index 0000000000..44a4b8ded8 --- /dev/null +++ b/dist/Unicode-Normalize/Makefile.PL @@ -0,0 +1,55 @@ +require 5.006001; +use ExtUtils::MakeMaker; + +my $clean = {}; + +my $mm_ver = ExtUtils::MakeMaker->VERSION; + +if (-f "Normalize.xs") { + print STDERR "Making header files for XS...\n"; + + do 'mkheader' or die $@ || "mkheader: $!"; + + $clean = { FILES => 'unfcan.h unfcmb.h unfcmp.h unfcpt.h unfexc.h' }; +} + +WriteMakefile( + ($mm_ver < 6.58) + ? ('AUTHOR' => 'SADAHIRO Tomoyuki <SADAHIRO@cpan.org>, Karl Williamson <khw@cpan.org>') + : ('AUTHOR' => [ + 'SADAHIRO Tomoyuki <SADAHIRO@cpan.org>', + 'Karl Williamson <khw@cpan.org>', + ]), + 'ABSTRACT' => 'Unicode Normalization Forms', + 'INSTALLDIRS' => ($] >= 5.007002 && $] < 5.011) ? 'perl' : 'site', + # see perl5110delta, @INC reorganization + 'LICENSE' => 'perl', + 'NAME' => 'Unicode::Normalize', + 'VERSION_FROM' => 'Normalize.pm', # finds $VERSION + 'clean' => $clean, + 'depend' => { 'Normalize.o' => '$(H_FILES)' }, + 'PREREQ_PM' => { + Carp => 0, + constant => 0, + DynaLoader => 0, + Exporter => 0, + File::Spec => 0, + strict => 0, + warnings => 0, + SelectSaver => 0, + }, + ($mm_ver < 6.48 ? () : MIN_PERL_VERSION => 5.6.0), + ($mm_ver < 6.46 ? () : (META_MERGE => { + 'meta-spec' => { version => 2 }, + resources => { + repository => { + url => 'https://github.com/khwilliamson/Unicode-Normalize.git', + web => 'https://github.com/khwilliamson/Unicode-Normalize', + type => 'git', + }, + bugtracker => { + web => 'https://rt.cpan.org/Public/Dist/Display.html?Name=Unicode-Normalize', + }, + }, + })), +); diff --git a/dist/Unicode-Normalize/Normalize.pm b/dist/Unicode-Normalize/Normalize.pm new file mode 100644 index 0000000000..b53f5c728b --- /dev/null +++ b/dist/Unicode-Normalize/Normalize.pm @@ -0,0 +1,635 @@ +package Unicode::Normalize; + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + die "Unicode::Normalize cannot stringify a Unicode code point\n"; + } + unless (0x41 == unpack('U', 'A')) { + die "Unicode::Normalize cannot get Unicode code point\n"; + } +} + +use 5.006; +use strict; +use warnings; +use Carp; + +no warnings 'utf8'; + +our $VERSION = '1.25'; +our $PACKAGE = __PACKAGE__; + +our @EXPORT = qw( NFC NFD NFKC NFKD ); +our @EXPORT_OK = qw( + normalize decompose reorder compose + checkNFD checkNFKD checkNFC checkNFKC check + getCanon getCompat getComposite getCombinClass + isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex + isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE + FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter + normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial +); +our %EXPORT_TAGS = ( + all => [ @EXPORT, @EXPORT_OK ], + normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ], + check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ], + fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ], +); + +## +## utilities for tests +## + +sub pack_U { + return pack('U*', @_); +} + +sub unpack_U { + + # The empty pack returns an empty UTF-8 string, so the effect is to force + # the shifted parameter into being UTF-8. This allows this to work on + # Perl 5.6, where there is no utf8::upgrade(). + return unpack('U*', shift(@_).pack('U*')); +} + +require Exporter; + +##### The above part is common to XS and PP ##### + +our @ISA = qw(Exporter DynaLoader); +require DynaLoader; +bootstrap Unicode::Normalize $VERSION; + +##### The below part is common to XS and PP ##### + +## +## normalize +## + +sub FCD ($) { + my $str = shift; + return checkFCD($str) ? $str : NFD($str); +} + +our %formNorm = ( + NFC => \&NFC, C => \&NFC, + NFD => \&NFD, D => \&NFD, + NFKC => \&NFKC, KC => \&NFKC, + NFKD => \&NFKD, KD => \&NFKD, + FCD => \&FCD, FCC => \&FCC, +); + +sub normalize($$) +{ + my $form = shift; + my $str = shift; + if (exists $formNorm{$form}) { + return $formNorm{$form}->($str); + } + croak($PACKAGE."::normalize: invalid form name: $form"); +} + +## +## partial +## + +sub normalize_partial ($$) { + if (exists $formNorm{$_[0]}) { + my $n = normalize($_[0], $_[1]); + my($p, $u) = splitOnLastStarter($n); + $_[1] = $u; + return $p; + } + croak($PACKAGE."::normalize_partial: invalid form name: $_[0]"); +} + +sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) } +sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) } +sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) } +sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) } + +## +## check +## + +our %formCheck = ( + NFC => \&checkNFC, C => \&checkNFC, + NFD => \&checkNFD, D => \&checkNFD, + NFKC => \&checkNFKC, KC => \&checkNFKC, + NFKD => \&checkNFKD, KD => \&checkNFKD, + FCD => \&checkFCD, FCC => \&checkFCC, +); + +sub check($$) +{ + my $form = shift; + my $str = shift; + if (exists $formCheck{$form}) { + return $formCheck{$form}->($str); + } + croak($PACKAGE."::check: invalid form name: $form"); +} + +1; +__END__ + +=head1 NAME + +Unicode::Normalize - Unicode Normalization Forms + +=head1 SYNOPSIS + +(1) using function names exported by default: + + use Unicode::Normalize; + + $NFD_string = NFD($string); # Normalization Form D + $NFC_string = NFC($string); # Normalization Form C + $NFKD_string = NFKD($string); # Normalization Form KD + $NFKC_string = NFKC($string); # Normalization Form KC + +(2) using function names exported on request: + + use Unicode::Normalize 'normalize'; + + $NFD_string = normalize('D', $string); # Normalization Form D + $NFC_string = normalize('C', $string); # Normalization Form C + $NFKD_string = normalize('KD', $string); # Normalization Form KD + $NFKC_string = normalize('KC', $string); # Normalization Form KC + +=head1 DESCRIPTION + +Parameters: + +C<$string> is used as a string under character semantics (see L<perlunicode>). + +C<$code_point> should be an unsigned integer representing a Unicode code point. + +Note: Between XSUB and pure Perl, there is an incompatibility +about the interpretation of C<$code_point> as a decimal number. +XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not. +Do not use a floating point nor a negative sign in C<$code_point>. + +=head2 Normalization Forms + +=over 4 + +=item C<$NFD_string = NFD($string)> + +It returns the Normalization Form D (formed by canonical decomposition). + +=item C<$NFC_string = NFC($string)> + +It returns the Normalization Form C (formed by canonical decomposition +followed by canonical composition). + +=item C<$NFKD_string = NFKD($string)> + +It returns the Normalization Form KD (formed by compatibility decomposition). + +=item C<$NFKC_string = NFKC($string)> + +It returns the Normalization Form KC (formed by compatibility decomposition +followed by B<canonical> composition). + +=item C<$FCD_string = FCD($string)> + +If the given string is in FCD ("Fast C or D" form; cf. UTN #5), +it returns the string without modification; otherwise it returns an FCD string. + +Note: FCD is not always unique, then plural forms may be equivalent +each other. C<FCD()> will return one of these equivalent forms. + +=item C<$FCC_string = FCC($string)> + +It returns the FCC form ("Fast C Contiguous"; cf. UTN #5). + +Note: FCC is unique, as well as four normalization forms (NF*). + +=item C<$normalized_string = normalize($form_name, $string)> + +It returns the normalization form of C<$form_name>. + +As C<$form_name>, one of the following names must be given. + + 'C' or 'NFC' for Normalization Form C (UAX #15) + 'D' or 'NFD' for Normalization Form D (UAX #15) + 'KC' or 'NFKC' for Normalization Form KC (UAX #15) + 'KD' or 'NFKD' for Normalization Form KD (UAX #15) + + 'FCD' for "Fast C or D" Form (UTN #5) + 'FCC' for "Fast C Contiguous" (UTN #5) + +=back + +=head2 Decomposition and Composition + +=over 4 + +=item C<$decomposed_string = decompose($string [, $useCompatMapping])> + +It returns the concatenation of the decomposition of each character +in the string. + +If the second parameter (a boolean) is omitted or false, +the decomposition is canonical decomposition; +if the second parameter (a boolean) is true, +the decomposition is compatibility decomposition. + +The string returned is not always in NFD/NFKD. Reordering may be required. + + $NFD_string = reorder(decompose($string)); # eq. to NFD() + $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD() + +=item C<$reordered_string = reorder($string)> + +It returns the result of reordering the combining characters +according to Canonical Ordering Behavior. + +For example, when you have a list of NFD/NFKD strings, +you can get the concatenated NFD/NFKD string from them, by saying + + $concat_NFD = reorder(join '', @NFD_strings); + $concat_NFKD = reorder(join '', @NFKD_strings); + +=item C<$composed_string = compose($string)> + +It returns the result of canonical composition +without applying any decomposition. + +For example, when you have a NFD/NFKD string, +you can get its NFC/NFKC string, by saying + + $NFC_string = compose($NFD_string); + $NFKC_string = compose($NFKD_string); + +=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)> + +It returns two strings: the first one, C<$processed>, is a part +before the last starter, and the second one, C<$unprocessed> is +another part after the first part. A starter is a character having +a combining class of zero (see UAX #15). + +Note that C<$processed> may be empty (when C<$normalized> contains no +starter or starts with the last starter), and then C<$unprocessed> +should be equal to the entire C<$normalized>. + +When you have a C<$normalized> string and an C<$unnormalized> string +following it, a simple concatenation is wrong: + + $concat = $normalized . normalize($form, $unnormalized); # wrong! + +Instead of it, do like this: + + ($processed, $unprocessed) = splitOnLastStarter($normalized); + $concat = $processed . normalize($form,$unprocessed.$unnormalized); + +C<splitOnLastStarter()> should be called with a pre-normalized parameter +C<$normalized>, that is in the same form as C<$form> you want. + +If you have an array of C<@string> that should be concatenated and then +normalized, you can do like this: + + my $result = ""; + my $unproc = ""; + foreach my $str (@string) { + $unproc .= $str; + my $n = normalize($form, $unproc); + my($p, $u) = splitOnLastStarter($n); + $result .= $p; + $unproc = $u; + } + $result .= $unproc; + # instead of normalize($form, join('', @string)) + +=item C<$processed = normalize_partial($form, $unprocessed)> + +A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>. +Note that C<$unprocessed> will be modified as a side-effect. + +If you have an array of C<@string> that should be concatenated and then +normalized, you can do like this: + + my $result = ""; + my $unproc = ""; + foreach my $str (@string) { + $unproc .= $str; + $result .= normalize_partial($form, $unproc); + } + $result .= $unproc; + # instead of normalize($form, join('', @string)) + +=item C<$processed = NFD_partial($unprocessed)> + +It does like C<normalize_partial('NFD', $unprocessed)>. +Note that C<$unprocessed> will be modified as a side-effect. + +=item C<$processed = NFC_partial($unprocessed)> + +It does like C<normalize_partial('NFC', $unprocessed)>. +Note that C<$unprocessed> will be modified as a side-effect. + +=item C<$processed = NFKD_partial($unprocessed)> + +It does like C<normalize_partial('NFKD', $unprocessed)>. +Note that C<$unprocessed> will be modified as a side-effect. + +=item C<$processed = NFKC_partial($unprocessed)> + +It does like C<normalize_partial('NFKC', $unprocessed)>. +Note that C<$unprocessed> will be modified as a side-effect. + +=back + +=head2 Quick Check + +(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>) + +The following functions check whether the string is in that normalization form. + +The result returned will be one of the following: + + YES The string is in that normalization form. + NO The string is not in that normalization form. + MAYBE Dubious. Maybe yes, maybe no. + +=over 4 + +=item C<$result = checkNFD($string)> + +It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. + +=item C<$result = checkNFC($string)> + +It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; +C<undef> if C<MAYBE>. + +=item C<$result = checkNFKD($string)> + +It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. + +=item C<$result = checkNFKC($string)> + +It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; +C<undef> if C<MAYBE>. + +=item C<$result = checkFCD($string)> + +It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. + +=item C<$result = checkFCC($string)> + +It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; +C<undef> if C<MAYBE>. + +Note: If a string is not in FCD, it must not be in FCC. +So C<checkFCC($not_FCD_string)> should return C<NO>. + +=item C<$result = check($form_name, $string)> + +It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; +C<undef> if C<MAYBE>. + +As C<$form_name>, one of the following names must be given. + + 'C' or 'NFC' for Normalization Form C (UAX #15) + 'D' or 'NFD' for Normalization Form D (UAX #15) + 'KC' or 'NFKC' for Normalization Form KC (UAX #15) + 'KD' or 'NFKD' for Normalization Form KD (UAX #15) + + 'FCD' for "Fast C or D" Form (UTN #5) + 'FCC' for "Fast C Contiguous" (UTN #5) + +=back + +B<Note> + +In the cases of NFD, NFKD, and FCD, the answer must be +either C<YES> or C<NO>. The answer C<MAYBE> may be returned +in the cases of NFC, NFKC, and FCC. + +A C<MAYBE> string should contain at least one combining character +or the like. For example, C<COMBINING ACUTE ACCENT> has +the MAYBE_NFC/MAYBE_NFKC property. + +Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")> +and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>. +C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC +(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), +while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. + +If you want to check exactly, compare the string with its NFC/NFKC/FCC. + + if ($string eq NFC($string)) { + # $string is exactly normalized in NFC; + } else { + # $string is not normalized in NFC; + } + + if ($string eq NFKC($string)) { + # $string is exactly normalized in NFKC; + } else { + # $string is not normalized in NFKC; + } + +=head2 Character Data + +These functions are interface of character data used internally. +If you want only to get Unicode normalization forms, you don't need +call them yourself. + +=over 4 + +=item C<$canonical_decomposition = getCanon($code_point)> + +If the character is canonically decomposable (including Hangul Syllables), +it returns the (full) canonical decomposition as a string. +Otherwise it returns C<undef>. + +B<Note:> According to the Unicode standard, the canonical decomposition +of the character that is not canonically decomposable is same as +the character itself. + +=item C<$compatibility_decomposition = getCompat($code_point)> + +If the character is compatibility decomposable (including Hangul Syllables), +it returns the (full) compatibility decomposition as a string. +Otherwise it returns C<undef>. + +B<Note:> According to the Unicode standard, the compatibility decomposition +of the character that is not compatibility decomposable is same as +the character itself. + +=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)> + +If two characters here and next (as code points) are composable +(including Hangul Jamo/Syllables and Composition Exclusions), +it returns the code point of the composite. + +If they are not composable, it returns C<undef>. + +=item C<$combining_class = getCombinClass($code_point)> + +It returns the combining class (as an integer) of the character. + +=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)> + +It returns a boolean whether the character of the specified codepoint +may be composed with the previous one in a certain composition +(including Hangul Compositions, but excluding +Composition Exclusions and Non-Starter Decompositions). + +=item C<$is_exclusion = isExclusion($code_point)> + +It returns a boolean whether the code point is a composition exclusion. + +=item C<$is_singleton = isSingleton($code_point)> + +It returns a boolean whether the code point is a singleton + +=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)> + +It returns a boolean whether the code point has Non-Starter Decomposition. + +=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)> + +It returns a boolean of the derived property Comp_Ex +(Full_Composition_Exclusion). This property is generated from +Composition Exclusions + Singletons + Non-Starter Decompositions. + +=item C<$NFD_is_NO = isNFD_NO($code_point)> + +It returns a boolean of the derived property NFD_NO +(NFD_Quick_Check=No). + +=item C<$NFC_is_NO = isNFC_NO($code_point)> + +It returns a boolean of the derived property NFC_NO +(NFC_Quick_Check=No). + +=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)> + +It returns a boolean of the derived property NFC_MAYBE +(NFC_Quick_Check=Maybe). + +=item C<$NFKD_is_NO = isNFKD_NO($code_point)> + +It returns a boolean of the derived property NFKD_NO +(NFKD_Quick_Check=No). + +=item C<$NFKC_is_NO = isNFKC_NO($code_point)> + +It returns a boolean of the derived property NFKC_NO +(NFKC_Quick_Check=No). + +=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)> + +It returns a boolean of the derived property NFKC_MAYBE +(NFKC_Quick_Check=Maybe). + +=back + +=head1 EXPORT + +C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. + +C<normalize> and other some functions: on request. + +=head1 CAVEATS + +=over 4 + +=item Perl's version vs. Unicode version + +Since this module refers to perl core's Unicode database in the directory +F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of +normalization implemented by this module depends on what has been +compiled into your perl. The following table lists the default Unicode +version that comes with various perl versions. (It is possible to change +the Unicode version in any perl version to be any earlier Unicode version, +so one could cause Unicode 3.2 to be used in any perl version starting with +5.8.0. Read F<C<$Config{privlib}>/unicore/README.perl> for details. + + perl's version implemented Unicode version + 5.6.1 3.0.1 + 5.7.2 3.1.0 + 5.7.3 3.1.1 (normalization is same as 3.1.0) + 5.8.0 3.2.0 + 5.8.1-5.8.3 4.0.0 + 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0) + 5.8.7-5.8.8 4.1.0 + 5.10.0 5.0.0 + 5.8.9, 5.10.1 5.1.0 + 5.12.x 5.2.0 + 5.14.x 6.0.0 + 5.16.x 6.1.0 + 5.18.x 6.2.0 + 5.20.x 6.3.0 + 5.22.x 7.0.0 + +=item Correction of decomposition mapping + +In older Unicode versions, a small number of characters (all of which are +CJK compatibility ideographs as far as they have been found) may have +an erroneous decomposition mapping (see F<NormalizationCorrections.txt>). +Anyhow, this module will neither refer to F<NormalizationCorrections.txt> +nor provide any specific version of normalization. Therefore this module +running on an older perl with an older Unicode database may use +the erroneous decomposition mapping blindly conforming to the Unicode database. + +=item Revised definition of canonical composition + +In Unicode 4.1.0, the definition D2 of canonical composition (which +affects NFC and NFKC) has been changed (see Public Review Issue #29 +and recent UAX #15). This module has used the newer definition +since the version 0.07 (Oct 31, 2001). +This module will not support the normalization according to the older +definition, even if the Unicode version implemented by perl is +lower than 4.1.0. + +=back + +=head1 AUTHOR + +SADAHIRO Tomoyuki <SADAHIRO@cpan.org> + +Currently maintained by <perl5-porters@perl.org> + +Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved. + +=head1 LICENSE + +This module is free software; you can redistribute it +and/or modify it under the same terms as Perl itself. + +=head1 SEE ALSO + +=over 4 + +=item http://www.unicode.org/reports/tr15/ + +Unicode Normalization Forms - UAX #15 + +=item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt + +Composition Exclusion Table + +=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt + +Derived Normalization Properties + +=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt + +Normalization Corrections + +=item http://www.unicode.org/review/pr-29.html + +Public Review Issue #29: Normalization Issue + +=item http://www.unicode.org/notes/tn5/ + +Canonical Equivalence in Applications - UTN #5 + +=back + +=cut diff --git a/dist/Unicode-Normalize/Normalize.xs b/dist/Unicode-Normalize/Normalize.xs new file mode 100644 index 0000000000..4acff7fe49 --- /dev/null +++ b/dist/Unicode-Normalize/Normalize.xs @@ -0,0 +1,925 @@ + +#define PERL_NO_GET_CONTEXT /* we want efficiency */ + +/* private functions which need pTHX_ and aTHX_ + pv_cat_decompHangul + sv_2pvunicode + pv_utf8_decompose + pv_utf8_reorder + pv_utf8_compose +*/ + +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" + +/* These 5 files are prepared by mkheader */ +#include "unfcmb.h" +#include "unfcan.h" +#include "unfcpt.h" +#include "unfcmp.h" +#include "unfexc.h" + +/* The generated normalization tables since v5.20 are in native character set + * terms. Prior to that, they were in Unicode terms. So we use 'uvchr' for + * later perls, and redefine that to be 'uvuni' for earlier ones */ +#if PERL_VERSION < 20 +# undef uvchr_to_utf8 +# ifdef uvuni_to_utf8 +# define uvchr_to_utf8 uvuni_to_utf8 +# else /* Perl 5.6.1 */ +# define uvchr_to_utf8 uv_to_utf8 +# endif + +# undef utf8n_to_uvchr +# ifdef utf8n_to_uvuni +# define utf8n_to_uvchr utf8n_to_uvuni +# else /* Perl 5.6.1 */ +# define utf8n_to_uvchr utf8_to_uv +# endif +#endif + +/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ +#ifndef UTF8_ALLOW_BOM +#define UTF8_ALLOW_BOM (0) +#endif /* UTF8_ALLOW_BOM */ + +#ifndef UTF8_ALLOW_SURROGATE +#define UTF8_ALLOW_SURROGATE (0) +#endif /* UTF8_ALLOW_SURROGATE */ + +#ifndef UTF8_ALLOW_FE_FF +#define UTF8_ALLOW_FE_FF (0) +#endif /* UTF8_ALLOW_FE_FF */ + +#ifndef UTF8_ALLOW_FFFF +#define UTF8_ALLOW_FFFF (0) +#endif /* UTF8_ALLOW_FFFF */ + +#ifndef PERL_UNUSED_VAR +# define PERL_UNUSED_VAR(x) ((void)sizeof(x)) +#endif + +#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF) + +/* check if the string buffer is enough before uvchr_to_utf8(). */ +/* dstart, d, and dlen should be defined outside before. */ +#define Renew_d_if_not_enough_to(need) STRLEN curlen = d - dstart; \ + if (dlen < curlen + (need)) { \ + dlen += (need); \ + Renew(dstart, dlen+1, U8); \ + d = dstart + curlen; \ + } + +/* if utf8n_to_uvchr() sets retlen to 0 (if broken?) */ +#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" + +/* utf8_hop() hops back before start. Maybe broken UTF-8 */ +#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" + +/* At present, char > 0x10ffff are unaffected without complaint, right? */ +#define VALID_UTF_MAX (0x10ffff) +#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) + +/* size of array for combining characters */ +/* enough as an initial value? */ +#define CC_SEQ_SIZE (10) +#define CC_SEQ_STEP (5) + +/* HANGUL begin */ +#define Hangul_SBase 0xAC00 +#define Hangul_SFinal 0xD7A3 +#define Hangul_SCount 11172 + +#define Hangul_NCount 588 + +#define Hangul_LBase 0x1100 +#define Hangul_LFinal 0x1112 +#define Hangul_LCount 19 + +#define Hangul_VBase 0x1161 +#define Hangul_VFinal 0x1175 +#define Hangul_VCount 21 + +#define Hangul_TBase 0x11A7 +#define Hangul_TFinal 0x11C2 +#define Hangul_TCount 28 + +#define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) +#define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) +#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) +#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) +#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) +#define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) +/* HANGUL end */ + +/* this is used for canonical ordering of combining characters (c.c.). */ +typedef struct { + U8 cc; /* combining class */ + UV uv; /* codepoint */ + STRLEN pos; /* position */ +} UNF_cc; + +static int compare_cc(const void *a, const void *b) +{ + int ret_cc; + ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; + if (ret_cc) + return ret_cc; + + return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) + - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); +} + +static U8* dec_canonical(UV uv) +{ + U8 ***plane, **row; + if (OVER_UTF_MAX(uv)) + return NULL; + plane = (U8***)UNF_canon[uv >> 16]; + if (! plane) + return NULL; + row = plane[(uv >> 8) & 0xff]; + return row ? row[uv & 0xff] : NULL; +} + +static U8* dec_compat(UV uv) +{ + U8 ***plane, **row; + if (OVER_UTF_MAX(uv)) + return NULL; + plane = (U8***)UNF_compat[uv >> 16]; + if (! plane) + return NULL; + row = plane[(uv >> 8) & 0xff]; + return row ? row[uv & 0xff] : NULL; +} + +static UV composite_uv(UV uv, UV uv2) +{ + UNF_complist ***plane, **row, *cell, *i; + + if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) + return 0; + + if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { + UV lindex = uv - Hangul_LBase; + UV vindex = uv2 - Hangul_VBase; + return(Hangul_SBase + (lindex * Hangul_VCount + vindex) * + Hangul_TCount); + } + if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { + UV tindex = uv2 - Hangul_TBase; + return(uv + tindex); + } + plane = UNF_compos[uv >> 16]; + if (! plane) + return 0; + row = plane[(uv >> 8) & 0xff]; + if (! row) + return 0; + cell = row[uv & 0xff]; + if (! cell) + return 0; + for (i = cell; i->nextchar; i++) { + if (uv2 == i->nextchar) + return i->composite; + } + return 0; +} + +static U8 getCombinClass(UV uv) +{ + U8 **plane, *row; + if (OVER_UTF_MAX(uv)) + return 0; + plane = (U8**)UNF_combin[uv >> 16]; + if (! plane) + return 0; + row = plane[(uv >> 8) & 0xff]; + return row ? row[uv & 0xff] : 0; +} + +static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv) +{ + UV sindex = uv - Hangul_SBase; + UV lindex = sindex / Hangul_NCount; + UV vindex = (sindex % Hangul_NCount) / Hangul_TCount; + UV tindex = sindex % Hangul_TCount; + + if (! Hangul_IsS(uv)) + return d; + + d = uvchr_to_utf8(d, (lindex + Hangul_LBase)); + d = uvchr_to_utf8(d, (vindex + Hangul_VBase)); + if (tindex) + d = uvchr_to_utf8(d, (tindex + Hangul_TBase)); + return d; +} + +static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp) +{ + char *s; + STRLEN len; + s = SvPV(sv,len); + if (!SvUTF8(sv)) { + SV* tmpsv = sv_2mortal(newSVpvn(s, len)); + if (!SvPOK(tmpsv)) + s = SvPV_force(tmpsv,len); + sv_utf8_upgrade(tmpsv); + s = SvPV(tmpsv,len); + } + if (lp) + *lp = len; + return s; +} + +static +U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) +{ + U8* p = s; + U8* e = s + slen; + U8* dstart = *dp; + U8* d = dstart; + + while (p < e) { + STRLEN retlen; + UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); + if (!retlen) + croak(ErrRetlenIsZero, "decompose"); + p += retlen; + + if (Hangul_IsS(uv)) { + Renew_d_if_not_enough_to(UTF8_MAXLEN * 3) + d = pv_cat_decompHangul(aTHX_ d, uv); + } + else { + U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv); + + if (r) { + STRLEN len = (STRLEN)strlen((char *)r); + Renew_d_if_not_enough_to(len) + while (len--) + *d++ = *r++; + } + else { + Renew_d_if_not_enough_to(UTF8_MAXLEN) + d = uvchr_to_utf8(d, uv); + } + } + } + *dp = dstart; + return d; +} + +static +U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen) +{ + U8* p = s; + U8* e = s + slen; + U8* dstart = *dp; + U8* d = dstart; + + UNF_cc seq_ary[CC_SEQ_SIZE]; + UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ + UNF_cc* seq_ext = NULL; /* extend if need */ + STRLEN seq_max = CC_SEQ_SIZE; + STRLEN cc_pos = 0; + + while (p < e) { + U8 curCC; + STRLEN retlen; + UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); + if (!retlen) + croak(ErrRetlenIsZero, "reorder"); + p += retlen; + + curCC = getCombinClass(uv); + + if (curCC != 0) { + if (seq_max < cc_pos + 1) { /* extend if need */ + seq_max = cc_pos + CC_SEQ_STEP; /* new size */ + if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ + STRLEN i; + New(0, seq_ext, seq_max, UNF_cc); + for (i = 0; i < cc_pos; i++) + seq_ext[i] = seq_ary[i]; + } + else { + Renew(seq_ext, seq_max, UNF_cc); + } + seq_ptr = seq_ext; /* use seq_ext from now */ + } + + seq_ptr[cc_pos].cc = curCC; + seq_ptr[cc_pos].uv = uv; + seq_ptr[cc_pos].pos = cc_pos; + ++cc_pos; + + if (p < e) + continue; + } + + /* output */ + if (cc_pos) { + STRLEN i; + + if (cc_pos > 1) /* reordered if there are two c.c.'s */ + qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); + + for (i = 0; i < cc_pos; i++) { + Renew_d_if_not_enough_to(UTF8_MAXLEN) + d = uvchr_to_utf8(d, seq_ptr[i].uv); + } + cc_pos = 0; + } + + if (curCC == 0) { + Renew_d_if_not_enough_to(UTF8_MAXLEN) + d = uvchr_to_utf8(d, uv); + } + } + if (seq_ext) + Safefree(seq_ext); + *dp = dstart; + return d; +} + +static +U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig) +{ + U8* p = s; + U8* e = s + slen; + U8* dstart = *dp; + U8* d = dstart; + + UV uvS = 0; /* code point of the starter */ + bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ + U8 preCC = 0; + + UV seq_ary[CC_SEQ_SIZE]; + UV* seq_ptr = seq_ary; /* use array at the beginning */ + UV* seq_ext = NULL; /* extend if need */ + STRLEN seq_max = CC_SEQ_SIZE; + STRLEN cc_pos = 0; + + while (p < e) { + U8 curCC; + STRLEN retlen; + UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); + if (!retlen) + croak(ErrRetlenIsZero, "compose"); + p += retlen; + + curCC = getCombinClass(uv); + + if (!valid_uvS) { + if (curCC == 0) { + uvS = uv; /* the first Starter is found */ + valid_uvS = TRUE; + if (p < e) + continue; + } + else { + Renew_d_if_not_enough_to(UTF8_MAXLEN) + d = uvchr_to_utf8(d, uv); + continue; + } + } + else { + bool composed; + + /* blocked */ + if ((iscontig && cc_pos) || /* discontiguous combination */ + (curCC != 0 && preCC == curCC) || /* blocked by same CC */ + (preCC > curCC)) /* blocked by higher CC: revised D2 */ + composed = FALSE; + + /* not blocked: + iscontig && cc_pos == 0 -- contiguous combination + curCC == 0 && preCC == 0 -- starter + starter + curCC != 0 && preCC < curCC -- lower CC */ + else { + /* try composition */ + UV uvComp = composite_uv(uvS, uv); + + if (uvComp && !isExclusion(uvComp)) { + uvS = uvComp; + composed = TRUE; + + /* preCC should not be changed to curCC */ + /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */ + if (p < e) + continue; + } + else + composed = FALSE; + } + + if (!composed) { + preCC = curCC; + if (curCC != 0 || !(p < e)) { + if (seq_max < cc_pos + 1) { /* extend if need */ + seq_max = cc_pos + CC_SEQ_STEP; /* new size */ + if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ + New(0, seq_ext, seq_max, UV); + Copy(seq_ary, seq_ext, cc_pos, UV); + } + else { + Renew(seq_ext, seq_max, UV); + } + seq_ptr = seq_ext; /* use seq_ext from now */ + } + seq_ptr[cc_pos] = uv; + ++cc_pos; + } + if (curCC != 0 && p < e) + continue; + } + } + + /* output */ + { + Renew_d_if_not_enough_to(UTF8_MAXLEN) + d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */ + } + + if (cc_pos) { + STRLEN i; + + for (i = 0; i < cc_pos; i++) { + Renew_d_if_not_enough_to(UTF8_MAXLEN) + d = uvchr_to_utf8(d, seq_ptr[i]); + } + cc_pos = 0; + } + + uvS = uv; + } + if (seq_ext) + Safefree(seq_ext); + *dp = dstart; + return d; +} + +MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize + +SV* +decompose(src, compat = &PL_sv_no) + SV * src + SV * compat + PROTOTYPE: $;$ + PREINIT: + SV* dst; + U8 *s, *d, *dend; + STRLEN slen, dlen; + CODE: + s = (U8*)sv_2pvunicode(aTHX_ src,&slen); + dst = newSVpvn("", 0); + dlen = slen; + New(0, d, dlen+1, U8); + dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat)); + sv_setpvn(dst, (char *)d, dend - d); + SvUTF8_on(dst); + Safefree(d); + RETVAL = dst; + OUTPUT: + RETVAL + + +SV* +reorder(src) + SV * src + PROTOTYPE: $ + PREINIT: + SV* dst; + U8 *s, *d, *dend; + STRLEN slen, dlen; + CODE: + s = (U8*)sv_2pvunicode(aTHX_ src,&slen); + dst = newSVpvn("", 0); + dlen = slen; + New(0, d, dlen+1, U8); + dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen); + sv_setpvn(dst, (char *)d, dend - d); + SvUTF8_on(dst); + Safefree(d); + RETVAL = dst; + OUTPUT: + RETVAL + + +SV* +compose(src) + SV * src + PROTOTYPE: $ + ALIAS: + composeContiguous = 1 + PREINIT: + SV* dst; + U8 *s, *d, *dend; + STRLEN slen, dlen; + CODE: + s = (U8*)sv_2pvunicode(aTHX_ src,&slen); + dst = newSVpvn("", 0); + dlen = slen; + New(0, d, dlen+1, U8); + dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix); + sv_setpvn(dst, (char *)d, dend - d); + SvUTF8_on(dst); + Safefree(d); + RETVAL = dst; + OUTPUT: + RETVAL + + +SV* +NFD(src) + SV * src + PROTOTYPE: $ + ALIAS: + NFKD = 1 + PREINIT: + SV *dst; + U8 *s, *t, *tend, *d, *dend; + STRLEN slen, tlen, dlen; + CODE: + s = (U8*)sv_2pvunicode(aTHX_ src,&slen); + + /* decompose */ + tlen = slen; + New(0, t, tlen+1, U8); + tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1)); + *tend = '\0'; + tlen = tend - t; /* no longer know real size of t */ + + /* reorder */ + dlen = tlen; + New(0, d, dlen+1, U8); + dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen); + *dend = '\0'; + dlen = dend - d; /* no longer know real size of d */ + + /* return */ + dst = newSVpvn("", 0); + sv_setpvn(dst, (char *)d, dlen); + SvUTF8_on(dst); + + Safefree(t); + Safefree(d); + RETVAL = dst; + OUTPUT: + RETVAL + + +SV* +NFC(src) + SV * src + PROTOTYPE: $ + ALIAS: + NFKC = 1 + FCC = 2 + PREINIT: + SV *dst; + U8 *s, *t, *tend, *u, *uend, *d, *dend; + STRLEN slen, tlen, ulen, dlen; + CODE: + s = (U8*)sv_2pvunicode(aTHX_ src,&slen); + + /* decompose */ + tlen = slen; + New(0, t, tlen+1, U8); + tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1)); + *tend = '\0'; + tlen = tend - t; /* no longer know real size of t */ + + /* reorder */ + ulen = tlen; + New(0, u, ulen+1, U8); + uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen); + *uend = '\0'; + ulen = uend - u; /* no longer know real size of u */ + + /* compose */ + dlen = ulen; + New(0, d, dlen+1, U8); + dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2)); + *dend = '\0'; + dlen = dend - d; /* no longer know real size of d */ + + /* return */ + dst = newSVpvn("", 0); + sv_setpvn(dst, (char *)d, dlen); + SvUTF8_on(dst); + + Safefree(t); + Safefree(u); + Safefree(d); + RETVAL = dst; + OUTPUT: + RETVAL + + +SV* +checkNFD(src) + SV * src + PROTOTYPE: $ + ALIAS: + checkNFKD = 1 + PREINIT: + STRLEN srclen, retlen; + U8 *s, *e, *p, curCC, preCC; + bool result = TRUE; + CODE: + s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); + e = s + srclen; + + preCC = 0; + for (p = s; p < e; p += retlen) { + UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); + if (!retlen) + croak(ErrRetlenIsZero, "checkNFD or -NFKD"); + + curCC = getCombinClass(uv); + if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ + result = FALSE; + break; + } + if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) { + result = FALSE; + break; + } + preCC = curCC; + } + RETVAL = boolSV(result); + OUTPUT: + RETVAL + + +SV* +checkNFC(src) + SV * src + PROTOTYPE: $ + ALIAS: + checkNFKC = 1 + PREINIT: + STRLEN srclen, retlen; + U8 *s, *e, *p, curCC, preCC; + bool result = TRUE; + bool isMAYBE = FALSE; + CODE: + s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); + e = s + srclen; + + preCC = 0; + for (p = s; p < e; p += retlen) { + UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); + if (!retlen) + croak(ErrRetlenIsZero, "checkNFC or -NFKC"); + + curCC = getCombinClass(uv); + if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ + result = FALSE; + break; + } + + /* get NFC/NFKC property */ + if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */ + ; /* YES */ + else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { + result = FALSE; + break; + } + else if (isComp2nd(uv)) + isMAYBE = TRUE; + else if (ix) { + char *canon, *compat; + /* NFKC_NO when having compatibility mapping. */ + canon = (char *) dec_canonical(uv); + compat = (char *) dec_compat(uv); + if (compat && !(canon && strEQ(canon, compat))) { + result = FALSE; + break; + } + } /* end of get NFC/NFKC property */ + + preCC = curCC; + } + if (isMAYBE && result) /* NO precedes MAYBE */ + XSRETURN_UNDEF; + RETVAL = boolSV(result); + OUTPUT: + RETVAL + + +SV* +checkFCD(src) + SV * src + PROTOTYPE: $ + ALIAS: + checkFCC = 1 + PREINIT: + STRLEN srclen, retlen; + U8 *s, *e, *p, curCC, preCC; + bool result = TRUE; + bool isMAYBE = FALSE; + CODE: + s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); + e = s + srclen; + preCC = 0; + for (p = s; p < e; p += retlen) { + U8 *sCan; + UV uvLead; + STRLEN canlen = 0; + UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); + if (!retlen) + croak(ErrRetlenIsZero, "checkFCD or -FCC"); + + sCan = (U8*) dec_canonical(uv); + + if (sCan) { + STRLEN canret; + canlen = (STRLEN)strlen((char *) sCan); + uvLead = utf8n_to_uvchr(sCan, canlen, &canret, AllowAnyUTF); + if (!canret) + croak(ErrRetlenIsZero, "checkFCD or -FCC"); + } + else { + uvLead = uv; + } + + curCC = getCombinClass(uvLead); + + if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */ + result = FALSE; + break; + } + + if (ix) { + if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { + result = FALSE; + break; + } + else if (isComp2nd(uv)) + isMAYBE = TRUE; + } + + if (sCan) { + STRLEN canret; + UV uvTrail; + U8* eCan = sCan + canlen; + U8* pCan = utf8_hop(eCan, -1); + if (pCan < sCan) + croak(ErrHopBeforeStart); + uvTrail = utf8n_to_uvchr(pCan, eCan - pCan, &canret, AllowAnyUTF); + if (!canret) + croak(ErrRetlenIsZero, "checkFCD or -FCC"); + preCC = getCombinClass(uvTrail); + } + else { + preCC = curCC; + } + } + if (isMAYBE && result) /* NO precedes MAYBE */ + XSRETURN_UNDEF; + RETVAL = boolSV(result); + OUTPUT: + RETVAL + + +U8 +getCombinClass(uv) + UV uv + PROTOTYPE: $ + +bool +isExclusion(uv) + UV uv + PROTOTYPE: $ + +bool +isSingleton(uv) + UV uv + PROTOTYPE: $ + +bool +isNonStDecomp(uv) + UV uv + PROTOTYPE: $ + +bool +isComp2nd(uv) + UV uv + PROTOTYPE: $ + ALIAS: + isNFC_MAYBE = 1 + isNFKC_MAYBE = 2 + INIT: + PERL_UNUSED_VAR(ix); + +SV* +isNFD_NO(uv) + UV uv + PROTOTYPE: $ + ALIAS: + isNFKD_NO = 1 + PREINIT: + bool result = FALSE; + CODE: + if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) + result = TRUE; /* NFD_NO or NFKD_NO */ + RETVAL = boolSV(result); + OUTPUT: + RETVAL + + +SV* +isComp_Ex(uv) + UV uv + PROTOTYPE: $ + ALIAS: + isNFC_NO = 0 + isNFKC_NO = 1 + PREINIT: + bool result = FALSE; + CODE: + if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) + result = TRUE; /* NFC_NO or NFKC_NO */ + else if (ix) { + char *canon, *compat; + canon = (char *) dec_canonical(uv); + compat = (char *) dec_compat(uv); + if (compat && (!canon || strNE(canon, compat))) + result = TRUE; /* NFC_NO or NFKC_NO */ + } + RETVAL = boolSV(result); + OUTPUT: + RETVAL + +SV* +getComposite(uv, uv2) + UV uv + UV uv2 + PROTOTYPE: $$ + PREINIT: + UV composite; + CODE: + composite = composite_uv(uv, uv2); + RETVAL = composite ? newSVuv(composite) : &PL_sv_undef; + OUTPUT: + RETVAL + + + +SV* +getCanon(uv) + UV uv + PROTOTYPE: $ + ALIAS: + getCompat = 1 + CODE: + if (Hangul_IsS(uv)) { + U8 tmp[3 * UTF8_MAXLEN + 1]; + U8 *t = tmp; + U8 *e = pv_cat_decompHangul(aTHX_ t, uv); + RETVAL = newSVpvn((char *)t, e - t); + } else { + U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv); + if (!rstr) + XSRETURN_UNDEF; + RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); + } + SvUTF8_on(RETVAL); + OUTPUT: + RETVAL + + +void +splitOnLastStarter(src) + SV * src + PREINIT: + SV *svp; + STRLEN srclen; + U8 *s, *e, *p; + PPCODE: + s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); + e = s + srclen; + p = e; + while (s < p) { + UV uv; + p = utf8_hop(p, -1); + if (p < s) + croak(ErrHopBeforeStart); + uv = utf8n_to_uvchr(p, e - p, NULL, AllowAnyUTF); + if (getCombinClass(uv) == 0) /* Last Starter found */ + break; + } + + svp = sv_2mortal(newSVpvn((char*)s, p - s)); + SvUTF8_on(svp); + XPUSHs(svp); + + svp = sv_2mortal(newSVpvn((char*)p, e - p)); + SvUTF8_on(svp); + XPUSHs(svp); + diff --git a/dist/Unicode-Normalize/mkheader b/dist/Unicode-Normalize/mkheader new file mode 100644 index 0000000000..8d4c1b8e8d --- /dev/null +++ b/dist/Unicode-Normalize/mkheader @@ -0,0 +1,419 @@ +#!perl +# +# This auxiliary script makes five header files +# used for building XSUB of Unicode::Normalize. +# +# Usage: +# <do 'mkheader'> in perl, or <perl mkheader> in command line +# +# Input files: +# unicore/CombiningClass.pl (or unicode/CombiningClass.pl) +# unicore/Decomposition.pl (or unicode/Decomposition.pl) +# +# Output files: +# unfcan.h +# unfcpt.h +# unfcmb.h +# unfcmp.h +# unfexc.h +# +use 5.006; +use strict; +use warnings; +use Carp; +use File::Spec; +use SelectSaver; + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + die "Unicode::Normalize cannot stringify a Unicode code point\n"; + } + unless (0x41 == unpack('U', 'A')) { + die "Unicode::Normalize cannot get Unicode code point\n"; + } +} + +our $PACKAGE = 'Unicode::Normalize, mkheader'; + +our $prefix = "UNF_"; +our $structname = "${prefix}complist"; + +# Starting in v5.20, the tables in lib/unicore are built using the platform's +# native character set for code points 0-255. +*pack_U = ($] ge 5.020) + ? sub { return pack('W*', @_).pack('U*'); } # The empty pack returns + # an empty UTF-8 string, + # so the effect is to + # force the return into + # being UTF-8. + : sub { return pack('U*', @_); }; + +# %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify() +our %Comp1st; # $codepoint => $listname : may be composed with a next char. +our %CompList; # $listname,$2nd => $codepoint : composite + +##### The below part is common to mkheader and PP ##### + +our %Combin; # $codepoint => $number : combination class +our %Canon; # $codepoint => \@codepoints : canonical decomp. +our %Compat; # $codepoint => \@codepoints : compat. decomp. +our %Compos; # $1st,$2nd => $codepoint : composite +our %Exclus; # $codepoint => 1 : composition exclusions +our %Single; # $codepoint => 1 : singletons +our %NonStD; # $codepoint => 1 : non-starter decompositions +our %Comp2nd; # $codepoint => 1 : may be composed with a prev char. + +# from core Unicode database +our $Combin = do "unicore/CombiningClass.pl" + || do "unicode/CombiningClass.pl" + || croak "$PACKAGE: CombiningClass.pl not found"; +our $Decomp = do "unicore/Decomposition.pl" + || do "unicode/Decomposition.pl" + || croak "$PACKAGE: Decomposition.pl not found"; + +# CompositionExclusions.txt since Unicode 3.2.0. If this ever changes, it +# would be better to get the values from Unicode::UCD rather than hard-code +# them here, as that will protect from having to make fixes for future +# changes. +our @CompEx = qw( + 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36 + 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76 + 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D + FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B + FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C + FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB + 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0 +); + +# definition of Hangul constants +use constant SBase => 0xAC00; +use constant SFinal => 0xD7A3; # SBase -1 + SCount +use constant SCount => 11172; # LCount * NCount +use constant NCount => 588; # VCount * TCount +use constant LBase => 0x1100; +use constant LFinal => 0x1112; +use constant LCount => 19; +use constant VBase => 0x1161; +use constant VFinal => 0x1175; +use constant VCount => 21; +use constant TBase => 0x11A7; +use constant TFinal => 0x11C2; +use constant TCount => 28; + +sub decomposeHangul { + my $sindex = $_[0] - SBase; + my $lindex = int( $sindex / NCount); + my $vindex = int(($sindex % NCount) / TCount); + my $tindex = $sindex % TCount; + my @ret = ( + LBase + $lindex, + VBase + $vindex, + $tindex ? (TBase + $tindex) : (), + ); + return wantarray ? @ret : pack_U(@ret); +} + +########## getting full decomposition ########## + +## converts string "hhhh hhhh hhhh" to a numeric list +## (hex digits separated by spaces) +sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g } + +while ($Combin =~ /(.+)/g) { + my @tab = split /\t/, $1; + my $ini = hex $tab[0]; + if ($tab[1] eq '') { + $Combin{$ini} = $tab[2]; + } else { + $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]); + } +} + +while ($Decomp =~ /(.+)/g) { + my @tab = split /\t/, $1; + my $compat = $tab[2] =~ s/<[^>]+>//; + my $dec = [ _getHexArray($tab[2]) ]; # decomposition + my $ini = hex($tab[0]); # initial decomposable character + my $end = $tab[1] eq '' ? $ini : hex($tab[1]); + # ($ini .. $end) is the range of decomposable characters. + + foreach my $u ($ini .. $end) { + $Compat{$u} = $dec; + $Canon{$u} = $dec if ! $compat; + } +} + +for my $s (@CompEx) { + my $u = hex $s; + next if !$Canon{$u}; # not assigned + next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2 + $Exclus{$u} = 1; +} + +foreach my $u (keys %Canon) { + my $dec = $Canon{$u}; + + if (@$dec == 2) { + if ($Combin{ $dec->[0] }) { + $NonStD{$u} = 1; + } else { + $Compos{ $dec->[0] }{ $dec->[1] } = $u; + $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; + } + } elsif (@$dec == 1) { + $Single{$u} = 1; + } else { + my $h = sprintf '%04X', $u; + croak("Weird Canonical Decomposition of U+$h"); + } +} + +# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo +foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { + $Comp2nd{$j} = 1; +} + +sub getCanonList { + my @src = @_; + my @dec = map { + (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) + : $Canon{$_} ? @{ $Canon{$_} } : $_ + } @src; + return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); + # condition @src == @dec is not ok. +} + +sub getCompatList { + my @src = @_; + my @dec = map { + (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) + : $Compat{$_} ? @{ $Compat{$_} } : $_ + } @src; + return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); + # condition @src == @dec is not ok. +} + +# exhaustive decomposition +foreach my $key (keys %Canon) { + $Canon{$key} = [ getCanonList($key) ]; +} + +# exhaustive decomposition +foreach my $key (keys %Compat) { + $Compat{$key} = [ getCompatList($key) ]; +} + +##### The above part is common to mkheader and PP ##### + +foreach my $comp1st (keys %Compos) { + my $listname = sprintf("${structname}_%06x", $comp1st); + # %04x is bad since it'd place _3046 after _1d157. + $Comp1st{$comp1st} = $listname; + my $rh1st = $Compos{$comp1st}; + + foreach my $comp2nd (keys %$rh1st) { + my $uc = $rh1st->{$comp2nd}; + $CompList{$listname}{$comp2nd} = $uc; + } +} + +sub split_into_char { + use bytes; + my $uni = shift; + my $len = length($uni); + my @ary; + for(my $i = 0; $i < $len; ++$i) { + push @ary, ord(substr($uni,$i,1)); + } + return @ary; +} + +sub _U_stringify { + sprintf '"%s"', join '', + map sprintf("\\x%02x", $_), split_into_char(pack_U(@_)); +} + +foreach my $hash (\%Canon, \%Compat) { + foreach my $key (keys %$hash) { + $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); + } +} + +########## writing header files ########## + +my @boolfunc = ( + { + name => "Exclusion", + type => "bool", + hash => \%Exclus, + }, + { + name => "Singleton", + type => "bool", + hash => \%Single, + }, + { + name => "NonStDecomp", + type => "bool", + hash => \%NonStD, + }, + { + name => "Comp2nd", + type => "bool", + hash => \%Comp2nd, + }, +); + +my $orig_fh = SelectSaver->new; +{ + +my $file = "unfexc.h"; +open FH, ">$file" or croak "$PACKAGE: $file can't be made"; +binmode FH; select FH; + + print << 'EOF'; +/* + * This file is auto-generated by mkheader. + * Any changes here will be lost! + */ +EOF + +foreach my $tbl (@boolfunc) { + my @temp = sort {$a <=> $b} keys %{$tbl->{hash}}; + my $type = $tbl->{type}; + my $name = $tbl->{name}; + print "$type is$name (UV uv)\n{\nreturn\n\t"; + + while (@temp) { + my $cur = shift @temp; + if (@temp && $cur + 1 == $temp[0]) { + print "($cur <= uv && uv <= "; + while (@temp && $cur + 1 == $temp[0]) { + $cur = shift @temp; + } + print "$cur)"; + print "\n\t|| " if @temp; + } else { + print "uv == $cur"; + print "\n\t|| " if @temp; + } + } + print "\n\t? TRUE : FALSE;\n}\n\n"; +} + +close FH; + +#################################### + +my $compinit = + "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; + +foreach my $i (sort keys %CompList) { + $compinit .= "$structname $i [] = {\n"; + $compinit .= join ",\n", + map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), + sort {$a <=> $b } keys %{ $CompList{$i} }; + $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel +} + +my @tripletable = ( + { + file => "unfcmb", + name => "combin", + type => "STDCHAR", + hash => \%Combin, + null => 0, + }, + { + file => "unfcan", + name => "canon", + type => "char*", + hash => \%Canon, + null => "NULL", + }, + { + file => "unfcpt", + name => "compat", + type => "char*", + hash => \%Compat, + null => "NULL", + }, + { + file => "unfcmp", + name => "compos", + type => "$structname *", + hash => \%Comp1st, + null => "NULL", + init => $compinit, + }, +); + +foreach my $tbl (@tripletable) { + my $file = "$tbl->{file}.h"; + my $head = "${prefix}$tbl->{name}"; + my $type = $tbl->{type}; + my $hash = $tbl->{hash}; + my $null = $tbl->{null}; + my $init = $tbl->{init}; + + open FH, ">$file" or croak "$PACKAGE: $file can't be made"; + binmode FH; select FH; + my %val; + + print FH << 'EOF'; +/* + * This file is auto-generated by mkheader. + * Any changes here will be lost! + */ +EOF + + print $init if defined $init; + + foreach my $uv (keys %$hash) { + croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) + unless $uv <= 0x10FFFF; + my @c = unpack 'CCCC', pack 'N', $uv; + $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; + } + + foreach my $p (sort { $a <=> $b } keys %val) { + next if ! $val{ $p }; + for (my $r = 0; $r < 256; $r++) { + next if ! $val{ $p }{ $r }; + printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r; + for (my $c = 0; $c < 256; $c++) { + print "\t", defined $val{$p}{$r}{$c} + ? "($type)".$val{$p}{$r}{$c} + : $null; + print ',' if $c != 255; + print "\n" if $c % 8 == 7; + } + print "};\n\n"; + } + } + foreach my $p (sort { $a <=> $b } keys %val) { + next if ! $val{ $p }; + printf "static $type* ${head}_%02x [256] = {\n", $p; + for (my $r = 0; $r < 256; $r++) { + print $val{ $p }{ $r } + ? sprintf("${head}_%02x_%02x", $p, $r) + : "NULL"; + print ',' if $r != 255; + print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; + } + print "};\n\n"; + } + print "static $type** $head [] = {\n"; + for (my $p = 0; $p <= 0x10; $p++) { + print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; + print ',' if $p != 0x10; + print "\n"; + } + print "};\n\n"; + close FH; +} + +} # End of block for SelectSaver + +1; +__END__ diff --git a/dist/Unicode-Normalize/t/fcdc.t b/dist/Unicode-Normalize/t/fcdc.t new file mode 100644 index 0000000000..d2ef28b9e9 --- /dev/null +++ b/dist/Unicode-Normalize/t/fcdc.t @@ -0,0 +1,138 @@ + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; + exit 0; + } + unless (0x41 == unpack('U', 'A')) { + print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +######################### + +use strict; +use warnings; +BEGIN { $| = 1; print "1..70\n"; } +my $count = 0; +sub ok ($;$) { + my $p = my $r = shift; + if (@_) { + my $x = shift; + $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; + } + print $p ? "ok" : "not ok", ' ', ++$count, "\n"; +} + +use Unicode::Normalize qw(:all); + +ok(1); + +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub hexU { _pack_U map hex, split ' ', shift } +sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" } + +######################### + +ok(FCD(''), ""); +ok(FCC(''), ""); +ok(FCD('A'), "A"); +ok(FCC('A'), "A"); + +ok(normalize('FCD', ""), ""); +ok(normalize('FCC', ""), ""); +ok(normalize('FCC', "A"), "A"); +ok(normalize('FCD', "A"), "A"); + +# 9 + +# if checkFCD is YES, the return value from FCD should be same as the original +ok(FCD(hexU("00C5")), hexU("00C5")); # A with ring above +ok(FCD(hexU("0041 030A")), hexU("0041 030A")); # A+ring +ok(FCD(hexU("0041 0327 030A")), hexU("0041 0327 030A")); # A+cedilla+ring +ok(FCD(hexU("AC01 1100 1161")), hexU("AC01 1100 1161")); # hangul +ok(FCD(hexU("212B F900")), hexU("212B F900")); # compat + +ok(normalize('FCD', hexU("00C5")), hexU("00C5")); +ok(normalize('FCD', hexU("0041 030A")), hexU("0041 030A")); +ok(normalize('FCD', hexU("0041 0327 030A")), hexU("0041 0327 030A")); +ok(normalize('FCD', hexU("AC01 1100 1161")), hexU("AC01 1100 1161")); +ok(normalize('FCD', hexU("212B F900")), hexU("212B F900")); + +# 19 + +# if checkFCD is MAYBE or NO, FCD returns NFD (this behavior isn't documented) +ok(FCD(hexU("00C5 0327")), hexU("0041 0327 030A")); +ok(FCD(hexU("0041 030A 0327")), hexU("0041 0327 030A")); +ok(FCD(hexU("00C5 0327")), NFD(hexU("00C5 0327"))); +ok(FCD(hexU("0041 030A 0327")), NFD(hexU("0041 030A 0327"))); + +ok(normalize('FCD', hexU("00C5 0327")), hexU("0041 0327 030A")); +ok(normalize('FCD', hexU("0041 030A 0327")), hexU("0041 0327 030A")); +ok(normalize('FCD', hexU("00C5 0327")), NFD(hexU("00C5 0327"))); +ok(normalize('FCD', hexU("0041 030A 0327")), NFD(hexU("0041 030A 0327"))); + +# 27 + +ok(answer(checkFCD('')), 'YES'); +ok(answer(checkFCD('A')), 'YES'); +ok(answer(checkFCD("\x{030A}")), 'YES'); # 030A;COMBINING RING ABOVE +ok(answer(checkFCD("\x{0327}")), 'YES'); # 0327;COMBINING CEDILLA +ok(answer(checkFCD(_pack_U(0x00C5))), 'YES'); # A with ring above +ok(answer(checkFCD(hexU("0041 030A"))), 'YES'); # A+ring +ok(answer(checkFCD(hexU("0041 0327 030A"))), 'YES'); # A+cedilla+ring +ok(answer(checkFCD(hexU("0041 030A 0327"))), 'NO'); # A+ring+cedilla +ok(answer(checkFCD(hexU("00C5 0327"))), 'NO'); # A-ring+cedilla +ok(answer(checkNFC(hexU("00C5 0327"))), 'MAYBE'); # NFC: A-ring+cedilla +ok(answer(check("FCD", hexU("00C5 0327"))), 'NO'); +ok(answer(check("NFC", hexU("00C5 0327"))), 'MAYBE'); +ok(answer(checkFCD("\x{AC01}\x{1100}\x{1161}")), 'YES'); # hangul +ok(answer(checkFCD("\x{212B}\x{F900}")), 'YES'); # compat + +ok(answer(checkFCD(hexU("1EA7 05AE 0315 0062"))), "NO"); +ok(answer(checkFCC(hexU("1EA7 05AE 0315 0062"))), "NO"); +ok(answer(check('FCD', hexU("1EA7 05AE 0315 0062"))), "NO"); +ok(answer(check('FCC', hexU("1EA7 05AE 0315 0062"))), "NO"); + +# 45 + +ok(FCC(hexU("00C5 0327")), hexU("0041 0327 030A")); +ok(FCC(hexU("0045 0304 0300")), "\x{1E14}"); +ok(FCC("\x{1100}\x{1161}\x{1100}\x{1173}\x{11AF}"), "\x{AC00}\x{AE00}"); +ok(normalize('FCC', hexU("00C5 0327")), hexU("0041 0327 030A")); +ok(normalize('FCC', hexU("0045 0304 0300")), "\x{1E14}"); +ok(normalize('FCC', hexU("1100 1161 1100 1173 11AF")), "\x{AC00}\x{AE00}"); + +ok(FCC("\x{0B47}\x{0300}\x{0B3E}"), "\x{0B47}\x{0300}\x{0B3E}"); +ok(FCC("\x{1100}\x{0300}\x{1161}"), "\x{1100}\x{0300}\x{1161}"); +ok(FCC("\x{0B47}\x{0B3E}\x{0300}"), "\x{0B4B}\x{0300}"); +ok(FCC("\x{1100}\x{1161}\x{0300}"), "\x{AC00}\x{0300}"); +ok(FCC("\x{0B47}\x{300}\x{0B3E}\x{327}"), "\x{0B47}\x{300}\x{0B3E}\x{327}"); +ok(FCC("\x{1100}\x{300}\x{1161}\x{327}"), "\x{1100}\x{300}\x{1161}\x{327}"); + +# 57 + +ok(answer(checkFCC('')), 'YES'); +ok(answer(checkFCC('A')), 'YES'); +ok(answer(checkFCC("\x{030A}")), 'MAYBE'); # 030A;COMBINING RING ABOVE +ok(answer(checkFCC("\x{0327}")), 'MAYBE'); # 0327;COMBINING CEDILLA +ok(answer(checkFCC(hexU("00C5"))), 'YES'); # A with ring above +ok(answer(checkFCC(hexU("0041 030A"))), 'MAYBE'); # A+ring +ok(answer(checkFCC(hexU("0041 0327 030A"))), 'MAYBE'); # A+cedilla+ring +ok(answer(checkFCC(hexU("0041 030A 0327"))), 'NO'); # A+ring+cedilla +ok(answer(checkFCC(hexU("00C5 0327"))), 'NO'); # A-ring+cedilla +ok(answer(checkFCC("\x{AC01}\x{1100}\x{1161}")), 'MAYBE'); # hangul +ok(answer(checkFCC("\x{212B}\x{F900}")), 'NO'); # compat +ok(answer(checkFCC("\x{212B}\x{0327}")), 'NO'); # compat +ok(answer(checkFCC("\x{0327}\x{212B}")), 'NO'); # compat + +# 70 + diff --git a/dist/Unicode-Normalize/t/form.t b/dist/Unicode-Normalize/t/form.t new file mode 100644 index 0000000000..6bbfb082ca --- /dev/null +++ b/dist/Unicode-Normalize/t/form.t @@ -0,0 +1,84 @@ + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; + exit 0; + } + unless (0x41 == unpack('U', 'A')) { + print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +######################### + +use strict; +use warnings; +BEGIN { $| = 1; print "1..37\n"; } +my $count = 0; +sub ok ($;$) { + my $p = my $r = shift; + if (@_) { + my $x = shift; + $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; + } + print $p ? "ok" : "not ok", ' ', ++$count, "\n"; +} + +use Unicode::Normalize qw(:all); + +ok(1); + +sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" } + +######################### + +ok(NFD ("\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{FF76}"); +ok(NFC ("\x{304C}\x{FF76}"), "\x{304C}\x{FF76}"); +ok(NFKD("\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{30AB}"); +ok(NFKC("\x{304C}\x{FF76}"), "\x{304C}\x{30AB}"); + +ok(answer(checkNFD ("\x{304C}")), "NO"); +ok(answer(checkNFC ("\x{304C}")), "YES"); +ok(answer(checkNFKD("\x{304C}")), "NO"); +ok(answer(checkNFKC("\x{304C}")), "YES"); +ok(answer(checkNFD ("\x{FF76}")), "YES"); +ok(answer(checkNFC ("\x{FF76}")), "YES"); +ok(answer(checkNFKD("\x{FF76}")), "NO"); +ok(answer(checkNFKC("\x{FF76}")), "NO"); + +ok(normalize('D', "\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{FF76}"); +ok(normalize('C', "\x{304C}\x{FF76}"), "\x{304C}\x{FF76}"); +ok(normalize('KD',"\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{30AB}"); +ok(normalize('KC',"\x{304C}\x{FF76}"), "\x{304C}\x{30AB}"); + +ok(answer(check('D', "\x{304C}")), "NO"); +ok(answer(check('C', "\x{304C}")), "YES"); +ok(answer(check('KD',"\x{304C}")), "NO"); +ok(answer(check('KC',"\x{304C}")), "YES"); +ok(answer(check('D' ,"\x{FF76}")), "YES"); +ok(answer(check('C' ,"\x{FF76}")), "YES"); +ok(answer(check('KD',"\x{FF76}")), "NO"); +ok(answer(check('KC',"\x{FF76}")), "NO"); + +ok(normalize('NFD', "\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{FF76}"); +ok(normalize('NFC', "\x{304C}\x{FF76}"), "\x{304C}\x{FF76}"); +ok(normalize('NFKD',"\x{304C}\x{FF76}"), "\x{304B}\x{3099}\x{30AB}"); +ok(normalize('NFKC',"\x{304C}\x{FF76}"), "\x{304C}\x{30AB}"); + +ok(answer(check('NFD', "\x{304C}")), "NO"); +ok(answer(check('NFC', "\x{304C}")), "YES"); +ok(answer(check('NFKD',"\x{304C}")), "NO"); +ok(answer(check('NFKC',"\x{304C}")), "YES"); +ok(answer(check('NFD' ,"\x{FF76}")), "YES"); +ok(answer(check('NFC' ,"\x{FF76}")), "YES"); +ok(answer(check('NFKD',"\x{FF76}")), "NO"); +ok(answer(check('NFKC',"\x{FF76}")), "NO"); + diff --git a/dist/Unicode-Normalize/t/func.t b/dist/Unicode-Normalize/t/func.t new file mode 100644 index 0000000000..2bd6e504a3 --- /dev/null +++ b/dist/Unicode-Normalize/t/func.t @@ -0,0 +1,386 @@ + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; + exit 0; + } + unless (0x41 == unpack('U', 'A')) { + print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +######################### + +use strict; +use warnings; +BEGIN { $| = 1; print "1..217\n"; } +my $count = 0; +sub ok ($;$) { + my $p = my $r = shift; + if (@_) { + my $x = shift; + $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; + } + print $p ? "ok" : "not ok", ' ', ++$count, "\n"; +} + +use Unicode::Normalize qw(:all); + +ok(1); + +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub hexU { _pack_U map hex, split ' ', shift } + +# This won't work on EBCDIC platforms prior to v5.8.0, which is when this +# translation function was defined +*to_native = (defined &utf8::unicode_to_native) + ? \&utf8::unicode_to_native + : sub { return shift }; + +######################### + +ok(getCombinClass( to_native(0)), 0); +ok(getCombinClass(to_native(41)), 0); +ok(getCombinClass(to_native(65)), 0); +ok(getCombinClass( 768), 230); +ok(getCombinClass(1809), 36); + +ok(getCanon(to_native( 0)), undef); +ok(getCanon(to_native(0x29)), undef); +ok(getCanon(to_native(0x41)), undef); +ok(getCanon(to_native(0x00C0)), _pack_U(0x0041, 0x0300)); +ok(getCanon(to_native(0x00EF)), _pack_U(0x0069, 0x0308)); +ok(getCanon(0x304C), _pack_U(0x304B, 0x3099)); +ok(getCanon(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301)); +ok(getCanon(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)); +ok(getCanon(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)); +ok(getCanon(0xAC00), _pack_U(0x1100, 0x1161)); +ok(getCanon(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF)); +ok(getCanon(0x212C), undef); +ok(getCanon(0x3243), undef); +ok(getCanon(0xFA2D), _pack_U(0x9DB4)); + +# 20 + +ok(getCompat(to_native( 0)), undef); +ok(getCompat(to_native(0x29)), undef); +ok(getCompat(to_native(0x41)), undef); +ok(getCompat(to_native(0x00C0)), _pack_U(0x0041, 0x0300)); +ok(getCompat(to_native(0x00EF)), _pack_U(0x0069, 0x0308)); +ok(getCompat(0x304C), _pack_U(0x304B, 0x3099)); +ok(getCompat(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301)); +ok(getCompat(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)); +ok(getCompat(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)); +ok(getCompat(0x212C), _pack_U(0x0042)); +ok(getCompat(0x3243), _pack_U(0x0028, 0x81F3, 0x0029)); +ok(getCompat(0xAC00), _pack_U(0x1100, 0x1161)); +ok(getCompat(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF)); +ok(getCompat(0xFA2D), _pack_U(0x9DB4)); + +# 34 + +ok(getComposite(to_native( 0), to_native( 0)), undef); +ok(getComposite(to_native( 0), to_native(0x29)), undef); +ok(getComposite(to_native(0x29), to_native( 0)), undef); +ok(getComposite(to_native(0x29), to_native(0x29)), undef); +ok(getComposite(to_native( 0), to_native(0x41)), undef); +ok(getComposite(to_native(0x41), to_native( 0)), undef); +ok(getComposite(to_native(0x41), to_native(0x41)), undef); +ok(getComposite(to_native(12), to_native(0x0300)), undef); +ok(getComposite(to_native(0x0055), 0xFF00), undef); +ok(getComposite(to_native(0x0041), 0x0300), to_native(0x00C0)); +ok(getComposite(to_native(0x0055), 0x0300), to_native(0x00D9)); +ok(getComposite(0x0112, 0x0300), 0x1E14); +ok(getComposite(0x1100, 0x1161), 0xAC00); +ok(getComposite(0x1100, 0x1173), 0xADF8); +ok(getComposite(0x1100, 0x11AF), undef); +ok(getComposite(0x1173, 0x11AF), undef); +ok(getComposite(0xAC00, 0x11A7), undef); +ok(getComposite(0xAC00, 0x11A8), 0xAC01); +ok(getComposite(0xADF8, 0x11AF), 0xAE00); + +# 53 + +sub uprops { + my $uv = shift; + my $r = ""; + $r .= isExclusion($uv) ? 'X' : 'x'; + $r .= isSingleton($uv) ? 'S' : 's'; + $r .= isNonStDecomp($uv) ? 'N' : 'n'; # Non-Starter Decomposition + $r .= isComp_Ex($uv) ? 'F' : 'f'; # Full exclusion (X + S + N) + $r .= isComp2nd($uv) ? 'B' : 'b'; # B = M = Y + $r .= isNFD_NO($uv) ? 'D' : 'd'; + $r .= isNFC_MAYBE($uv) ? 'M' : 'm'; # Maybe + $r .= isNFC_NO($uv) ? 'C' : 'c'; + $r .= isNFKD_NO($uv) ? 'K' : 'k'; + $r .= isNFKC_MAYBE($uv) ? 'Y' : 'y'; # maYbe + $r .= isNFKC_NO($uv) ? 'G' : 'g'; + return $r; +} + +ok(uprops(to_native(0x0000)), 'xsnfbdmckyg'); # NULL +ok(uprops(to_native(0x0029)), 'xsnfbdmckyg'); # RIGHT PARENTHESIS +ok(uprops(to_native(0x0041)), 'xsnfbdmckyg'); # LATIN CAPITAL LETTER A +ok(uprops(to_native(0x00A0)), 'xsnfbdmcKyG'); # NO-BREAK SPACE +ok(uprops(to_native(0x00C0)), 'xsnfbDmcKyg'); # LATIN CAPITAL LETTER A WITH GRAVE +ok(uprops(0x0300), 'xsnfBdMckYg'); # COMBINING GRAVE ACCENT +ok(uprops(0x0344), 'xsNFbDmCKyG'); # COMBINING GREEK DIALYTIKA TONOS +ok(uprops(0x0387), 'xSnFbDmCKyG'); # GREEK ANO TELEIA +ok(uprops(0x0958), 'XsnFbDmCKyG'); # DEVANAGARI LETTER QA +ok(uprops(0x0F43), 'XsnFbDmCKyG'); # TIBETAN LETTER GHA +ok(uprops(0x1100), 'xsnfbdmckyg'); # HANGUL CHOSEONG KIYEOK +ok(uprops(0x1161), 'xsnfBdMckYg'); # HANGUL JUNGSEONG A +ok(uprops(0x11AF), 'xsnfBdMckYg'); # HANGUL JONGSEONG RIEUL +ok(uprops(0x212B), 'xSnFbDmCKyG'); # ANGSTROM SIGN +ok(uprops(0xAC00), 'xsnfbDmcKyg'); # HANGUL SYLLABLE GA +ok(uprops(0xF900), 'xSnFbDmCKyG'); # CJK COMPATIBILITY IDEOGRAPH-F900 +ok(uprops(0xFB4E), 'XsnFbDmCKyG'); # HEBREW LETTER PE WITH RAFE +ok(uprops(0xFF71), 'xsnfbdmcKyG'); # HALFWIDTH KATAKANA LETTER A + +# 71 + +ok(decompose(""), ""); +ok(decompose("A"), "A"); +ok(decompose("", 1), ""); +ok(decompose("A", 1), "A"); + +ok(decompose(hexU("1E14 AC01")), hexU("0045 0304 0300 1100 1161 11A8")); +ok(decompose(hexU("AC00 AE00")), hexU("1100 1161 1100 1173 11AF")); +ok(decompose(hexU("304C FF76")), hexU("304B 3099 FF76")); + +ok(decompose(hexU("1E14 AC01"), 1), hexU("0045 0304 0300 1100 1161 11A8")); +ok(decompose(hexU("AC00 AE00"), 1), hexU("1100 1161 1100 1173 11AF")); +ok(decompose(hexU("304C FF76"), 1), hexU("304B 3099 30AB")); + +# don't modify the source +my $sDec = "\x{FA19}"; +ok(decompose($sDec), "\x{795E}"); +ok($sDec, "\x{FA19}"); + +# 83 + +ok(reorder(""), ""); +ok(reorder("A"), "A"); +ok(reorder(hexU("0041 0300 0315 0313 031b 0061")), + hexU("0041 031b 0300 0313 0315 0061")); +ok(reorder(hexU("00C1 0300 0315 0313 031b 0061 309A 3099")), + hexU("00C1 031b 0300 0313 0315 0061 309A 3099")); + +# don't modify the source +my $sReord = "\x{3000}\x{300}\x{31b}"; +ok(reorder($sReord), "\x{3000}\x{31b}\x{300}"); +ok($sReord, "\x{3000}\x{300}\x{31b}"); + +# 89 + +ok(compose(""), ""); +ok(compose("A"), "A"); +ok(compose(hexU("0061 0300")), hexU("00E0")); +ok(compose(hexU("0061 0300 031B")), hexU("00E0 031B")); +ok(compose(hexU("0061 0300 0315")), hexU("00E0 0315")); +ok(compose(hexU("0061 0300 0313")), hexU("00E0 0313")); +ok(compose(hexU("0061 031B 0300")), hexU("00E0 031B")); +ok(compose(hexU("0061 0315 0300")), hexU("0061 0315 0300")); +ok(compose(hexU("0061 0313 0300")), hexU("0061 0313 0300")); + +# don't modify the source +my $sCom = "\x{304B}\x{3099}"; +ok(compose($sCom), "\x{304C}"); +ok($sCom, "\x{304B}\x{3099}"); + +# 100 + +ok(composeContiguous(""), ""); +ok(composeContiguous("A"), "A"); +ok(composeContiguous(hexU("0061 0300")), hexU("00E0")); +ok(composeContiguous(hexU("0061 0300 031B")), hexU("00E0 031B")); +ok(composeContiguous(hexU("0061 0300 0315")), hexU("00E0 0315")); +ok(composeContiguous(hexU("0061 0300 0313")), hexU("00E0 0313")); +ok(composeContiguous(hexU("0061 031B 0300")), hexU("0061 031B 0300")); +ok(composeContiguous(hexU("0061 0315 0300")), hexU("0061 0315 0300")); +ok(composeContiguous(hexU("0061 0313 0300")), hexU("0061 0313 0300")); + +# don't modify the source +my $sCtg = "\x{30DB}\x{309A}"; +ok(composeContiguous($sCtg), "\x{30DD}"); +ok($sCtg, "\x{30DB}\x{309A}"); + +# 111 + +sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" } + +ok(answer(checkNFD("")), "YES"); +ok(answer(checkNFC("")), "YES"); +ok(answer(checkNFKD("")), "YES"); +ok(answer(checkNFKC("")), "YES"); +ok(answer(check("NFD", "")), "YES"); +ok(answer(check("NFC", "")), "YES"); +ok(answer(check("NFKD","")), "YES"); +ok(answer(check("NFKC","")), "YES"); + +# U+0000 to U+007F are prenormalized in all the normalization forms. +ok(answer(checkNFD("AZaz\t12!#`")), "YES"); +ok(answer(checkNFC("AZaz\t12!#`")), "YES"); +ok(answer(checkNFKD("AZaz\t12!#`")), "YES"); +ok(answer(checkNFKC("AZaz\t12!#`")), "YES"); +ok(answer(check("D", "AZaz\t12!#`")), "YES"); +ok(answer(check("C", "AZaz\t12!#`")), "YES"); +ok(answer(check("KD","AZaz\t12!#`")), "YES"); +ok(answer(check("KC","AZaz\t12!#`")), "YES"); + +ok(answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))), "YES"); +ok(answer(checkNFD(hexU("20 C1 1100 1173 11AF"))), "NO"); +ok(answer(checkNFC(hexU("20 C1 1173 11AF"))), "MAYBE"); +ok(answer(checkNFC(hexU("20 C1 AE00 1100"))), "YES"); +ok(answer(checkNFC(hexU("20 C1 AE00 1100 0300"))), "MAYBE"); +ok(answer(checkNFC(hexU("212B 1100 0300"))), "NO"); +ok(answer(checkNFC(hexU("1100 0300 212B"))), "NO"); +ok(answer(checkNFC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring +ok(answer(checkNFC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla +ok(answer(checkNFC(hexU("20 C1 FF71 2025"))),"YES"); +ok(answer(check("NFC", hexU("20 C1 212B 300"))), "NO"); +ok(answer(checkNFKD(hexU("20 C1 FF71 2025"))), "NO"); +ok(answer(checkNFKC(hexU("20 C1 AE00 2025"))), "NO"); +ok(answer(checkNFKC(hexU("212B 1100 0300"))), "NO"); +ok(answer(checkNFKC(hexU("1100 0300 212B"))), "NO"); +ok(answer(checkNFKC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring +ok(answer(checkNFKC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla +ok(answer(check("NFKC", hexU("20 C1 212B 300"))), "NO"); + +# 145 + +"012ABC" =~ /(\d+)(\w+)/; +ok("012" eq NFC $1 && "ABC" eq NFC $2); + +ok(normalize('C', $1), "012"); +ok(normalize('C', $2), "ABC"); + +ok(normalize('NFC', $1), "012"); +ok(normalize('NFC', $2), "ABC"); + # s/^NF// in normalize() must not prevent using $1, $&, etc. + +# 150 + +# a string with initial zero should be treated like a number + +# LATIN CAPITAL LETTER A WITH GRAVE +ok(getCombinClass(sprintf("0%d", to_native(192))), 0); +ok(getCanon (sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300)); +ok(getCompat(sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300)); +my $lead_zero = sprintf "0%d", to_native(65); +ok(getComposite($lead_zero, "0768"), to_native(192)); +ok(isNFD_NO (sprintf("0%d", to_native(192)))); +ok(isNFKD_NO(sprintf("0%d", to_native(192)))); + +# DEVANAGARI LETTER QA +ok(isExclusion("02392")); +ok(isComp_Ex ("02392")); +ok(isNFC_NO ("02392")); +ok(isNFKC_NO ("02392")); +ok(isNFD_NO ("02392")); +ok(isNFKD_NO ("02392")); + +# ANGSTROM SIGN +ok(isSingleton("08491")); +ok(isComp_Ex ("08491")); +ok(isNFC_NO ("08491")); +ok(isNFKC_NO ("08491")); +ok(isNFD_NO ("08491")); +ok(isNFKD_NO ("08491")); + +# COMBINING GREEK DIALYTIKA TONOS +ok(isNonStDecomp("0836")); +ok(isComp_Ex ("0836")); +ok(isNFC_NO ("0836")); +ok(isNFKC_NO ("0836")); +ok(isNFD_NO ("0836")); +ok(isNFKD_NO ("0836")); + +# COMBINING GRAVE ACCENT +ok(getCombinClass("0768"), 230); +ok(isComp2nd ("0768")); +ok(isNFC_MAYBE ("0768")); +ok(isNFKC_MAYBE("0768")); + +# HANGUL SYLLABLE GA +ok(getCombinClass("044032"), 0); +ok(getCanon("044032"), _pack_U(0x1100, 0x1161)); +ok(getCompat("044032"), _pack_U(0x1100, 0x1161)); +ok(getComposite("04352", "04449"), 0xAC00); + +# 182 + +# string with 22 combining characters: (0x300..0x315) +my $str_cc22 = _pack_U(0x3041, 0x300..0x315, 0x3042); +ok(decompose($str_cc22), $str_cc22); +ok(reorder($str_cc22), $str_cc22); +ok(compose($str_cc22), $str_cc22); +ok(composeContiguous($str_cc22), $str_cc22); +ok(NFD($str_cc22), $str_cc22); +ok(NFC($str_cc22), $str_cc22); +ok(NFKD($str_cc22), $str_cc22); +ok(NFKC($str_cc22), $str_cc22); +ok(FCD($str_cc22), $str_cc22); +ok(FCC($str_cc22), $str_cc22); + +# 192 + +# string with 40 combining characters of the same class: (0x300..0x313)x2 +my $str_cc40 = _pack_U(0x3041, 0x300..0x313, 0x300..0x313, 0x3042); +ok(decompose($str_cc40), $str_cc40); +ok(reorder($str_cc40), $str_cc40); +ok(compose($str_cc40), $str_cc40); +ok(composeContiguous($str_cc40), $str_cc40); +ok(NFD($str_cc40), $str_cc40); +ok(NFC($str_cc40), $str_cc40); +ok(NFKD($str_cc40), $str_cc40); +ok(NFKC($str_cc40), $str_cc40); +ok(FCD($str_cc40), $str_cc40); +ok(FCC($str_cc40), $str_cc40); + +# 202 + +my $precomp = hexU("304C 304E 3050 3052 3054"); +my $combseq = hexU("304B 3099 304D 3099 304F 3099 3051 3099 3053 3099"); +ok(decompose($precomp x 5), $combseq x 5); +ok(decompose($precomp x 10), $combseq x 10); +ok(decompose($precomp x 20), $combseq x 20); + +my $hangsyl = hexU("AC00 B098 B2E4 B77C B9C8"); +my $jamoseq = hexU("1100 1161 1102 1161 1103 1161 1105 1161 1106 1161"); +ok(decompose($hangsyl x 5), $jamoseq x 5); +ok(decompose($hangsyl x 10), $jamoseq x 10); +ok(decompose($hangsyl x 20), $jamoseq x 20); + +my $notcomp = hexU("304B 304D 304F 3051 3053"); +ok(decompose($precomp . $notcomp), $combseq . $notcomp); +ok(decompose($precomp . $notcomp x 5), $combseq . $notcomp x 5); +ok(decompose($precomp . $notcomp x10), $combseq . $notcomp x10); + +# 211 + +my $preUnicode3_1 = !defined getCanon(0x1D15E); +my $preUnicode3_2 = !defined getCanon(0x2ADC); + +# HEBREW LETTER YOD WITH HIRIQ +ok($preUnicode3_1 xor isExclusion(0xFB1D)); +ok($preUnicode3_1 xor isComp_Ex (0xFB1D)); + +# MUSICAL SYMBOL HALF NOTE +ok($preUnicode3_1 xor isExclusion(0x1D15E)); +ok($preUnicode3_1 xor isComp_Ex (0x1D15E)); + +# FORKING +ok($preUnicode3_2 xor isExclusion(0x2ADC)); +ok($preUnicode3_2 xor isComp_Ex (0x2ADC)); + +# 217 + diff --git a/dist/Unicode-Normalize/t/illegal.t b/dist/Unicode-Normalize/t/illegal.t new file mode 100644 index 0000000000..ccf2b4aae6 --- /dev/null +++ b/dist/Unicode-Normalize/t/illegal.t @@ -0,0 +1,85 @@ + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; + exit 0; + } + unless (0x41 == unpack('U', 'A')) { + print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +BEGIN { + unless (5.006001 <= $]) { + print "1..0 # skipped: Perl 5.6.1 or later". + " needed for this test\n"; + exit; + } +} + +######################### + +BEGIN { + use Unicode::Normalize qw(:all); + + unless (exists &Unicode::Normalize::bootstrap or 5.008 <= $]) { + print "1..0 # skipped: XSUB, or Perl 5.8.0 or later". + " needed for this test\n"; + print $@; + exit; + } +} + +use strict; +use warnings; + +BEGIN { $| = 1; print "1..113\n"; } +my $count = 0; +sub ok ($;$) { + my $p = my $r = shift; + if (@_) { + my $x = shift; + $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; + } + print $p ? "ok" : "not ok", ' ', ++$count, "\n"; +} + +ok(1); + +######################### + +no warnings qw(utf8); + +for my $u (0xD800, 0xDFFF, 0xFDD0, 0xFDEF, 0xFEFF, 0xFFFE, 0xFFFF, + 0x1FFFF, 0x10FFFF, 0x110000, 0x3FFFFFFF) +{ + my $c = chr $u; + ok($c eq NFD($c)); # 1 + ok($c eq NFC($c)); # 2 + ok($c eq NFKD($c)); # 3 + ok($c eq NFKC($c)); # 4 + ok($c eq FCD($c)); # 5 + ok($c eq FCC($c)); # 6 + ok($c eq decompose($c)); # 7 + ok($c eq decompose($c,1)); # 8 + ok($c eq reorder($c)); # 9 + ok($c eq compose($c)); # 10 +} + +our $proc; # before the last starter +our $unproc; # the last starter and after + +sub _pack_U { Unicode::Normalize::pack_U(@_) } + +($proc, $unproc) = splitOnLastStarter(_pack_U(0x41, 0x300, 0x327, 0xFFFF)); +ok($proc eq _pack_U(0x41, 0x300, 0x327)); +ok($unproc eq "\x{FFFF}"); + diff --git a/dist/Unicode-Normalize/t/norm.t b/dist/Unicode-Normalize/t/norm.t new file mode 100644 index 0000000000..d3cec3aea1 --- /dev/null +++ b/dist/Unicode-Normalize/t/norm.t @@ -0,0 +1,145 @@ + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; + exit 0; + } + unless (0x41 == unpack('U', 'A')) { + print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +######################### + +use strict; +use warnings; +BEGIN { $| = 1; print "1..64\n"; } +my $count = 0; +sub ok ($;$) { + my $p = my $r = shift; + if (@_) { + my $x = shift; + $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; + } + print $p ? "ok" : "not ok", ' ', ++$count, "\n"; +} + +use Unicode::Normalize qw(normalize); + +ok(1); + +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub _unpack_U { Unicode::Normalize::unpack_U(@_) } + +######################### + +ok(normalize('D', ""), ""); +ok(normalize('C', ""), ""); +ok(normalize('KD',""), ""); +ok(normalize('KC',""), ""); + +ok(normalize('D', "A"), "A"); +ok(normalize('C', "A"), "A"); +ok(normalize('KD',"A"), "A"); +ok(normalize('KC',"A"), "A"); + +ok(normalize('NFD', ""), ""); +ok(normalize('NFC', ""), ""); +ok(normalize('NFKD',""), ""); +ok(normalize('NFKC',""), ""); + +ok(normalize('NFD', "A"), "A"); +ok(normalize('NFC', "A"), "A"); +ok(normalize('NFKD',"A"), "A"); +ok(normalize('NFKC',"A"), "A"); + +# 17 + +# don't modify the source +my $sNFD = "\x{FA19}"; +ok(normalize('NFD', $sNFD), "\x{795E}"); +ok($sNFD, "\x{FA19}"); + +my $sNFC = "\x{FA1B}"; +ok(normalize('NFC', $sNFC), "\x{798F}"); +ok($sNFC, "\x{FA1B}"); + +my $sNFKD = "\x{FA1E}"; +ok(normalize('NFKD', $sNFKD), "\x{7FBD}"); +ok($sNFKD, "\x{FA1E}"); + +my $sNFKC = "\x{FA26}"; +ok(normalize('NFKC', $sNFKC), "\x{90FD}"); +ok($sNFKC, "\x{FA26}"); + +# 25 + +sub hexNFC { + join " ", map sprintf("%04X", $_), + _unpack_U normalize 'C', _pack_U map hex, split ' ', shift; +} +sub hexNFD { + join " ", map sprintf("%04X", $_), + _unpack_U normalize 'D', _pack_U map hex, split ' ', shift; +} + +ok(hexNFD("1E14 AC01"), "0045 0304 0300 1100 1161 11A8"); +ok(hexNFD("AC00 AE00"), "1100 1161 1100 1173 11AF"); + +ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062"); +ok(hexNFC("00E0 05AE 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); +ok(hexNFC("0061 05AE 0300 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); +ok(hexNFC("0045 0304 0300 AC00 11A8"), "1E14 AC01"); +ok(hexNFC("1100 1161 1100 1173 11AF"), "AC00 AE00"); +ok(hexNFC("1100 0300 1161 1173 11AF"), "1100 0300 1161 1173 11AF"); + +ok(hexNFD("0061 0315 0300 05AE 05C4 0062"), "0061 05AE 0300 05C4 0315 0062"); +ok(hexNFD("00E0 05AE 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062"); +ok(hexNFD("0061 05AE 0300 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062"); +ok(hexNFC("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFC("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFD("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000"); +ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000"); + +ok(hexNFC("AC00 11A7"), "AC00 11A7"); +ok(hexNFC("AC00 11A8"), "AC01"); +ok(hexNFC("AC00 11A9"), "AC02"); +ok(hexNFC("AC00 11C2"), "AC1B"); +ok(hexNFC("AC00 11C3"), "AC00 11C3"); + +# 47 + +# Test Cases from Public Review Issue #29: Normalization Issue +# cf. http://www.unicode.org/review/pr-29.html +ok(hexNFC("0B47 0300 0B3E"), "0B47 0300 0B3E"); +ok(hexNFC("1100 0300 1161"), "1100 0300 1161"); +ok(hexNFC("0B47 0B3E 0300"), "0B4B 0300"); +ok(hexNFC("1100 1161 0300"), "AC00 0300"); +ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327"); +ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327"); + +ok(hexNFC("0300 0041"), "0300 0041"); +ok(hexNFC("0300 0301 0041"), "0300 0301 0041"); +ok(hexNFC("0301 0300 0041"), "0301 0300 0041"); +ok(hexNFC("0000 0300 0000 0301"), "0000 0300 0000 0301"); +ok(hexNFC("0000 0301 0000 0300"), "0000 0301 0000 0300"); + +ok(hexNFC("0327 0061 0300"), "0327 00E0"); +ok(hexNFC("0301 0061 0300"), "0301 00E0"); +ok(hexNFC("0315 0061 0300"), "0315 00E0"); +ok(hexNFC("0000 0327 0061 0300"), "0000 0327 00E0"); +ok(hexNFC("0000 0301 0061 0300"), "0000 0301 00E0"); +ok(hexNFC("0000 0315 0061 0300"), "0000 0315 00E0"); + +# 64 + diff --git a/dist/Unicode-Normalize/t/null.t b/dist/Unicode-Normalize/t/null.t new file mode 100644 index 0000000000..9a0008708e --- /dev/null +++ b/dist/Unicode-Normalize/t/null.t @@ -0,0 +1,100 @@ + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; + exit 0; + } + unless (0x41 == unpack('U', 'A')) { + print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +######################### + +use strict; +use warnings; + +use Unicode::Normalize qw(:all); +print "1..24\n"; + +print "ok 1\n"; + +# if $_ is not NULL-terminated, test may fail. + +$_ = compose('abc'); +print /c$/ ? "ok" : "not ok", " 2\n"; + +$_ = decompose('abc'); +print /c$/ ? "ok" : "not ok", " 3\n"; + +$_ = reorder('abc'); +print /c$/ ? "ok" : "not ok", " 4\n"; + +$_ = NFD('abc'); +print /c$/ ? "ok" : "not ok", " 5\n"; + +$_ = NFC('abc'); +print /c$/ ? "ok" : "not ok", " 6\n"; + +$_ = NFKD('abc'); +print /c$/ ? "ok" : "not ok", " 7\n"; + +$_ = NFKC('abc'); +print /c$/ ? "ok" : "not ok", " 8\n"; + +$_ = FCC('abc'); +print /c$/ ? "ok" : "not ok", " 9\n"; + +$_ = decompose("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 10\n"; + +$_ = decompose("\x{304B}\x{3099}abc"); +print /c$/ ? "ok" : "not ok", " 11\n"; + +$_ = reorder("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 12\n"; + +$_ = reorder("\x{304B}\x{3099}abc"); +print /c$/ ? "ok" : "not ok", " 13\n"; + +$_ = compose("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 14\n"; + +$_ = compose("\x{304B}\x{3099}abc"); +print /c$/ ? "ok" : "not ok", " 15\n"; + +$_ = NFD("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 16\n"; + +$_ = NFC("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 17\n"; + +$_ = NFKD("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 18\n"; + +$_ = NFKC("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 19\n"; + +$_ = FCC("\x{304C}abc"); +print /c$/ ? "ok" : "not ok", " 20\n"; + +$_ = getCanon(0x100); +print s/.$// ? "ok" : "not ok", " 21\n"; + +$_ = getCompat(0x100); +print s/.$// ? "ok" : "not ok", " 22\n"; + +$_ = getCanon(0xAC00); +print s/.$// ? "ok" : "not ok", " 23\n"; + +$_ = getCompat(0xAC00); +print s/.$// ? "ok" : "not ok", " 24\n"; + diff --git a/dist/Unicode-Normalize/t/partial1.t b/dist/Unicode-Normalize/t/partial1.t new file mode 100644 index 0000000000..3e44a63dc0 --- /dev/null +++ b/dist/Unicode-Normalize/t/partial1.t @@ -0,0 +1,120 @@ + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; + exit 0; + } + unless (0x41 == unpack('U', 'A')) { + print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +BEGIN { + unless (5.006001 <= $]) { + print "1..0 # skipped: Perl 5.6.1 or later". + " needed for this test\n"; + exit; + } +} + +######################### + +use strict; +use warnings; +BEGIN { $| = 1; print "1..26\n"; } +my $count = 0; +sub ok ($;$) { + my $p = my $r = shift; + if (@_) { + my $x = shift; + $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; + } + print $p ? "ok" : "not ok", ' ', ++$count, "\n"; +} + +use Unicode::Normalize qw(:all); + +ok(1); + +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub _unpack_U { Unicode::Normalize::unpack_U(@_) } + +######################### + +sub arraynorm { + my $form = shift; + my @string = @_; + my $result = ""; + my $unproc = ""; + foreach my $str (@string) { + $unproc .= $str; + $result .= $form eq 'NFC' ? NFC_partial ($unproc) : + $form eq 'NFD' ? NFD_partial ($unproc) : + $form eq 'NFKC' ? NFKC_partial($unproc) : + $form eq 'NFKD' ? NFKD_partial($unproc) : + undef; + } + $result .= $unproc; + return $result; +} + +my $strD = "\x{3C9}\x{301}\x{1100}\x{1161}\x{11A8}\x{1100}\x{1161}\x{11AA}"; +my $strC = "\x{3CE}\x{AC01}\x{AC03}"; +my @str1 = (substr($strD,0,3), substr($strD,3,4), substr($strD,7)); +my @str2 = (substr($strD,0,1), substr($strD,1,3), substr($strD,4)); +ok($strC eq NFC($strD)); +ok($strD eq join('', @str1)); +ok($strC eq arraynorm('NFC', @str1)); +ok($strD eq join('', @str2)); +ok($strC eq arraynorm('NFC', @str2)); + +my @strX = ("\x{300}\x{AC00}", "\x{11A8}"); +my $strX = "\x{300}\x{AC01}"; +ok($strX eq NFC(join('', @strX))); +ok($strX eq arraynorm('NFC', @strX)); +ok($strX eq NFKC(join('', @strX))); +ok($strX eq arraynorm('NFKC', @strX)); + +my @strY = ("\x{304B}\x{0308}", "\x{0323}\x{3099}"); +my $strY = ("\x{304C}\x{0323}\x{0308}"); +ok($strY eq NFC(join('', @strY))); +ok($strY eq arraynorm('NFC', @strY)); +ok($strY eq NFKC(join('', @strY))); +ok($strY eq arraynorm('NFKC', @strY)); + +my @strZ = ("\x{304B}\x{0308}", "\x{0323}", "\x{3099}"); +my $strZ = ("\x{304B}\x{3099}\x{0323}\x{0308}"); +ok($strZ eq NFD(join('', @strZ))); +ok($strZ eq arraynorm('NFD', @strZ)); +ok($strZ eq NFKD(join('', @strZ))); +ok($strZ eq arraynorm('NFKD', @strZ)); + +# 18 + +# must modify the source +my $sNFD = "\x{FA19}"; +ok(NFD_partial($sNFD), ""); +ok($sNFD, "\x{795E}"); + +my $sNFC = "\x{FA1B}"; +ok(NFC_partial($sNFC), ""); +ok($sNFC, "\x{798F}"); + +my $sNFKD = "\x{FA1E}"; +ok(NFKD_partial($sNFKD), ""); +ok($sNFKD, "\x{7FBD}"); + +my $sNFKC = "\x{FA26}"; +ok(NFKC_partial($sNFKC), ""); +ok($sNFKC, "\x{90FD}"); + +# 26 + diff --git a/dist/Unicode-Normalize/t/partial2.t b/dist/Unicode-Normalize/t/partial2.t new file mode 100644 index 0000000000..7f19e9365b --- /dev/null +++ b/dist/Unicode-Normalize/t/partial2.t @@ -0,0 +1,116 @@ + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; + exit 0; + } + unless (0x41 == unpack('U', 'A')) { + print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +BEGIN { + unless (5.006001 <= $]) { + print "1..0 # skipped: Perl 5.6.1 or later". + " needed for this test\n"; + exit; + } +} + +######################### + +use strict; +use warnings; +BEGIN { $| = 1; print "1..26\n"; } +my $count = 0; +sub ok ($;$) { + my $p = my $r = shift; + if (@_) { + my $x = shift; + $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; + } + print $p ? "ok" : "not ok", ' ', ++$count, "\n"; +} + +use Unicode::Normalize qw(:all); + +ok(1); + +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub _unpack_U { Unicode::Normalize::unpack_U(@_) } + +######################### + +sub arraynorm { + my $form = shift; + my @string = @_; + my $result = ""; + my $unproc = ""; + foreach my $str (@string) { + $unproc .= $str; + $result .= normalize_partial($form, $unproc); + } + $result .= $unproc; + return $result; +} + +my $strD = "\x{3C9}\x{301}\x{1100}\x{1161}\x{11A8}\x{1100}\x{1161}\x{11AA}"; +my $strC = "\x{3CE}\x{AC01}\x{AC03}"; +my @str1 = (substr($strD,0,3), substr($strD,3,4), substr($strD,7)); +my @str2 = (substr($strD,0,1), substr($strD,1,3), substr($strD,4)); +ok($strC eq NFC($strD)); +ok($strD eq join('', @str1)); +ok($strC eq arraynorm('NFC', @str1)); +ok($strD eq join('', @str2)); +ok($strC eq arraynorm('NFC', @str2)); + +my @strX = ("\x{300}\x{AC00}", "\x{11A8}"); +my $strX = "\x{300}\x{AC01}"; +ok($strX eq NFC(join('', @strX))); +ok($strX eq arraynorm('NFC', @strX)); +ok($strX eq NFKC(join('', @strX))); +ok($strX eq arraynorm('NFKC', @strX)); + +my @strY = ("\x{304B}\x{0308}", "\x{0323}\x{3099}"); +my $strY = ("\x{304C}\x{0323}\x{0308}"); +ok($strY eq NFC(join('', @strY))); +ok($strY eq arraynorm('NFC', @strY)); +ok($strY eq NFKC(join('', @strY))); +ok($strY eq arraynorm('NFKC', @strY)); + +my @strZ = ("\x{304B}\x{0308}", "\x{0323}", "\x{3099}"); +my $strZ = ("\x{304B}\x{3099}\x{0323}\x{0308}"); +ok($strZ eq NFD(join('', @strZ))); +ok($strZ eq arraynorm('NFD', @strZ)); +ok($strZ eq NFKD(join('', @strZ))); +ok($strZ eq arraynorm('NFKD', @strZ)); + +# 18 + +# must modify the source +my $sNFD = "\x{FA19}"; +ok(normalize_partial('NFD', $sNFD), ""); +ok($sNFD, "\x{795E}"); + +my $sNFC = "\x{FA1B}"; +ok(normalize_partial('NFC', $sNFC), ""); +ok($sNFC, "\x{798F}"); + +my $sNFKD = "\x{FA1E}"; +ok(normalize_partial('NFKD', $sNFKD), ""); +ok($sNFKD, "\x{7FBD}"); + +my $sNFKC = "\x{FA26}"; +ok(normalize_partial('NFKC', $sNFKC), ""); +ok($sNFKC, "\x{90FD}"); + +# 26 + diff --git a/dist/Unicode-Normalize/t/proto.t b/dist/Unicode-Normalize/t/proto.t new file mode 100644 index 0000000000..38c6985759 --- /dev/null +++ b/dist/Unicode-Normalize/t/proto.t @@ -0,0 +1,99 @@ + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; + exit 0; + } + unless (0x41 == unpack('U', 'A')) { + print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +######################### + +use strict; +use warnings; +BEGIN { $| = 1; print "1..48\n"; } +my $count = 0; +sub ok ($;$) { + my $p = my $r = shift; + if (@_) { + my $x = shift; + $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; + } + print $p ? "ok" : "not ok", ' ', ++$count, "\n"; +} + +use Unicode::Normalize qw(:all); + +ok(1); + +######################### + +# unary op. RING-CEDILLA +ok( "\x{30A}\x{327}" ne "\x{327}\x{30A}"); +ok(NFD "\x{30A}\x{327}" eq "\x{327}\x{30A}"); +ok(NFC "\x{30A}\x{327}" eq "\x{327}\x{30A}"); +ok(NFKD "\x{30A}\x{327}" eq "\x{327}\x{30A}"); +ok(NFKC "\x{30A}\x{327}" eq "\x{327}\x{30A}"); +ok(FCD "\x{30A}\x{327}" eq "\x{327}\x{30A}"); +ok(FCC "\x{30A}\x{327}" eq "\x{327}\x{30A}"); +ok(reorder "\x{30A}\x{327}" eq "\x{327}\x{30A}"); + +# 9 + +ok(prototype \&normalize,'$$'); +ok(prototype \&NFD, '$'); +ok(prototype \&NFC, '$'); +ok(prototype \&NFKD, '$'); +ok(prototype \&NFKC, '$'); +ok(prototype \&FCD, '$'); +ok(prototype \&FCC, '$'); + +ok(prototype \&check, '$$'); +ok(prototype \&checkNFD, '$'); +ok(prototype \&checkNFC, '$'); +ok(prototype \&checkNFKD,'$'); +ok(prototype \&checkNFKC,'$'); +ok(prototype \&checkFCD, '$'); +ok(prototype \&checkFCC, '$'); + +ok(prototype \&decompose, '$;$'); +ok(prototype \&reorder, '$'); +ok(prototype \&compose, '$'); +ok(prototype \&composeContiguous, '$'); + +# 27 + +ok(prototype \&getCanon, '$'); +ok(prototype \&getCompat, '$'); +ok(prototype \&getComposite, '$$'); +ok(prototype \&getCombinClass,'$'); +ok(prototype \&isExclusion, '$'); +ok(prototype \&isSingleton, '$'); +ok(prototype \&isNonStDecomp, '$'); +ok(prototype \&isComp2nd, '$'); +ok(prototype \&isComp_Ex, '$'); +ok(prototype \&isNFD_NO, '$'); +ok(prototype \&isNFC_NO, '$'); +ok(prototype \&isNFC_MAYBE, '$'); +ok(prototype \&isNFKD_NO, '$'); +ok(prototype \&isNFKC_NO, '$'); +ok(prototype \&isNFKC_MAYBE, '$'); +ok(prototype \&splitOnLastStarter, undef); +ok(prototype \&normalize_partial, '$$'); +ok(prototype \&NFD_partial, '$'); +ok(prototype \&NFC_partial, '$'); +ok(prototype \&NFKD_partial, '$'); +ok(prototype \&NFKC_partial, '$'); + +# 48 + diff --git a/dist/Unicode-Normalize/t/split.t b/dist/Unicode-Normalize/t/split.t new file mode 100644 index 0000000000..a92957c208 --- /dev/null +++ b/dist/Unicode-Normalize/t/split.t @@ -0,0 +1,147 @@ + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; + exit 0; + } + unless (0x41 == unpack('U', 'A')) { + print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +BEGIN { + unless (5.006001 <= $]) { + print "1..0 # skipped: Perl 5.6.1 or later". + " needed for this test\n"; + exit; + } +} + +######################### + +use strict; +use warnings; +BEGIN { $| = 1; print "1..34\n"; } +my $count = 0; +sub ok ($;$) { + my $p = my $r = shift; + if (@_) { + my $x = shift; + $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; + } + print $p ? "ok" : "not ok", ' ', ++$count, "\n"; +} + +use Unicode::Normalize qw(:all); + +ok(1); + +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub _unpack_U { Unicode::Normalize::unpack_U(@_) } + +######################### + +our $proc; # before the last starter +our $unproc; # the last starter and after +# If string has no starter, entire string is set to $unproc. + +($proc, $unproc) = splitOnLastStarter(""); +ok($proc, ""); +ok($unproc, ""); + +($proc, $unproc) = splitOnLastStarter("A"); +ok($proc, ""); +ok($unproc, "A"); + +($proc, $unproc) = splitOnLastStarter(_pack_U(0x41, 0x300, 0x327, 0x42)); +ok($proc, _pack_U(0x41, 0x300, 0x327)); +ok($unproc, "B"); + +($proc, $unproc) = splitOnLastStarter(_pack_U(0x4E00, 0x41, 0x301)); +ok($proc, _pack_U(0x4E00)); +ok($unproc, _pack_U(0x41, 0x301)); + +($proc, $unproc) = splitOnLastStarter(_pack_U(0x302, 0x301, 0x300)); +ok($proc, ""); +ok($unproc, _pack_U(0x302, 0x301, 0x300)); + +our $ka_grave = _pack_U(0x41, 0, 0x42, 0x304B, 0x300); +our $dakuten = _pack_U(0x3099); +our $ga_grave = _pack_U(0x41, 0, 0x42, 0x304C, 0x300); + +our ($p, $u) = splitOnLastStarter($ka_grave); +our $concat = $p . NFC($u.$dakuten); + +ok(NFC($ka_grave.$dakuten) eq $ga_grave); +ok(NFC($ka_grave).NFC($dakuten) ne $ga_grave); +ok($concat eq $ga_grave); + +# 14 + +sub arraynorm { + my $form = shift; + my @string = @_; + my $result = ""; + my $unproc = ""; + foreach my $str (@string) { + $unproc .= $str; + my $n = normalize($form, $unproc); + my($p, $u) = splitOnLastStarter($n); + $result .= $p; + $unproc = $u; + } + $result .= $unproc; + return $result; +} + +my $strD = "\x{3C9}\x{301}\x{1100}\x{1161}\x{11A8}\x{1100}\x{1161}\x{11AA}"; +my $strC = "\x{3CE}\x{AC01}\x{AC03}"; +my @str1 = (substr($strD,0,3), substr($strD,3,4), substr($strD,7)); +my @str2 = (substr($strD,0,1), substr($strD,1,3), substr($strD,4)); +ok($strC eq NFC($strD)); +ok($strD eq join('', @str1)); +ok($strC eq arraynorm('NFC', @str1)); +ok($strD eq join('', @str2)); +ok($strC eq arraynorm('NFC', @str2)); + +my @strX = ("\x{300}\x{AC00}", "\x{11A8}"); +my $strX = "\x{300}\x{AC01}"; +ok($strX eq NFC(join('', @strX))); +ok($strX eq arraynorm('NFC', @strX)); +ok($strX eq NFKC(join('', @strX))); +ok($strX eq arraynorm('NFKC', @strX)); + +my @strY = ("\x{304B}\x{0308}", "\x{0323}\x{3099}"); +my $strY = ("\x{304C}\x{0323}\x{0308}"); +ok($strY eq NFC(join('', @strY))); +ok($strY eq arraynorm('NFC', @strY)); +ok($strY eq NFKC(join('', @strY))); +ok($strY eq arraynorm('NFKC', @strY)); + +my @strZ = ("\x{304B}\x{0308}", "\x{0323}", "\x{3099}"); +my $strZ = ("\x{304B}\x{3099}\x{0323}\x{0308}"); +ok($strZ eq NFD(join('', @strZ))); +ok($strZ eq arraynorm('NFD', @strZ)); +ok($strZ eq NFKD(join('', @strZ))); +ok($strZ eq arraynorm('NFKD', @strZ)); + +# 31 + +# don't modify the source + +my $source = "ABC"; +($proc, $unproc) = splitOnLastStarter($source); +ok($proc, "AB"); +ok($unproc, "C"); +ok($source, "ABC"); + +# 34 + diff --git a/dist/Unicode-Normalize/t/test.t b/dist/Unicode-Normalize/t/test.t new file mode 100644 index 0000000000..cb4b6ea637 --- /dev/null +++ b/dist/Unicode-Normalize/t/test.t @@ -0,0 +1,168 @@ + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; + exit 0; + } + unless (0x41 == unpack('U', 'A')) { + print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +######################### + +use strict; +use warnings; +BEGIN { $| = 1; print "1..72\n"; } +my $count = 0; +sub ok ($;$) { + my $p = my $r = shift; + if (@_) { + my $x = shift; + $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; + } + print $p ? "ok" : "not ok", ' ', ++$count, "\n"; +} + +use Unicode::Normalize; + +ok(1); + +sub _pack_U { Unicode::Normalize::pack_U(@_) } +sub _unpack_U { Unicode::Normalize::unpack_U(@_) } + +######################### + +ok(NFD(""), ""); +ok(NFC(""), ""); +ok(NFKD(""), ""); +ok(NFKC(""), ""); + +ok(NFD("A"), "A"); +ok(NFC("A"), "A"); +ok(NFKD("A"), "A"); +ok(NFKC("A"), "A"); + +# 9 + +# don't modify the source +my $sNFD = "\x{FA19}"; +ok(NFD($sNFD), "\x{795E}"); +ok($sNFD, "\x{FA19}"); + +my $sNFC = "\x{FA1B}"; +ok(NFC($sNFC), "\x{798F}"); +ok($sNFC, "\x{FA1B}"); + +my $sNFKD = "\x{FA1E}"; +ok(NFKD($sNFKD), "\x{7FBD}"); +ok($sNFKD, "\x{FA1E}"); + +my $sNFKC = "\x{FA26}"; +ok(NFKC($sNFKC), "\x{90FD}"); +ok($sNFKC, "\x{FA26}"); + +# 17 + +sub hexNFC { + join " ", map sprintf("%04X", $_), + _unpack_U NFC _pack_U map hex, split ' ', shift; +} +sub hexNFD { + join " ", map sprintf("%04X", $_), + _unpack_U NFD _pack_U map hex, split ' ', shift; +} + +ok(hexNFD("1E14 AC01"), "0045 0304 0300 1100 1161 11A8"); +ok(hexNFD("AC00 AE00"), "1100 1161 1100 1173 11AF"); + +ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062"); +ok(hexNFC("00E0 05AE 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); +ok(hexNFC("0061 05AE 0300 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); +ok(hexNFC("0045 0304 0300 AC00 11A8"), "1E14 AC01"); +ok(hexNFC("1100 1161 1100 1173 11AF"), "AC00 AE00"); +ok(hexNFC("1100 0300 1161 1173 11AF"), "1100 0300 1161 1173 11AF"); + +ok(hexNFD("0061 0315 0300 05AE 05C4 0062"), "0061 05AE 0300 05C4 0315 0062"); +ok(hexNFD("00E0 05AE 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062"); +ok(hexNFD("0061 05AE 0300 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062"); +ok(hexNFC("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFC("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFD("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000"); +ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000"); + +ok(hexNFC("AC00 11A7"), "AC00 11A7"); +ok(hexNFC("AC00 11A8"), "AC01"); +ok(hexNFC("AC00 11A9"), "AC02"); +ok(hexNFC("AC00 11C2"), "AC1B"); +ok(hexNFC("AC00 11C3"), "AC00 11C3"); + +# 39 + +# Test Cases from Public Review Issue #29: Normalization Issue +# cf. http://www.unicode.org/review/pr-29.html +ok(hexNFC("0B47 0300 0B3E"), "0B47 0300 0B3E"); +ok(hexNFC("1100 0300 1161"), "1100 0300 1161"); +ok(hexNFC("0B47 0B3E 0300"), "0B4B 0300"); +ok(hexNFC("1100 1161 0300"), "AC00 0300"); +ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327"); +ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327"); + +ok(hexNFC("0300 0041"), "0300 0041"); +ok(hexNFC("0300 0301 0041"), "0300 0301 0041"); +ok(hexNFC("0301 0300 0041"), "0301 0300 0041"); +ok(hexNFC("0000 0300 0000 0301"), "0000 0300 0000 0301"); +ok(hexNFC("0000 0301 0000 0300"), "0000 0301 0000 0300"); + +ok(hexNFC("0327 0061 0300"), "0327 00E0"); +ok(hexNFC("0301 0061 0300"), "0301 00E0"); +ok(hexNFC("0315 0061 0300"), "0315 00E0"); +ok(hexNFC("0000 0327 0061 0300"), "0000 0327 00E0"); +ok(hexNFC("0000 0301 0061 0300"), "0000 0301 00E0"); +ok(hexNFC("0000 0315 0061 0300"), "0000 0315 00E0"); + +# 56 + +# NFC() and NFKC() should be unary. +my $str11 = _pack_U(0x41, 0x0302, 0x0301, 0x62); +my $str12 = _pack_U(0x1EA4, 0x62); +ok(NFC $str11 eq $str12); +ok(NFKC $str11 eq $str12); + +# NFD() and NFKD() should be unary. +my $str21 = _pack_U(0xE0, 0xAC00); +my $str22 = _pack_U(0x61, 0x0300, 0x1100, 0x1161); +ok(NFD $str21 eq $str22); +ok(NFKD $str21 eq $str22); + +# 60 + +## Bug #53197: NFKC("\x{2000}") produces... + +ok(NFKC("\x{2002}") eq ' '); +ok(NFKD("\x{2002}") eq ' '); +ok(NFKC("\x{2000}") eq ' '); +ok(NFKD("\x{2000}") eq ' '); + +ok(NFKC("\x{210C}") eq 'H'); +ok(NFKD("\x{210C}") eq 'H'); +ok(NFKC("\x{210D}") eq 'H'); +ok(NFKD("\x{210D}") eq 'H'); + +ok(NFC("\x{F907}") eq "\x{9F9C}"); +ok(NFD("\x{F907}") eq "\x{9F9C}"); +ok(NFKC("\x{F907}") eq "\x{9F9C}"); +ok(NFKD("\x{F907}") eq "\x{9F9C}"); + +# 72 + diff --git a/dist/Unicode-Normalize/t/tie.t b/dist/Unicode-Normalize/t/tie.t new file mode 100644 index 0000000000..4fdd121e07 --- /dev/null +++ b/dist/Unicode-Normalize/t/tie.t @@ -0,0 +1,82 @@ + +BEGIN { + unless ('A' eq pack('U', 0x41)) { + print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n"; + exit 0; + } + unless (0x41 == unpack('U', 'A')) { + print "1..0 # Unicode::Normalize cannot get a Unicode code point\n"; + exit 0; + } +} + +BEGIN { + if ($ENV{PERL_CORE}) { + chdir('t') if -d 't'; + @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib); + } +} + +######################### + +BEGIN { + use Unicode::Normalize qw(:all); + + unless (exists &Unicode::Normalize::bootstrap or 5.008 <= $]) { + print "1..0 # skipped: XSUB, or Perl 5.8.0 or later". + " needed for this test\n"; + print $@; + exit; + } +} + +use strict; +use warnings; +BEGIN { $| = 1; print "1..17\n"; } +my $count = 0; +sub ok ($;$) { + my $p = my $r = shift; + if (@_) { + my $x = shift; + $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x; + } + print $p ? "ok" : "not ok", ' ', ++$count, "\n"; +} + +ok(1); + +package tiescalar; +sub TIESCALAR { + my ($class, $instance) = @_; + return bless \$instance => $class; +} +sub FETCH { return ${$_[0]}++ } +sub STORE { return ${$_[0]} = $_[1] } +sub DESTROY { undef ${$_[0]} } + +######################### + +package main; + +tie my $tie1, 'tiescalar', "123"; +ok(NFD($tie1), 123); +ok(NFC($tie1), 124); +ok(NFKD($tie1), 125); +ok(NFKC($tie1), 126); +ok(FCD($tie1), 127); +ok(FCC($tie1), 128); + +tie my $tie2, 'tiescalar', "256"; +ok(normalize('NFD', $tie2), 256); +ok(normalize('NFC', $tie2), 257); +ok(normalize('NFKD', $tie2), 258); +ok(normalize('NFKC', $tie2), 259); +ok(normalize('FCD', $tie2), 260); +ok(normalize('FCC', $tie2), 261); + +tie my $tie3, 'tiescalar', "315"; +ok(decompose($tie3), 315); +ok(reorder($tie3), 316); +ok(compose($tie3), 317); +ok(composeContiguous($tie3), 318); + |