summaryrefslogtreecommitdiff
path: root/ext
diff options
context:
space:
mode:
author Rafael Garcia-Suarez <rgarciasuarez@gmail.com> 2005-04-05 15:23:47 +0000
committer Rafael Garcia-Suarez <rgarciasuarez@gmail.com> 2005-04-05 15:23:47 +0000
commit 628bbff0c15c13c4704c1f63e1b8dac4c47eb639 (patch)
tree 1d55f9a355720ba8a4b08fead7a4d4011f7cd64d /ext
parent 0723351e0aae3b4ed046fabd41bf188a3d6a77df (diff)
download perl-628bbff0c15c13c4704c1f63e1b8dac4c47eb639.tar.gz
Upgrade to Unicode::Normalize 0.32
p4raw-id: //depot/perl@24166
Diffstat (limited to 'ext')
-rw-r--r-- ext/Unicode/Normalize/Changes 14
-rw-r--r-- ext/Unicode/Normalize/Makefile.PL 17
-rw-r--r-- ext/Unicode/Normalize/Normalize.pm 116
-rw-r--r-- ext/Unicode/Normalize/Normalize.xs 2
-rw-r--r-- ext/Unicode/Normalize/mkheader 26
-rw-r--r-- ext/Unicode/Normalize/t/form.t 4
-rw-r--r-- ext/Unicode/Normalize/t/norm.t 18
-rw-r--r-- ext/Unicode/Normalize/t/test.t 22
8 files changed, 175 insertions, 44 deletions
diff --git a/ext/Unicode/Normalize/Changes b/ext/Unicode/Normalize/Changes
index bb1b6930e0..9c0271b61f 100644
--- a/ext/Unicode/Normalize/Changes
+++ b/ext/Unicode/Normalize/Changes
@@ -1,8 +1,18 @@
Revision history for Perl extension Unicode::Normalize.
+0.32 Tue Apr 5 22:47:09 2005
+ - Some literal and grammatical errors in POD are fixed.
+
+0.31 Tue Apr 5 21:43:20 2005
+ - CAVEATS in POD is added.
+ - Some test cases from Unicode Public Review Issue #29
+ (Normalization Issue) are added to norm.t and test.t.
+ - do 'mkheader' now returns true so that Makefile.PL can catch errors.
+ - META.yml is added.
+
0.30 Sun May 2 14:35:00 2004
- - XSUB: (perl 5.8.1 or later) improved utf8 upgrade of non-POK (private POK)
- values like tied scalars, overloaded objects, etc.
+ - XSUB: (perl 5.8.1 or later) improved utf8 upgrade of non-POK
+ (private POK) values like tied scalars, overloaded objects, etc.
0.28 Sat Nov 22 23:46:24 2003
- XSUB: even if string contains a malformed, "short" Unicode character,
diff --git a/ext/Unicode/Normalize/Makefile.PL b/ext/Unicode/Normalize/Makefile.PL
index 2f37b62c4d..ae920dc5ab 100644
--- a/ext/Unicode/Normalize/Makefile.PL
+++ b/ext/Unicode/Normalize/Makefile.PL
@@ -1,3 +1,4 @@
+require 5.006001;
use ExtUtils::MakeMaker;
my $clean = {};
@@ -5,13 +6,25 @@ my $clean = {};
if (-f "Normalize.xs") {
print STDERR "Making header files for XS...\n";
- do "mkheader";
+ do 'mkheader' or die $@ || "mkheader: $!";
+
$clean = { FILES => 'unfcan.h unfcmb.h unfcmp.h unfcpt.h unfexc.h' };
}
WriteMakefile(
- 'INSTALLDIRS' => $] >= 5.007 ? 'perl' : 'site',
+ 'INSTALLDIRS' => $] >= 5.007002 ? 'perl' : 'site',
'NAME' => 'Unicode::Normalize',
'VERSION_FROM' => 'Normalize.pm', # finds $VERSION
'clean' => $clean,
+ 'PREREQ_PM' => {
+ Carp => 0,
+ constant => 0,
+ DynaLoader => 0,
+ Exporter => 0,
+ File::Copy => 0,
+ File::Spec => 0,
+ strict => 0,
+ Test => 0,
+ warnings => 0,
+ },
);
diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm
index 09ef371cc8..8f5f4ccef4 100644
--- a/ext/Unicode/Normalize/Normalize.pm
+++ b/ext/Unicode/Normalize/Normalize.pm
@@ -13,7 +13,7 @@ use Carp;
no warnings 'utf8';
-our $VERSION = '0.30';
+our $VERSION = '0.32';
our $PACKAGE = __PACKAGE__;
require Exporter;
@@ -81,7 +81,7 @@ sub normalize($$)
{
my $form = shift;
my $str = shift;
- return exists $formNorm{$form}
+ return exists $formNorm{$form}
? $formNorm{$form}->($str)
: croak $PACKAGE."::normalize: invalid form name: $form";
}
@@ -103,7 +103,7 @@ sub check($$)
{
my $form = shift;
my $str = shift;
- return exists $formCheck{$form}
+ return exists $formCheck{$form}
? $formCheck{$form}->($str)
: croak $PACKAGE."::check: invalid form name: $form";
}
@@ -145,9 +145,9 @@ C<$string> is used as a string under character semantics
C<$codepoint> should be an unsigned integer
representing a Unicode code point.
-Note: Between XS edition and pure Perl edition,
-interpretation of C<$codepoint> as a decimal number has incompatibility.
-XS converts C<$codepoint> to an unsigned integer, but pure Perl does not.
+Note: Between XSUB and pure Perl, there is an incompatibility
+about the interpretation of C<$codepoint> as a decimal number.
+XSUB converts C<$codepoint> to an unsigned integer, but pure Perl does not.
Do not use a floating-point value or a negative sign in C<$codepoint>.
=head2 Normalization Forms
@@ -259,36 +259,48 @@ The result returned will be:
=item C<$result = checkNFD($string)>
-returns C<YES> (C<1>) or C<NO> (C<empty string>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
=item C<$result = checkNFC($string)>
-returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
+C<undef> if C<MAYBE>.
=item C<$result = checkNFKD($string)>
-returns C<YES> (C<1>) or C<NO> (C<empty string>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
=item C<$result = checkNFKC($string)>
-returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
+C<undef> if C<MAYBE>.
=item C<$result = checkFCD($string)>
-returns C<YES> (C<1>) or C<NO> (C<empty string>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
=item C<$result = checkFCC($string)>
-returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
+C<undef> if C<MAYBE>.
If a string is not in FCD, it cannot be in FCC either.
So C<checkFCC($not_FCD_string)> should return C<NO>.
=item C<$result = check($form_name, $string)>
-returns C<YES> (C<1>), C<NO> (C<empty string>), or C<MAYBE> (C<undef>).
+returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
+C<undef> if C<MAYBE>.
-C<$form_name> is alike to that for C<normalize()>.
+As C<$form_name>, one of the following names must be given.
+
+ 'C' or 'NFC' for Normalization Form C (UAX #15)
+ 'D' or 'NFD' for Normalization Form D (UAX #15)
+ 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
+ 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
+
+ 'FCD' for "Fast C or D" Form (UTN #5)
+ 'FCC' for "Fast C Contiguous" (UTN #5)
=back
@@ -308,12 +320,19 @@ C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
-If you want to check exactly, compare the string with its NFC/NFKC/FCC;
-i.e.,
+If you want to check exactly, compare the string with its NFC/NFKC/FCC.
+
+ if ($string eq NFC($string)) {
+ # $string is exactly normalized in NFC;
+ } else {
+ # $string is not normalized in NFC;
+ }
- $string eq NFC($string) # thorough than checkNFC($string)
- $string eq NFKC($string) # thorough than checkNFKC($string)
- $string eq FCC($string) # thorough than checkFCC($string)
+ if ($string eq NFKC($string)) {
+ # $string is exactly normalized in NFKC;
+ } else {
+ # $string is not normalized in NFKC;
+ }
=head2 Character Data
@@ -376,22 +395,60 @@ Composition Exclusions and Non-Starter Decompositions).
=back
-=head2 EXPORT
+=head1 EXPORT
C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
C<normalize> and some other functions: on request.
+=head1 CAVEATS
+
+=over 4
+
+=item Perl's version vs. Unicode version
+
+Since this module refers to perl core's Unicode database in the directory
+F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
+normalization implemented by this module depends on your perl's version.
+
+ perl's version implemented Unicode version
+ 5.6.1 3.0.1
+ 5.7.2 3.1.0
+ 5.7.3 3.1.1 (same normalized form as that of 3.1.0)
+ 5.8.0 3.2.0
+ 5.8.1-5.8.3 4.0.0
+ 5.8.4-5.8.6 (latest) 4.0.1 (same normalized form as that of 4.0.0)
+
+=item Correction of decomposition mapping
+
+In older Unicode versions, a small number of characters (all of which are
+CJK compatibility ideographs as far as they have been found) may have
+an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
+However, this module neither refers to F<NormalizationCorrections.txt>
+nor provides any specific version of normalization. Therefore this module
+running on an older perl with an older Unicode database may use
+the erroneous decomposition mapping blindly conforming to the Unicode database.
+
+=item Revised definition of canonical composition
+
+In Unicode 4.1.0, the definition D2 of canonical composition (which
+affects NFC and NFKC) has been changed (see Public Review Issue #29
+and recent UAX #15). This module has used the newer definition
+since version 0.07 (Oct 31, 2001).
+This module does not support normalization according to the older
+definition, even if the Unicode version implemented by perl is
+lower than 4.1.0.
+
+=back
+
=head1 AUTHOR
SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
- http://homepage1.nifty.com/nomenclator/perl/
-
- Copyright(C) 2001-2004, SADAHIRO Tomoyuki. Japan. All rights reserved.
+Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved.
- This module is free software; you can redistribute it
- and/or modify it under the same terms as Perl itself.
+This module is free software; you can redistribute it
+and/or modify it under the same terms as Perl itself.
=head1 SEE ALSO
@@ -405,6 +462,14 @@ Unicode Normalization Forms - UAX #15
Derived Normalization Properties
+=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
+
+Normalization Corrections
+
+=item http://www.unicode.org/review/pr-29.html
+
+Public Review Issue #29: Normalization Issue
+
=item http://www.unicode.org/notes/tn5/
Canonical Equivalence in Applications - UTN #5
@@ -412,4 +477,3 @@ Canonical Equivalence in Applications - UTN #5
=back
=cut
-
diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs
index 13544c9240..7398ce039a 100644
--- a/ext/Unicode/Normalize/Normalize.xs
+++ b/ext/Unicode/Normalize/Normalize.xs
@@ -23,7 +23,7 @@
/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
#ifdef UTF8_ALLOW_BOM
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
-#else
+#else
#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
#endif
diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader
index 4da4d07528..ff30759c95 100644
--- a/ext/Unicode/Normalize/mkheader
+++ b/ext/Unicode/Normalize/mkheader
@@ -1,13 +1,22 @@
#!perl
#
-# This script generates "unfcan.h", "unfcpt.h", "unfcmb.h",
-# "unfcmp.h", and "unfexc.h"
-# from CombiningClass.pl, Decomposition.pl, CompositionExclusions.txt
-# in lib/unicore or unicode directory
-# for Unicode::Normalize.xs. (cf. Makefile.PL)
+# This auxiliary script makes five header files
+# used for building XSUB of Unicode::Normalize.
#
-# Usage: <perl mkheader> in command line
-# or <do 'mkheader'> in perl
+# Usage:
+# <do 'mkheader'> in perl, or <perl mkheader> in command line
+#
+# Input files:
+# unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
+# unicore/Decomposition.pl (or unicode/Decomposition.pl)
+# unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
+#
+# Output files:
+# unfcan.h
+# unfcpt.h
+# unfcmb.h
+# unfcmp.h
+# unfexc.h
#
use 5.006;
use strict;
@@ -197,7 +206,7 @@ foreach my $key (keys %Canon) {
}
# exhaustive decomposition
-foreach my $key (keys %Compat) {
+foreach my $key (keys %Compat) {
$Compat{$key} = [ getCompatList($key) ];
}
@@ -387,4 +396,5 @@ EOF
close FH;
}
+1;
__END__
diff --git a/ext/Unicode/Normalize/t/form.t b/ext/Unicode/Normalize/t/form.t
index 4e9b885aa9..27cd177596 100644
--- a/ext/Unicode/Normalize/t/form.t
+++ b/ext/Unicode/Normalize/t/form.t
@@ -9,8 +9,8 @@ BEGIN {
BEGIN {
if ($ENV{PERL_CORE}) {
- chdir('t') if -d 't';
- @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
+ chdir('t') if -d 't';
+ @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
}
}
diff --git a/ext/Unicode/Normalize/t/norm.t b/ext/Unicode/Normalize/t/norm.t
index 90c037a0dc..a9399075ba 100644
--- a/ext/Unicode/Normalize/t/norm.t
+++ b/ext/Unicode/Normalize/t/norm.t
@@ -19,7 +19,7 @@ BEGIN {
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 18 };
+BEGIN { plan tests => 29 };
use Unicode::Normalize qw(normalize);
ok(1); # If we made it this far, we're ok.
@@ -57,3 +57,19 @@ ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062");
ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000");
ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000");
+ok(hexNFC("AC00 11A7"), "AC00 11A7");
+ok(hexNFC("AC00 11A8"), "AC01");
+ok(hexNFC("AC00 11A9"), "AC02");
+ok(hexNFC("AC00 11C2"), "AC1B");
+ok(hexNFC("AC00 11C3"), "AC00 11C3");
+
+# Test Cases from Public Review Issue #29: Normalization Issue
+# cf. http://www.unicode.org/review/pr-29.html
+ok(hexNFC("0B47 0300 0B3E"), "0B47 0300 0B3E");
+ok(hexNFC("1100 0300 1161"), "1100 0300 1161");
+
+ok(hexNFC("0B47 0B3E 0300"), "0B4B 0300");
+ok(hexNFC("1100 1161 0300"), "AC00 0300");
+
+ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327");
+ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327");
diff --git a/ext/Unicode/Normalize/t/test.t b/ext/Unicode/Normalize/t/test.t
index c5ebf3a119..8e3369f58a 100644
--- a/ext/Unicode/Normalize/t/test.t
+++ b/ext/Unicode/Normalize/t/test.t
@@ -19,7 +19,7 @@ BEGIN {
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 20 };
+BEGIN { plan tests => 31 };
use Unicode::Normalize;
ok(1); # If we made it this far, we're ok.
@@ -57,11 +57,29 @@ ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062");
ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000");
ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000");
-# should be unary.
+ok(hexNFC("AC00 11A7"), "AC00 11A7");
+ok(hexNFC("AC00 11A8"), "AC01");
+ok(hexNFC("AC00 11A9"), "AC02");
+ok(hexNFC("AC00 11C2"), "AC1B");
+ok(hexNFC("AC00 11C3"), "AC00 11C3");
+
+# Test Cases from Public Review Issue #29: Normalization Issue
+# cf. http://www.unicode.org/review/pr-29.html
+ok(hexNFC("0B47 0300 0B3E"), "0B47 0300 0B3E");
+ok(hexNFC("1100 0300 1161"), "1100 0300 1161");
+
+ok(hexNFC("0B47 0B3E 0300"), "0B4B 0300");
+ok(hexNFC("1100 1161 0300"), "AC00 0300");
+
+ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327");
+ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327");
+
+# NFC() should be unary.
my $str11 = _pack_U(0x41, 0x0302, 0x0301, 0x62);
my $str12 = _pack_U(0x1EA4, 0x62);
ok(NFC $str11 eq $str12);
+# NFD() should be unary.
my $str21 = _pack_U(0xE0, 0xAC00);
my $str22 = _pack_U(0x61, 0x0300, 0x1100, 0x1161);
ok(NFD $str21 eq $str22);