summaryrefslogtreecommitdiff
path: root/ext/Unicode
diff options
context:
space:
mode:
authorSteve Peters <steve@fisharerojo.org>2006-06-13 01:00:02 +0000
committerSteve Peters <steve@fisharerojo.org>2006-06-13 01:00:02 +0000
commitfe067ad959549a513d3f99948bd05deb85d6e222 (patch)
treec9470701aeee4c654a2b3275487e309269edf264 /ext/Unicode
parent283d8f99b00e13e7093e982ec62814fc64bdc2ff (diff)
downloadperl-fe067ad959549a513d3f99948bd05deb85d6e222.tar.gz
Upgrade to Unicode-Normalize-1.00
p4raw-id: //depot/perl@28389
Diffstat (limited to 'ext/Unicode')
-rw-r--r--ext/Unicode/Normalize/Changes17
-rw-r--r--ext/Unicode/Normalize/Normalize.pm229
-rw-r--r--ext/Unicode/Normalize/Normalize.xs658
-rw-r--r--ext/Unicode/Normalize/README4
-rw-r--r--ext/Unicode/Normalize/mkheader6
-rw-r--r--ext/Unicode/Normalize/t/fcdc.t92
-rw-r--r--ext/Unicode/Normalize/t/func.t387
-rw-r--r--ext/Unicode/Normalize/t/norm.t54
-rw-r--r--ext/Unicode/Normalize/t/null.t50
-rw-r--r--ext/Unicode/Normalize/t/test.t48
10 files changed, 1052 insertions, 493 deletions
diff --git a/ext/Unicode/Normalize/Changes b/ext/Unicode/Normalize/Changes
index 9c0271b61f..8d05a3dfd6 100644
--- a/ext/Unicode/Normalize/Changes
+++ b/ext/Unicode/Normalize/Changes
@@ -1,5 +1,16 @@
Revision history for Perl extension Unicode::Normalize.
+1.00 Thu May 25 20:35:06 2006
+ - Pure Perl: compose($not_canonically_reordered) works like that in XSUB,
+ where an intervening character with higher combining class blocks
+ the composition. (This change doesn't affect any normalization forms.)
+ - XSUB: NFD(), NFC(), NFKD(), NFKC(), and FCC() are now in XSUB, then
+ internal subroutine calls are avoided.
+ - The functions isComp_Ex(), isNFD_NO(), isNFC_NO(), isNFC_MAYBE(),
+ isNFKD_NO(), isNFKC_NO(), and isNFKC_MAYBE() are documented.
+ - Tests are extended and the documentation is clarified.
+ - Makefile.PL: Change 26295 is incorporated.
+
0.32 Tue Apr 5 22:47:09 2005
- Some literal and grammatical errors in POD are fixed.
@@ -62,13 +73,13 @@ Revision history for Perl extension Unicode::Normalize.
0.18 ... unreleased
- synchronization with bleadperl.
- - Change 16262: by me
+ - Change 16262: by sadahiro
0.17 Sun Apr 28 23:13:32 2002
- now normalize('NFC',$1) should work.
- Some croak()'s are added in mkheader.
- synchronization with bleadperl.
- - Change 15596: by me
+ - Change 15596: by sadahiro
- Change 16136: by pudge
0.16 Thu Mar 21 13:36:14 2002
@@ -85,7 +96,7 @@ Revision history for Perl extension Unicode::Normalize.
- synchronization with bleadperl.
- Change 14128: by Arthur
- Change 14129: by jhi
- - Change 14156:
+ - Change 14156: by sadahiro
- Change 14199: by Nikola Knezevic
- Change 14308: by Benjamin Goldberg
- Change 14370: by jhi
diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm
index 8f5f4ccef4..16d7664b33 100644
--- a/ext/Unicode/Normalize/Normalize.pm
+++ b/ext/Unicode/Normalize/Normalize.pm
@@ -13,7 +13,7 @@ use Carp;
no warnings 'utf8';
-our $VERSION = '0.32';
+our $VERSION = '1.00';
our $PACKAGE = __PACKAGE__;
require Exporter;
@@ -43,12 +43,16 @@ bootstrap Unicode::Normalize $VERSION;
######
+##
+## utilities for tests
+##
+
sub pack_U {
return pack('U*', @_);
}
sub unpack_U {
- return unpack('U*', pack('U*').shift);
+ return unpack('U*', shift(@_).pack('U*'));
}
@@ -56,18 +60,10 @@ sub unpack_U {
## normalization forms
##
-use constant COMPAT => 1;
-
-sub NFD ($) { reorder(decompose($_[0])) }
-sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
-sub NFC ($) { compose(reorder(decompose($_[0]))) }
-sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
-
sub FCD ($) {
my $str = shift;
return checkFCD($str) ? $str : NFD($str);
}
-sub FCC ($) { composeContiguous(reorder(decompose($_[0]))) }
our %formNorm = (
NFC => \&NFC, C => \&NFC,
@@ -81,9 +77,10 @@ sub normalize($$)
{
my $form = shift;
my $str = shift;
- return exists $formNorm{$form}
- ? $formNorm{$form}->($str)
- : croak $PACKAGE."::normalize: invalid form name: $form";
+ if (exists $formNorm{$form}) {
+ return $formNorm{$form}->($str);
+ }
+ croak($PACKAGE."::normalize: invalid form name: $form");
}
@@ -103,9 +100,10 @@ sub check($$)
{
my $form = shift;
my $str = shift;
- return exists $formCheck{$form}
- ? $formCheck{$form}->($str)
- : croak $PACKAGE."::check: invalid form name: $form";
+ if (exists $formCheck{$form}) {
+ return $formCheck{$form}->($str);
+ }
+ croak($PACKAGE."::check: invalid form name: $form");
}
1;
@@ -139,16 +137,14 @@ Unicode::Normalize - Unicode Normalization Forms
Parameters:
-C<$string> is used as a string under character semantics
-(see F<perlunicode>).
+C<$string> is used as a string under character semantics (see F<perlunicode>).
-C<$codepoint> should be an unsigned integer
-representing a Unicode code point.
+C<$code_point> should be an unsigned integer representing a Unicode code point.
Note: Between XSUB and pure Perl, there is an incompatibility
-about the interpretation of C<$codepoint> as a decimal number.
-XSUB converts C<$codepoint> to an unsigned integer, but pure Perl does not.
-Do not use a floating point nor a negative sign in C<$codepoint>.
+about the interpretation of C<$code_point> as a decimal number.
+XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
+Do not use a floating point nor a negative sign in C<$code_point>.
=head2 Normalization Forms
@@ -156,38 +152,40 @@ Do not use a floating point nor a negative sign in C<$codepoint>.
=item C<$NFD_string = NFD($string)>
-returns the Normalization Form D (formed by canonical decomposition).
+It returns the Normalization Form D (formed by canonical decomposition).
=item C<$NFC_string = NFC($string)>
-returns the Normalization Form C (formed by canonical decomposition
+It returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).
=item C<$NFKD_string = NFKD($string)>
-returns the Normalization Form KD (formed by compatibility decomposition).
+It returns the Normalization Form KD (formed by compatibility decomposition).
=item C<$NFKC_string = NFKC($string)>
-returns the Normalization Form KC (formed by compatibility decomposition
+It returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).
=item C<$FCD_string = FCD($string)>
If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
-returns it without modification; otherwise returns an FCD string.
+it returns the string without modification; otherwise it returns an FCD string.
Note: FCD is not always unique, then plural forms may be equivalent
each other. C<FCD()> will return one of these equivalent forms.
=item C<$FCC_string = FCC($string)>
-returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
+It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
Note: FCC is unique, as well as four normalization forms (NF*).
=item C<$normalized_string = normalize($form_name, $string)>
+It returns the normalization form of C<$form_name>.
+
As C<$form_name>, one of the following names must be given.
'C' or 'NFC' for Normalization Form C (UAX #15)
@@ -204,39 +202,39 @@ As C<$form_name>, one of the following names must be given.
=over 4
-=item C<$decomposed_string = decompose($string)>
+=item C<$decomposed_string = decompose($string [, $useCompatMapping])>
-=item C<$decomposed_string = decompose($string, $useCompatMapping)>
+It returns the concatenation of the decomposition of each character
+in the string.
-Decomposes the specified string and returns the result.
+If the second parameter (a boolean) is omitted or false,
+the decomposition is canonical decomposition;
+if the second parameter (a boolean) is true,
+the decomposition is compatibility decomposition.
-If the second parameter (a boolean) is omitted or false, decomposes it
-using the Canonical Decomposition Mapping.
-If true, decomposes it using the Compatibility Decomposition Mapping.
-
-The string returned is not always in NFD/NFKD.
-Reordering may be required.
+The string returned is not always in NFD/NFKD. Reordering may be required.
$NFD_string = reorder(decompose($string)); # eq. to NFD()
$NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
-=item C<$reordered_string = reorder($string)>
+=item C<$reordered_string = reorder($string)>
-Reorders the combining characters and the like in the canonical ordering
-and returns the result.
+It returns the result of reordering the combining characters
+according to Canonical Ordering Behavior.
-E.g., when you have a list of NFD/NFKD strings,
-you can get the concatenated NFD/NFKD string from them, saying
+For example, when you have a list of NFD/NFKD strings,
+you can get the concatenated NFD/NFKD string from them, by saying
$concat_NFD = reorder(join '', @NFD_strings);
$concat_NFKD = reorder(join '', @NFKD_strings);
-=item C<$composed_string = compose($string)>
+=item C<$composed_string = compose($string)>
-Returns the string where composable pairs are composed.
+It returns the result of canonical composition
+without applying any decomposition.
-E.g., when you have a NFD/NFKD string,
-you can get its NFC/NFKC string, saying
+For example, when you have a NFD/NFKD string,
+you can get its NFC/NFKC string, by saying
$NFC_string = compose($NFD_string);
$NFKC_string = compose($NFKD_string);
@@ -249,7 +247,7 @@ you can get its NFC/NFKC string, saying
The following functions check whether the string is in that normalization form.
-The result returned will be:
+The result returned will be one of the following:
YES The string is in that normalization form.
NO The string is not in that normalization form.
@@ -259,37 +257,37 @@ The result returned will be:
=item C<$result = checkNFD($string)>
-returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
+It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
=item C<$result = checkNFC($string)>
-returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
+It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.
=item C<$result = checkNFKD($string)>
-returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
+It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
=item C<$result = checkNFKC($string)>
-returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
+It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.
=item C<$result = checkFCD($string)>
-returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
+It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
=item C<$result = checkFCC($string)>
-returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
+It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.
-If a string is not in FCD, it must not be in FCC.
+Note: If a string is not in FCD, it must not be in FCC.
So C<checkFCC($not_FCD_string)> should return C<NO>.
=item C<$result = check($form_name, $string)>
-returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
+It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.
As C<$form_name>, one of the following names must be given.
@@ -342,56 +340,92 @@ call them yourself.
=over 4
-=item C<$canonical_decomposed = getCanon($codepoint)>
+=item C<$canonical_decomposition = getCanon($code_point)>
-If the character of the specified codepoint is canonically
-decomposable (including Hangul Syllables),
-returns the B<completely decomposed> string canonically equivalent to it.
+If the character is canonically decomposable (including Hangul Syllables),
+it returns the (full) canonical decomposition as a string.
+Otherwise it returns C<undef>.
-If it is not decomposable, returns C<undef>.
+B<Note:> According to the Unicode standard, the canonical decomposition
+of a character that is not canonically decomposable is the same as
+the character itself.
-=item C<$compatibility_decomposed = getCompat($codepoint)>
+=item C<$compatibility_decomposition = getCompat($code_point)>
-If the character of the specified codepoint is compatibility
-decomposable (including Hangul Syllables),
-returns the B<completely decomposed> string compatibility equivalent to it.
+If the character is compatibility decomposable (including Hangul Syllables),
+it returns the (full) compatibility decomposition as a string.
+Otherwise it returns C<undef>.
-If it is not decomposable, returns C<undef>.
+B<Note:> According to the Unicode standard, the compatibility decomposition
+of a character that is not compatibility decomposable is the same as
+the character itself.
-=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
+=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
-If two characters here and next (as codepoints) are composable
+If two characters here and next (as code points) are composable
(including Hangul Jamo/Syllables and Composition Exclusions),
-returns the codepoint of the composite.
+it returns the code point of the composite.
+
+If they are not composable, it returns C<undef>.
-If they are not composable, returns C<undef>.
+=item C<$combining_class = getCombinClass($code_point)>
-=item C<$combining_class = getCombinClass($codepoint)>
+It returns the combining class (as an integer) of the character.
-Returns the combining class of the character as an integer.
+=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
-=item C<$is_exclusion = isExclusion($codepoint)>
+It returns a boolean whether the character of the specified code point
+may be composed with the previous one in a certain composition
+(including Hangul Compositions, but excluding
+Composition Exclusions and Non-Starter Decompositions).
-Returns a boolean whether the character of the specified codepoint
-is a composition exclusion.
+=item C<$is_exclusion = isExclusion($code_point)>
-=item C<$is_singleton = isSingleton($codepoint)>
+It returns a boolean whether the code point is a composition exclusion.
-Returns a boolean whether the character of the specified codepoint is
-a singleton.
+=item C<$is_singleton = isSingleton($code_point)>
-=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
+It returns a boolean whether the code point is a singleton.
-Returns a boolean whether the canonical decomposition
-of the character of the specified codepoint
-is a Non-Starter Decomposition.
+=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
-=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
+It returns a boolean whether the code point has Non-Starter Decomposition.
-Returns a boolean whether the character of the specified codepoint
-may be composed with the previous one in a certain composition
-(including Hangul Compositions, but excluding
-Composition Exclusions and Non-Starter Decompositions).
+=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
+
+It returns a boolean of the derived property Comp_Ex
+(Full_Composition_Exclusion). This property is generated from
+Composition Exclusions + Singletons + Non-Starter Decompositions.
+
+=item C<$NFD_is_NO = isNFD_NO($code_point)>
+
+It returns a boolean of the derived property NFD_NO
+(NFD_Quick_Check=No).
+
+=item C<$NFC_is_NO = isNFC_NO($code_point)>
+
+It returns a boolean of the derived property NFC_NO
+(NFC_Quick_Check=No).
+
+=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
+
+It returns a boolean of the derived property NFC_MAYBE
+(NFC_Quick_Check=Maybe).
+
+=item C<$NFKD_is_NO = isNFKD_NO($code_point)>
+
+It returns a boolean of the derived property NFKD_NO
+(NFKD_Quick_Check=No).
+
+=item C<$NFKC_is_NO = isNFKC_NO($code_point)>
+
+It returns a boolean of the derived property NFKC_NO
+(NFKC_Quick_Check=No).
+
+=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
+
+It returns a boolean of the derived property NFKC_MAYBE
+(NFKC_Quick_Check=Maybe).
=back
@@ -411,13 +445,14 @@ Since this module refers to perl core's Unicode database in the directory
F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
normalization implemented by this module depends on your perl's version.
- perl's version implemented Unicode version
- 5.6.1 3.0.1
- 5.7.2 3.1.0
- 5.7.3 3.1.1 (same normalized form as that of 3.1.0)
- 5.8.0 3.2.0
- 5.8.1-5.8.3 4.0.0
- 5.8.4-5.8.6 (latest) 4.0.1 (same normalized form as that of 4.0.0)
+ perl's version implemented Unicode version
+ 5.6.1 3.0.1
+ 5.7.2 3.1.0
+ 5.7.3 3.1.1 (normalization is same as 3.1.0)
+ 5.8.0 3.2.0
+ 5.8.1-5.8.3 4.0.0
+ 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0)
+ 5.8.7-5.8.8 4.1.0
=item Correction of decomposition mapping
@@ -445,7 +480,7 @@ lower than 4.1.0.
SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
-Copyright(C) 2001-2005, SADAHIRO Tomoyuki. Japan. All rights reserved.
+Copyright(C) 2001-2006, SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.
@@ -458,6 +493,10 @@ and/or modify it under the same terms as Perl itself.
Unicode Normalization Forms - UAX #15
+=item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
+
+Composition Exclusion Table
+
=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
Derived Normalization Properties
diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs
index 7398ce039a..e48ead9e2d 100644
--- a/ext/Unicode/Normalize/Normalize.xs
+++ b/ext/Unicode/Normalize/Normalize.xs
@@ -28,16 +28,29 @@
#endif
/* if utf8n_to_uvuni() sets retlen to 0 (?) */
-#define ErrRetlenIsZero "panic (Unicode::Normalize): zero-length character"
+#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
/* utf8_hop() hops back before start. Maybe broken UTF-8 */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
+/* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC;
+ according to Versioning and Stability in UAX#15, no new composition
+ should come in future. */
+#define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source"
+
+/* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */
+#define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough"
+
/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
-/* HANGUL_H */
+/* size of array for combining characters */
+/* enough as an initial value? */
+#define CC_SEQ_SIZE (10)
+#define CC_SEQ_STEP (5)
+
+/* HANGUL begin */
#define Hangul_SBase 0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount 11172
@@ -62,7 +75,7 @@
#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
-/* HANGUL_H */
+/* HANGUL end */
/* this is used for canonical ordering of combining characters (c.c.). */
typedef struct {
@@ -71,7 +84,7 @@ typedef struct {
STRLEN pos; /* position */
} UNF_cc;
-static int compare_cc (const void *a, const void *b)
+static int compare_cc(const void *a, const void *b)
{
int ret_cc;
ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
@@ -82,7 +95,7 @@ static int compare_cc (const void *a, const void *b)
- ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
}
-static U8* dec_canonical (UV uv)
+static U8* dec_canonical(UV uv)
{
U8 ***plane, **row;
if (OVER_UTF_MAX(uv))
@@ -94,7 +107,7 @@ static U8* dec_canonical (UV uv)
return row ? row[uv & 0xff] : NULL;
}
-static U8* dec_compat (UV uv)
+static U8* dec_compat(UV uv)
{
U8 ***plane, **row;
if (OVER_UTF_MAX(uv))
@@ -106,21 +119,22 @@ static U8* dec_compat (UV uv)
return row ? row[uv & 0xff] : NULL;
}
-static UV composite_uv (UV uv, UV uv2)
+static UV composite_uv(UV uv, UV uv2)
{
UNF_complist ***plane, **row, *cell, *i;
- if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
+ if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
return 0;
if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
- uv -= Hangul_LBase; /* lindex */
- uv2 -= Hangul_VBase; /* vindex */
- return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
+ UV lindex = uv - Hangul_LBase;
+ UV vindex = uv2 - Hangul_VBase;
+ return(Hangul_SBase + (lindex * Hangul_VCount + vindex) *
+ Hangul_TCount);
}
if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
- uv2 -= Hangul_TBase; /* tindex */
- return(uv + uv2);
+ UV tindex = uv2 - Hangul_TBase;
+ return(uv + tindex);
}
plane = UNF_compos[uv >> 16];
if (! plane)
@@ -138,7 +152,7 @@ static UV composite_uv (UV uv, UV uv2)
return 0;
}
-static U8 getCombinClass (UV uv)
+static U8 getCombinClass(UV uv)
{
U8 **plane, *row;
if (OVER_UTF_MAX(uv))
@@ -150,36 +164,21 @@ static U8 getCombinClass (UV uv)
return row ? row[uv & 0xff] : 0;
}
-static void sv_cat_decompHangul (SV* sv, UV uv)
+static U8* pv_cat_decompHangul(U8* d, UV uv)
{
- UV sindex, lindex, vindex, tindex;
- U8 *t, tmp[3 * UTF8_MAXLEN + 1];
+ UV sindex = uv - Hangul_SBase;
+ UV lindex = sindex / Hangul_NCount;
+ UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
+ UV tindex = sindex % Hangul_TCount;
if (! Hangul_IsS(uv))
- return;
+ return d;
- sindex = uv - Hangul_SBase;
- lindex = sindex / Hangul_NCount;
- vindex = (sindex % Hangul_NCount) / Hangul_TCount;
- tindex = sindex % Hangul_TCount;
-
- t = tmp;
- t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
- t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
+ d = uvuni_to_utf8(d, (lindex + Hangul_LBase));
+ d = uvuni_to_utf8(d, (vindex + Hangul_VBase));
if (tindex)
- t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
- *t = '\0';
- sv_catpvn(sv, (char *)tmp, t - tmp);
-}
-
-static void sv_cat_uvuni (SV* sv, UV uv)
-{
- U8 *t, tmp[UTF8_MAXLEN + 1];
-
- t = tmp;
- t = uvuni_to_utf8(t, uv);
- *t = '\0';
- sv_catpvn(sv, (char *)tmp, t - tmp);
+ d = uvuni_to_utf8(d, (tindex + Hangul_TBase));
+ return d;
}
static char * sv_2pvunicode(SV *sv, STRLEN *lp)
@@ -194,140 +193,305 @@ static char * sv_2pvunicode(SV *sv, STRLEN *lp)
sv_utf8_upgrade(tmpsv);
s = (char*)SvPV(tmpsv,len);
}
- *lp = len;
+ if (lp)
+ *lp = len;
return s;
}
-MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
-
-SV*
-decompose(src, compat = &PL_sv_no)
- SV * src
- SV * compat
- PROTOTYPE: $;$
- PREINIT:
- SV *dst;
- STRLEN srclen, retlen;
- U8 *s, *e, *p, *r;
- UV uv;
- bool iscompat;
- CODE:
- iscompat = SvTRUE(compat);
- s = (U8*)sv_2pvunicode(src,&srclen);
- e = s + srclen;
-
- dst = newSV(1);
- (void)SvPOK_only(dst);
- SvUTF8_on(dst);
-
- for (p = s; p < e; p += retlen) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+static
+U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
+{
+ U8* p = s;
+ U8* e = s + slen;
+ U8* dstart = *dp;
+ U8* d = dstart;
+
+ while (p < e) {
+ STRLEN retlen;
+ UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
- croak(ErrRetlenIsZero);
+ croak(ErrRetlenIsZero, "decompose");
+ p += retlen;
+
+ if (Hangul_IsS(uv)) {
+ STRLEN cur = d - dstart;
- if (Hangul_IsS(uv))
- sv_cat_decompHangul(dst, uv);
+ if (dlen < cur + UTF8_MAXLEN * 3) {
+ dlen += UTF8_MAXLEN * 3;
+ Renew(dstart, dlen+1, U8);
+ d = dstart + cur;
+ }
+ d = pv_cat_decompHangul(d, uv);
+ }
else {
- r = iscompat ? dec_compat(uv) : dec_canonical(uv);
- if (r)
- sv_catpv(dst, (char *)r);
- else
- sv_cat_uvuni(dst, uv);
+ U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
+
+ if (r) {
+ STRLEN len = (STRLEN)strlen((char *)r);
+ STRLEN cur = d - dstart;
+ if (dlen < cur + len) {
+ dlen += len;
+ Renew(dstart, dlen+1, U8);
+ d = dstart + cur;
+ }
+ while (len--)
+ *d++ = *r++;
+ }
+ else {
+ STRLEN cur = d - dstart;
+
+ if (dlen < cur + UTF8_MAXLEN) {
+ dlen += UTF8_MAXLEN;
+ Renew(dstart, dlen+1, U8);
+ d = dstart + cur;
+ }
+ d = uvuni_to_utf8(d, uv);
+ }
}
}
- RETVAL = dst;
- OUTPUT:
- RETVAL
+ *dp = dstart;
+ return d;
+}
+static
+U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen)
+{
+ U8* p = s;
+ U8* e = s + slen;
+ U8* dend = d + dlen;
+
+ UNF_cc seq_ary[CC_SEQ_SIZE];
+ UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
+ UNF_cc* seq_ext = NULL; /* extend if need */
+ STRLEN seq_max = CC_SEQ_SIZE;
+ STRLEN cc_pos = 0;
+
+ if (dlen < slen || dlen < slen + UTF8_MAXLEN)
+ croak(ErrTargetNotEnough, "reorder");
+ dend -= UTF8_MAXLEN; /* safety */
+
+ while (p < e) {
+ U8 curCC;
+ STRLEN retlen;
+ UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ if (!retlen)
+ croak(ErrRetlenIsZero, "reorder");
+ p += retlen;
+ curCC = getCombinClass(uv);
-SV*
-reorder(src)
- SV * src
- PROTOTYPE: $
- PREINIT:
- SV *dst;
- STRLEN srclen, dstlen, retlen, stk_cc_max;
- U8 *s, *e, *p, *d, curCC;
- UV uv, uvlast;
- UNF_cc * stk_cc;
- STRLEN i, cc_pos;
- bool valid_uvlast;
- CODE:
- s = (U8*)sv_2pvunicode(src,&srclen);
- e = s + srclen;
+ if (curCC != 0) {
+ if (seq_max < cc_pos + 1) { /* extend if need */
+ seq_max = cc_pos + CC_SEQ_STEP; /* new size */
+ if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
+ STRLEN i;
+ New(0, seq_ext, seq_max, UNF_cc);
+ for (i = 0; i < cc_pos; i++)
+ seq_ext[i] = seq_ary[i];
+ }
+ else {
+ Renew(seq_ext, seq_max, UNF_cc);
+ }
+ seq_ptr = seq_ext; /* till now use seq_ext */
+ }
- dstlen = srclen + 1;
- dst = newSV(dstlen);
- (void)SvPOK_only(dst);
- SvUTF8_on(dst);
- d = (U8*)SvPVX(dst);
+ seq_ptr[cc_pos].cc = curCC;
+ seq_ptr[cc_pos].uv = uv;
+ seq_ptr[cc_pos].pos = cc_pos;
+ ++cc_pos;
- stk_cc_max = 10; /* enough as an initial value? */
- New(0, stk_cc, stk_cc_max, UNF_cc);
+ if (p < e)
+ continue;
+ }
- for (p = s; p < e;) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
- if (!retlen)
- croak(ErrRetlenIsZero);
- p += retlen;
+ if (cc_pos) {
+ STRLEN i;
+
+ if (cc_pos > 1) /* reordered if there are two c.c.'s */
+ qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
+
+ for (i = 0; i < cc_pos; i++) {
+ d = uvuni_to_utf8(d, seq_ptr[i].uv);
+ if (dend < d) /* real end is dend + UTF8_MAXLEN */
+ croak(ErrLongerThanSrc, "reorder");
+ }
+ cc_pos = 0;
+ }
- curCC = getCombinClass(uv);
if (curCC == 0) {
d = uvuni_to_utf8(d, uv);
- continue;
+ if (dend < d) /* real end is dend + UTF8_MAXLEN */
+ croak(ErrLongerThanSrc, "reorder");
}
+ }
+ if (seq_ext)
+ Safefree(seq_ext);
+ return d;
+}
- cc_pos = 0;
- stk_cc[cc_pos].cc = curCC;
- stk_cc[cc_pos].uv = uv;
- stk_cc[cc_pos].pos = cc_pos;
+static
+U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig)
+{
+ U8* p = s;
+ U8* e = s + slen;
+ U8* dend = d + dlen;
+
+ UV uvS; /* code point of the starter */
+ bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
+ U8 preCC = 0;
+
+ UV seq_ary[CC_SEQ_SIZE];
+ UV* seq_ptr = seq_ary; /* use array at the beginning */
+ UV* seq_ext = NULL; /* extend if need */
+ STRLEN seq_max = CC_SEQ_SIZE;
+ STRLEN cc_pos = 0;
+
+ if (dlen < slen || dlen < slen + UTF8_MAXLEN)
+ croak(ErrTargetNotEnough, "compose");
+ dend -= UTF8_MAXLEN; /* safety */
+
+ while (p < e) {
+ U8 curCC;
+ STRLEN retlen;
+ UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ if (!retlen)
+ croak(ErrRetlenIsZero, "compose");
+ p += retlen;
- valid_uvlast = FALSE;
- while (p < e) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
- if (!retlen)
- croak(ErrRetlenIsZero);
- p += retlen;
+ curCC = getCombinClass(uv);
- curCC = getCombinClass(uv);
+ if (!valid_uvS) {
if (curCC == 0) {
- uvlast = uv;
- valid_uvlast = TRUE;
- break;
+ uvS = uv; /* the first Starter is found */
+ valid_uvS = TRUE;
+ if (p < e)
+ continue;
}
+ else {
+ d = uvuni_to_utf8(d, uv);
+ if (dend < d) /* real end is dend + UTF8_MAXLEN */
+ croak(ErrLongerThanSrc, "compose");
+ continue;
+ }
+ }
+ else {
+ bool composed;
+
+ /* blocked */
+ if (iscontig && cc_pos || /* discontiguous combination */
+ curCC != 0 && preCC == curCC || /* blocked by same CC */
+ preCC > curCC) /* blocked by higher CC: revised D2 */
+ composed = FALSE;
+
+ /* not blocked:
+ iscontig && cc_pos == 0 -- contiguous combination
+ curCC == 0 && preCC == 0 -- starter + starter
+ curCC != 0 && preCC < curCC -- lower CC */
+ else {
+ /* try composition */
+ UV uvComp = composite_uv(uvS, uv);
+
+ if (uvComp && !isExclusion(uvComp)) {
+ uvS = uvComp;
+ composed = TRUE;
- cc_pos++;
- if (stk_cc_max <= cc_pos) { /* extend if need */
- stk_cc_max = cc_pos + 1;
- Renew(stk_cc, stk_cc_max, UNF_cc);
+ /* preCC should not be changed to curCC */
+ /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
+ if (p < e)
+ continue;
+ }
+ else
+ composed = FALSE;
+ }
+
+ if (!composed) {
+ preCC = curCC;
+ if (curCC != 0 || !(p < e)) {
+ if (seq_max < cc_pos + 1) { /* extend if need */
+ seq_max = cc_pos + CC_SEQ_STEP; /* new size */
+ if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
+ New(0, seq_ext, seq_max, UV);
+ Copy(seq_ary, seq_ext, cc_pos, UV);
+ }
+ else {
+ Renew(seq_ext, seq_max, UV);
+ }
+ seq_ptr = seq_ext; /* till now use seq_ext */
+ }
+ seq_ptr[cc_pos] = uv;
+ ++cc_pos;
+ }
+ if (curCC != 0 && p < e)
+ continue;
}
- stk_cc[cc_pos].cc = curCC;
- stk_cc[cc_pos].uv = uv;
- stk_cc[cc_pos].pos = cc_pos;
}
- /* reordered if there are two c.c.'s */
+ d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
+ if (dend < d) /* real end is dend + UTF8_MAXLEN */
+ croak(ErrLongerThanSrc, "compose");
+
if (cc_pos) {
- qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
- }
+ STRLEN i;
- for (i = 0; i <= cc_pos; i++) {
- d = uvuni_to_utf8(d, stk_cc[i].uv);
- }
- if (valid_uvlast)
- {
- d = uvuni_to_utf8(d, uvlast);
+ for (i = 0; i < cc_pos; i++) {
+ d = uvuni_to_utf8(d, seq_ptr[i]);
+ if (dend < d) /* real end is dend + UTF8_MAXLEN */
+ croak(ErrLongerThanSrc, "compose");
+ }
+ cc_pos = 0;
}
+
+ uvS = uv;
}
- *d = '\0';
- SvCUR_set(dst, d - (U8*)SvPVX(dst));
- Safefree(stk_cc);
+ if (seq_ext)
+ Safefree(seq_ext);
+ return d;
+}
+
+MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
+
+SV*
+decompose(src, compat = &PL_sv_no)
+ SV * src
+ SV * compat
+ PROTOTYPE: $;$
+ PREINIT:
+ SV* dst;
+ U8 *s, *d, *dend;
+ STRLEN slen, dlen;
+ CODE:
+ s = (U8*)sv_2pvunicode(src,&slen);
+ dst = newSVpvn("", 0);
+ dlen = slen;
+ New(0, d, dlen+1, U8);
+ dend = pv_utf8_decompose(s, slen, &d, dlen, SvTRUE(compat));
+ sv_setpvn(dst, d, dend - d);
+ SvUTF8_on(dst);
+ Safefree(d);
RETVAL = dst;
OUTPUT:
RETVAL
-
+SV*
+reorder(src)
+ SV * src
+ PROTOTYPE: $
+ PREINIT:
+ SV* dst;
+ U8 *s, *d, *dend;
+ STRLEN slen, dlen;
+ CODE:
+ s = (U8*)sv_2pvunicode(src,&slen);
+ dst = newSVpvn("", 0);
+ dlen = slen + UTF8_MAXLEN;
+ d = (U8*)SvGROW(dst,dlen+1);
+ SvUTF8_on(dst);
+ dend = pv_utf8_reorder(s, slen, d, dlen);
+ *dend = '\0';
+ SvCUR_set(dst, dend - d);
+ RETVAL = dst;
+ OUTPUT:
+ RETVAL
SV*
compose(src)
@@ -336,96 +500,99 @@ compose(src)
ALIAS:
composeContiguous = 1
PREINIT:
- SV *dst, *tmp;
- U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
- UV uv, uvS, uvComp;
- STRLEN srclen, dstlen, tmplen, retlen;
- bool beginning = TRUE;
+ SV* dst;
+ U8 *s, *d, *dend;
+ STRLEN slen, dlen;
CODE:
- s = (U8*)sv_2pvunicode(src,&srclen);
- e = s + srclen;
-
- dstlen = srclen + 1;
- dst = newSV(dstlen);
- (void)SvPOK_only(dst);
+ s = (U8*)sv_2pvunicode(src,&slen);
+ dst = newSVpvn("", 0);
+ dlen = slen + UTF8_MAXLEN;
+ d = (U8*)SvGROW(dst,dlen+1);
SvUTF8_on(dst);
- d = (U8*)SvPVX(dst);
-
- /* for uncomposed combining char */
- tmp = sv_2mortal(newSV(dstlen));
- (void)SvPOK_only(tmp);
- SvUTF8_on(tmp);
-
- for (p = s; p < e;) {
- if (beginning) {
- uvS = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
- if (!retlen)
- croak(ErrRetlenIsZero);
- p += retlen;
-
- if (getCombinClass(uvS)) { /* no Starter found yet */
- d = uvuni_to_utf8(d, uvS);
- continue;
- }
- beginning = FALSE;
- }
+ dend = pv_utf8_compose(s, slen, d, dlen, (bool)ix);
+ *dend = '\0';
+ SvCUR_set(dst, dend - d);
+ RETVAL = dst;
+ OUTPUT:
+ RETVAL
- /* Starter */
- t = tmp_start = (U8*)SvPVX(tmp);
- preCC = 0;
+SV*
+NFD(src)
+ SV * src
+ PROTOTYPE: $
+ ALIAS:
+ NFKD = 1
+ PREINIT:
+ SV *dst;
+ U8 *s, *t, *tend, *d, *dend;
+ STRLEN slen, tlen, dlen;
+ CODE:
+ /* decompose */
+ s = (U8*)sv_2pvunicode(src,&slen);
+ tlen = slen;
+ New(0, t, tlen+1, U8);
+ tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)ix);
+ *tend = '\0';
+ tlen = tend - t; /* no longer know real tlen */
+
+ /* reorder */
+ dst = newSVpvn("", 0);
+ dlen = tlen + UTF8_MAXLEN;
+ d = (U8*)SvGROW(dst,dlen+1);
+ SvUTF8_on(dst);
+ dend = pv_utf8_reorder(t, tlen, d, dlen);
+ *dend = '\0';
+ SvCUR_set(dst, dend - d);
- /* to the next Starter */
- while (p < e) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
- if (!retlen)
- croak(ErrRetlenIsZero);
- p += retlen;
+ /* return */
+ Safefree(t);
+ RETVAL = dst;
+ OUTPUT:
+ RETVAL
- curCC = getCombinClass(uv);
+SV*
+NFC(src)
+ SV * src
+ PROTOTYPE: $
+ ALIAS:
+ NFKC = 1
+ FCC = 2
+ PREINIT:
+ SV *dst;
+ U8 *s, *t, *tend, *u, *uend, *d, *dend;
+ STRLEN slen, tlen, ulen, dlen;
+ CODE:
+ /* decompose */
+ s = (U8*)sv_2pvunicode(src,&slen);
+ tlen = slen;
+ New(0, t, tlen+1, U8);
+ tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix==1));
+ *tend = '\0';
+ tlen = tend - t; /* no longer know real tlen */
+
+ /* reorder */
+ ulen = tlen + UTF8_MAXLEN;
+ New(0, u, ulen+1, U8);
+ uend = pv_utf8_reorder(t, tlen, u, ulen);
+ *uend = '\0';
+ ulen = uend - u;
+
+ /* compose */
+ dst = newSVpvn("", 0);
+ dlen = ulen + UTF8_MAXLEN;
+ d = (U8*)SvGROW(dst,dlen+1);
+ SvUTF8_on(dst);
+ dend = pv_utf8_compose(u, ulen, d, dlen, (bool)(ix==2));
+ *dend = '\0';
+ SvCUR_set(dst, dend - d);
- if (preCC && preCC == curCC) {
- preCC = curCC;
- t = uvuni_to_utf8(t, uv);
- } else {
- uvComp = composite_uv(uvS, uv);
-
- if (uvComp && ! isExclusion(uvComp) &&
- (ix ? (t == tmp_start) : (preCC <= curCC))) {
- STRLEN leftcur, rightcur, dstcur;
- leftcur = UNISKIP(uvComp);
- rightcur = UNISKIP(uvS) + UNISKIP(uv);
-
- if (leftcur > rightcur) {
- dstcur = d - (U8*)SvPVX(dst);
- dstlen += leftcur - rightcur;
- d = (U8*)SvGROW(dst,dstlen) + dstcur;
- }
- /* preCC not changed to curCC */
- uvS = uvComp;
- } else if (! curCC && p < e) { /* blocked */
- break;
- } else {
- preCC = curCC;
- t = uvuni_to_utf8(t, uv);
- }
- }
- }
- d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
- tmplen = t - tmp_start;
- if (tmplen) { /* uncomposed combining char */
- t = (U8*)SvPVX(tmp);
- while (tmplen--)
- *d++ = *t++;
- }
- uvS = uv;
- } /* for */
- *d = '\0';
- SvCUR_set(dst, d - (U8*)SvPVX(dst));
+ /* return */
+ Safefree(t);
+ Safefree(u);
RETVAL = dst;
OUTPUT:
RETVAL
-
void
checkNFD(src)
SV * src
@@ -435,16 +602,15 @@ checkNFD(src)
PREINIT:
STRLEN srclen, retlen;
U8 *s, *e, *p, curCC, preCC;
- UV uv;
CODE:
s = (U8*)sv_2pvunicode(src,&srclen);
e = s + srclen;
preCC = 0;
for (p = s; p < e; p += retlen) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
- croak(ErrRetlenIsZero);
+ croak(ErrRetlenIsZero, "checkNFD or -NFKD");
curCC = getCombinClass(uv);
if (preCC > curCC && curCC != 0) /* canonical ordering violated */
@@ -466,7 +632,6 @@ checkNFC(src)
PREINIT:
STRLEN srclen, retlen;
U8 *s, *e, *p, curCC, preCC;
- UV uv;
bool isMAYBE;
CODE:
s = (U8*)sv_2pvunicode(src,&srclen);
@@ -475,12 +640,11 @@ checkNFC(src)
preCC = 0;
isMAYBE = FALSE;
for (p = s; p < e; p += retlen) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
- croak(ErrRetlenIsZero);
+ croak(ErrRetlenIsZero, "checkNFC or -NFKC");
curCC = getCombinClass(uv);
-
if (preCC > curCC && curCC != 0) /* canonical ordering violated */
XSRETURN_NO;
@@ -516,27 +680,29 @@ checkFCD(src)
ALIAS:
checkFCC = 1
PREINIT:
- STRLEN srclen, retlen, canlen, canret;
+ STRLEN srclen, retlen;
U8 *s, *e, *p, curCC, preCC;
- UV uv, uvLead, uvTrail;
- U8 *sCan, *pCan, *eCan;
bool isMAYBE;
CODE:
s = (U8*)sv_2pvunicode(src,&srclen);
e = s + srclen;
-
preCC = 0;
isMAYBE = FALSE;
for (p = s; p < e; p += retlen) {
- uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ U8 *sCan;
+ UV uvLead;
+ STRLEN canlen, canret;
+ UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
if (!retlen)
- croak(ErrRetlenIsZero);
+ croak(ErrRetlenIsZero, "checkFCD or -FCC");
sCan = (U8*) dec_canonical(uv);
if (sCan) {
canlen = (STRLEN)strlen((char *) sCan);
uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
+ if (!canret)
+ croak(ErrRetlenIsZero, "checkFCD or -FCC");
}
else {
uvLead = uv;
@@ -555,11 +721,14 @@ checkFCD(src)
}
if (sCan) {
- eCan = sCan + canlen;
- pCan = utf8_hop(eCan, -1);
+ UV uvTrail;
+ U8* eCan = sCan + canlen;
+ U8* pCan = utf8_hop(eCan, -1);
if (pCan < sCan)
croak(ErrHopBeforeStart);
uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
+ if (!canret)
+ croak(ErrRetlenIsZero, "checkFCD or -FCC");
preCC = getCombinClass(uvTrail);
}
else {
@@ -662,17 +831,14 @@ getCanon(uv)
PROTOTYPE: $
ALIAS:
getCompat = 1
- PREINIT:
- U8 * rstr;
CODE:
if (Hangul_IsS(uv)) {
- SV * dst;
- dst = newSV(1);
- (void)SvPOK_only(dst);
- sv_cat_decompHangul(dst, uv);
- RETVAL = dst;
+ U8 tmp[3 * UTF8_MAXLEN + 1];
+ U8 *t = tmp;
+ U8 *e = pv_cat_decompHangul(t, uv);
+ RETVAL = newSVpvn((char *)t, e - t);
} else {
- rstr = ix ? dec_compat(uv) : dec_canonical(uv);
+ U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);
if (!rstr)
XSRETURN_UNDEF;
RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
@@ -687,18 +853,18 @@ splitOnLastStarter(src)
SV * src
PREINIT:
SV *svp;
- STRLEN srclen, retlen;
+ STRLEN srclen;
U8 *s, *e, *p;
- UV uv;
PPCODE:
s = (U8*)sv_2pvunicode(src,&srclen);
e = s + srclen;
-
- for (p = e; s < p; ) {
+ p = e;
+ while (s < p) {
+ UV uv;
p = utf8_hop(p, -1);
if (p < s)
croak(ErrHopBeforeStart);
- uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
+ uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF);
if (getCombinClass(uv) == 0) /* Last Starter found */
break;
}
diff --git a/ext/Unicode/Normalize/README b/ext/Unicode/Normalize/README
index 34e24e3219..e70d7ea037 100644
--- a/ext/Unicode/Normalize/README
+++ b/ext/Unicode/Normalize/README
@@ -1,4 +1,4 @@
-Unicode/Normalize version 0.28
+Unicode/Normalize version 1.00
===================================
Unicode::Normalize - Unicode Normalization Forms
@@ -90,7 +90,7 @@ COPYRIGHT AND LICENCE
http://homepage1.nifty.com/nomenclator/perl/
- Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
+ Copyright(C) 2001-2006, SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.
diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader
index ff30759c95..165c1c4c85 100644
--- a/ext/Unicode/Normalize/mkheader
+++ b/ext/Unicode/Normalize/mkheader
@@ -363,7 +363,7 @@ EOF
next if ! $val{ $p };
for (my $r = 0; $r < 256; $r++) {
next if ! $val{ $p }{ $r };
- printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r;
+ printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
for (my $c = 0; $c < 256; $c++) {
print "\t", defined $val{$p}{$r}{$c}
? "($type)".$val{$p}{$r}{$c}
@@ -376,7 +376,7 @@ EOF
}
foreach my $p (sort { $a <=> $b } keys %val) {
next if ! $val{ $p };
- printf "$type* ${head}_%02x [256] = {\n", $p;
+ printf "static $type* ${head}_%02x [256] = {\n", $p;
for (my $r = 0; $r < 256; $r++) {
print $val{ $p }{ $r }
? sprintf("${head}_%02x_%02x", $p, $r)
@@ -386,7 +386,7 @@ EOF
}
print "};\n\n";
}
- print "$type** $head [] = {\n";
+ print "static $type** $head [] = {\n";
for (my $p = 0; $p <= 0x10; $p++) {
print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
print ',' if $p != 0x10;
diff --git a/ext/Unicode/Normalize/t/fcdc.t b/ext/Unicode/Normalize/t/fcdc.t
index ea10a64233..5fc78a5bee 100644
--- a/ext/Unicode/Normalize/t/fcdc.t
+++ b/ext/Unicode/Normalize/t/fcdc.t
@@ -19,53 +19,93 @@ BEGIN {
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 35 };
+BEGIN { plan tests => 68 };
use Unicode::Normalize qw(:all);
ok(1); # If we made it this far, we're ok.
-sub _pack_U { Unicode::Normalize::pack_U(@_) }
-sub _unpack_U { Unicode::Normalize::unpack_U(@_) }
+sub _pack_U { Unicode::Normalize::pack_U(@_) }
+sub hexU { _pack_U map hex, split ' ', shift }
sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" }
#########################
+ok(FCD(''), "");
+ok(FCC(''), "");
+ok(FCD('A'), "A");
+ok(FCC('A'), "A");
+
+ok(normalize('FCD', ""), "");
+ok(normalize('FCC', ""), "");
+ok(normalize('FCC', "A"), "A");
+ok(normalize('FCD', "A"), "A");
+
+# if checkFCD is YES, the return value from FCD should be the same as the original
+ok(FCD(hexU("00C5")), hexU("00C5")); # A with ring above
+ok(FCD(hexU("0041 030A")), hexU("0041 030A")); # A+ring
+ok(FCD(hexU("0041 0327 030A")), hexU("0041 0327 030A")); # A+cedilla+ring
+ok(FCD(hexU("AC01 1100 1161")), hexU("AC01 1100 1161")); # hangul
+ok(FCD(hexU("212B F900")), hexU("212B F900")); # compat
+
+ok(normalize('FCD', hexU("00C5")), hexU("00C5"));
+ok(normalize('FCD', hexU("0041 030A")), hexU("0041 030A"));
+ok(normalize('FCD', hexU("0041 0327 030A")), hexU("0041 0327 030A"));
+ok(normalize('FCD', hexU("AC01 1100 1161")), hexU("AC01 1100 1161"));
+ok(normalize('FCD', hexU("212B F900")), hexU("212B F900"));
+
+# if checkFCD is MAYBE or NO, FCD returns NFD (this behavior isn't documented)
+ok(FCD(hexU("00C5 0327")), hexU("0041 0327 030A"));
+ok(FCD(hexU("0041 030A 0327")), hexU("0041 0327 030A"));
+ok(FCD(hexU("00C5 0327")), NFD(hexU("00C5 0327")));
+ok(FCD(hexU("0041 030A 0327")), NFD(hexU("0041 030A 0327")));
+
+ok(normalize('FCD', hexU("00C5 0327")), hexU("0041 0327 030A"));
+ok(normalize('FCD', hexU("0041 030A 0327")), hexU("0041 0327 030A"));
+ok(normalize('FCD', hexU("00C5 0327")), NFD(hexU("00C5 0327")));
+ok(normalize('FCD', hexU("0041 030A 0327")), NFD(hexU("0041 030A 0327")));
+
ok(answer(checkFCD('')), 'YES');
ok(answer(checkFCD('A')), 'YES');
ok(answer(checkFCD("\x{030A}")), 'YES'); # 030A;COMBINING RING ABOVE
-ok(answer(checkFCD("\x{0327}")), 'YES'); # 0327;COMBINING CEDILLA
+ok(answer(checkFCD("\x{0327}")), 'YES'); # 0327;COMBINING CEDILLA
ok(answer(checkFCD(_pack_U(0x00C5))), 'YES'); # A with ring above
-ok(answer(checkFCD(_pack_U(0x41, 0x30A))), 'YES'); # A+ring
-ok(answer(checkFCD(_pack_U(0x41, 0x327, 0x30A))), 'YES'); # A+cedilla+ring
-ok(answer(checkFCD(_pack_U(0x41, 0x30A, 0x327))), 'NO'); # A+ring+cedilla
-ok(answer(checkFCD(_pack_U(0xC5, 0x0327))), 'NO'); # A-ring+cedilla
-ok(answer(checkNFC(_pack_U(0xC5, 0x0327))), 'MAYBE'); # NFC: A-ring+cedilla
-ok(answer(check("FCD", _pack_U(0xC5, 0x0327))), 'NO');
-ok(answer(check("NFC", _pack_U(0xC5, 0x0327))), 'MAYBE');
+ok(answer(checkFCD(hexU("0041 030A"))), 'YES'); # A+ring
+ok(answer(checkFCD(hexU("0041 0327 030A"))), 'YES'); # A+cedilla+ring
+ok(answer(checkFCD(hexU("0041 030A 0327"))), 'NO'); # A+ring+cedilla
+ok(answer(checkFCD(hexU("00C5 0327"))), 'NO'); # A-ring+cedilla
+ok(answer(checkNFC(hexU("00C5 0327"))), 'MAYBE'); # NFC: A-ring+cedilla
+ok(answer(check("FCD", hexU("00C5 0327"))), 'NO');
+ok(answer(check("NFC", hexU("00C5 0327"))), 'MAYBE');
ok(answer(checkFCD("\x{AC01}\x{1100}\x{1161}")), 'YES'); # hangul
ok(answer(checkFCD("\x{212B}\x{F900}")), 'YES'); # compat
-ok(FCD(''), "");
-ok(FCC(''), "");
-
-ok(FCD('A'), "A");
-ok(FCC('A'), "A");
+ok(answer(checkFCD(hexU("1EA7 05AE 0315 0062"))), "NO");
+ok(answer(checkFCC(hexU("1EA7 05AE 0315 0062"))), "NO");
+ok(answer(check('FCD', hexU("1EA7 05AE 0315 0062"))), "NO");
+ok(answer(check('FCC', hexU("1EA7 05AE 0315 0062"))), "NO");
-ok(answer(checkFCD(_pack_U(0x1EA7, 0x05AE, 0x0315, 0x0062))), "NO");
-ok(answer(checkFCC(_pack_U(0x1EA7, 0x05AE, 0x0315, 0x0062))), "NO");
-
-ok(FCC(_pack_U(0xC5, 0x327)), _pack_U(0x41, 0x327, 0x30A));
-ok(FCC(_pack_U(0x45, 0x304, 0x300)), _pack_U(0x1E14));
+ok(FCC(hexU("00C5 0327")), hexU("0041 0327 030A"));
+ok(FCC(hexU("0045 0304 0300")), "\x{1E14}");
ok(FCC("\x{1100}\x{1161}\x{1100}\x{1173}\x{11AF}"), "\x{AC00}\x{AE00}");
+ok(normalize('FCC', hexU("00C5 0327")), hexU("0041 0327 030A"));
+ok(normalize('FCC', hexU("0045 0304 0300")), "\x{1E14}");
+ok(normalize('FCC', hexU("1100 1161 1100 1173 11AF")), "\x{AC00}\x{AE00}");
+
+ok(FCC("\x{0B47}\x{0300}\x{0B3E}"), "\x{0B47}\x{0300}\x{0B3E}");
+ok(FCC("\x{1100}\x{0300}\x{1161}"), "\x{1100}\x{0300}\x{1161}");
+ok(FCC("\x{0B47}\x{0B3E}\x{0300}"), "\x{0B4B}\x{0300}");
+ok(FCC("\x{1100}\x{1161}\x{0300}"), "\x{AC00}\x{0300}");
+ok(FCC("\x{0B47}\x{300}\x{0B3E}\x{327}"), "\x{0B47}\x{300}\x{0B3E}\x{327}");
+ok(FCC("\x{1100}\x{300}\x{1161}\x{327}"), "\x{1100}\x{300}\x{1161}\x{327}");
ok(answer(checkFCC('')), 'YES');
ok(answer(checkFCC('A')), 'YES');
ok(answer(checkFCC("\x{030A}")), 'MAYBE'); # 030A;COMBINING RING ABOVE
ok(answer(checkFCC("\x{0327}")), 'MAYBE'); # 0327;COMBINING CEDILLA
-ok(answer(checkFCC(_pack_U(0x00C5))), 'YES'); # A with ring above
-ok(answer(checkFCC(_pack_U(0x41, 0x30A))), 'MAYBE'); # A+ring
-ok(answer(checkFCC(_pack_U(0x41, 0x327, 0x30A))), 'MAYBE'); # A+cedilla+ring
-ok(answer(checkFCC(_pack_U(0x41, 0x30A, 0x327))), 'NO'); # A+ring+cedilla
-ok(answer(checkFCC(_pack_U(0xC5, 0x0327))), 'NO'); # A-ring+cedilla
+ok(answer(checkFCC(hexU("00C5"))), 'YES'); # A with ring above
+ok(answer(checkFCC(hexU("0041 030A"))), 'MAYBE'); # A+ring
+ok(answer(checkFCC(hexU("0041 0327 030A"))), 'MAYBE'); # A+cedilla+ring
+ok(answer(checkFCC(hexU("0041 030A 0327"))), 'NO'); # A+ring+cedilla
+ok(answer(checkFCC(hexU("00C5 0327"))), 'NO'); # A-ring+cedilla
ok(answer(checkFCC("\x{AC01}\x{1100}\x{1161}")), 'MAYBE'); # hangul
ok(answer(checkFCC("\x{212B}\x{F900}")), 'NO'); # compat
diff --git a/ext/Unicode/Normalize/t/func.t b/ext/Unicode/Normalize/t/func.t
index 6dbf41bf1a..81421ce1f3 100644
--- a/ext/Unicode/Normalize/t/func.t
+++ b/ext/Unicode/Normalize/t/func.t
@@ -19,130 +19,295 @@ BEGIN {
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 13 };
+BEGIN { plan tests => 202 };
use Unicode::Normalize qw(:all);
ok(1); # If we made it this far, we're ok.
-sub _pack_U { Unicode::Normalize::pack_U(@_) }
-sub _unpack_U { Unicode::Normalize::unpack_U(@_) }
+sub _pack_U { Unicode::Normalize::pack_U(@_) }
+sub hexU { _pack_U map hex, split ' ', shift }
#########################
-print getCombinClass( 0) == 0
- && getCombinClass( 768) == 230
- && getCombinClass(1809) == 36
- && ($] < 5.007003 || getCombinClass(0x1D167) == 1) # Unicode 3.1
- ? "ok" : "not ok", " 2\n";
-
-print ! defined getCanon( 0)
- && ! defined getCanon(41)
- && getCanon(0x00C0) eq _pack_U(0x0041, 0x0300)
- && getCanon(0x00EF) eq _pack_U(0x0069, 0x0308)
- && getCanon(0x304C) eq _pack_U(0x304B, 0x3099)
- && getCanon(0x1EA4) eq _pack_U(0x0041, 0x0302, 0x0301)
- && getCanon(0x1F82) eq _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)
- && getCanon(0x1FAF) eq _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)
- && getCanon(0xAC00) eq _pack_U(0x1100, 0x1161)
- && getCanon(0xAE00) eq _pack_U(0x1100, 0x1173, 0x11AF)
- && ! defined getCanon(0x212C)
- && ! defined getCanon(0x3243)
- && getCanon(0xFA2D) eq _pack_U(0x9DB4)
- ? "ok" : "not ok", " 3\n";
-
-print ! defined getCompat( 0)
- && ! defined getCompat(41)
- && getCompat(0x00C0) eq _pack_U(0x0041, 0x0300)
- && getCompat(0x00EF) eq _pack_U(0x0069, 0x0308)
- && getCompat(0x304C) eq _pack_U(0x304B, 0x3099)
- && getCompat(0x1EA4) eq _pack_U(0x0041, 0x0302, 0x0301)
- && getCompat(0x1F82) eq _pack_U(0x03B1, 0x0313, 0x0300, 0x0345)
- && getCompat(0x1FAF) eq _pack_U(0x03A9, 0x0314, 0x0342, 0x0345)
- && getCompat(0x212C) eq _pack_U(0x0042)
- && getCompat(0x3243) eq _pack_U(0x0028, 0x81F3, 0x0029)
- && getCompat(0xAC00) eq _pack_U(0x1100, 0x1161)
- && getCompat(0xAE00) eq _pack_U(0x1100, 0x1173, 0x11AF)
- && getCompat(0xFA2D) eq _pack_U(0x9DB4)
- ? "ok" : "not ok", " 4\n";
-
-print ! defined getComposite( 0, 0)
- && ! defined getComposite( 0, 41)
- && ! defined getComposite(41, 0)
- && ! defined getComposite(41, 41)
- && ! defined getComposite(12, 0x0300)
- && ! defined getComposite(0x0055, 0xFF00)
- && 0x00C0 == getComposite(0x0041, 0x0300)
- && 0x00D9 == getComposite(0x0055, 0x0300)
- && 0x1E14 == getComposite(0x0112, 0x0300)
- && 0xAC00 == getComposite(0x1100, 0x1161)
- && 0xADF8 == getComposite(0x1100, 0x1173)
- && ! defined getComposite(0x1100, 0x11AF)
- && ! defined getComposite(0x1173, 0x11AF)
- && ! defined getComposite(0xAC00, 0x11A7)
- && 0xAC01 == getComposite(0xAC00, 0x11A8)
- && 0xAE00 == getComposite(0xADF8, 0x11AF)
- ? "ok" : "not ok", " 5\n";
-
-print ! isExclusion( 0)
- && ! isExclusion(41)
- && isExclusion(2392) # DEVANAGARI LETTER QA
- && isExclusion(3907) # TIBETAN LETTER GHA
- && isExclusion(64334) # HEBREW LETTER PE WITH RAFE
- ? "ok" : "not ok", " 6\n";
-
-print ! isSingleton( 0)
- && isSingleton(0x212B) # ANGSTROM SIGN
- ? "ok" : "not ok", " 7\n";
-
-print reorder("") eq ""
- && reorder(_pack_U(0x0041, 0x0300, 0x0315, 0x0313, 0x031b, 0x0061))
- eq _pack_U(0x0041, 0x031b, 0x0300, 0x0313, 0x0315, 0x0061)
- && reorder(_pack_U(0x00C1, 0x0300, 0x0315, 0x0313, 0x031b,
- 0x0061, 0x309A, 0x3099))
- eq _pack_U(0x00C1, 0x031b, 0x0300, 0x0313, 0x0315,
- 0x0061, 0x309A, 0x3099)
- ? "ok" : "not ok", " 8\n";
+ok(getCombinClass( 0), 0);
+ok(getCombinClass( 41), 0);
+ok(getCombinClass( 65), 0);
+ok(getCombinClass( 768), 230);
+ok(getCombinClass(1809), 36);
+
+ok(getCanon( 0), undef);
+ok(getCanon(0x29), undef);
+ok(getCanon(0x41), undef);
+ok(getCanon(0x00C0), _pack_U(0x0041, 0x0300));
+ok(getCanon(0x00EF), _pack_U(0x0069, 0x0308));
+ok(getCanon(0x304C), _pack_U(0x304B, 0x3099));
+ok(getCanon(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301));
+ok(getCanon(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345));
+ok(getCanon(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345));
+ok(getCanon(0xAC00), _pack_U(0x1100, 0x1161));
+ok(getCanon(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF));
+ok(getCanon(0x212C), undef);
+ok(getCanon(0x3243), undef);
+ok(getCanon(0xFA2D), _pack_U(0x9DB4));
+
+ok(getCompat( 0), undef);
+ok(getCompat(0x29), undef);
+ok(getCompat(0x41), undef);
+ok(getCompat(0x00C0), _pack_U(0x0041, 0x0300));
+ok(getCompat(0x00EF), _pack_U(0x0069, 0x0308));
+ok(getCompat(0x304C), _pack_U(0x304B, 0x3099));
+ok(getCompat(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301));
+ok(getCompat(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345));
+ok(getCompat(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345));
+ok(getCompat(0x212C), _pack_U(0x0042));
+ok(getCompat(0x3243), _pack_U(0x0028, 0x81F3, 0x0029));
+ok(getCompat(0xAC00), _pack_U(0x1100, 0x1161));
+ok(getCompat(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF));
+ok(getCompat(0xFA2D), _pack_U(0x9DB4));
+
+ok(getComposite( 0, 0), undef);
+ok(getComposite( 0, 0x29), undef);
+ok(getComposite(0x29, 0), undef);
+ok(getComposite(0x29, 0x29), undef);
+ok(getComposite( 0, 0x41), undef);
+ok(getComposite(0x41, 0), undef);
+ok(getComposite(0x41, 0x41), undef);
+ok(getComposite(12, 0x0300), undef);
+ok(getComposite(0x0055, 0xFF00), undef);
+ok(getComposite(0x0041, 0x0300), 0x00C0);
+ok(getComposite(0x0055, 0x0300), 0x00D9);
+ok(getComposite(0x0112, 0x0300), 0x1E14);
+ok(getComposite(0x1100, 0x1161), 0xAC00);
+ok(getComposite(0x1100, 0x1173), 0xADF8);
+ok(getComposite(0x1100, 0x11AF), undef);
+ok(getComposite(0x1173, 0x11AF), undef);
+ok(getComposite(0xAC00, 0x11A7), undef);
+ok(getComposite(0xAC00, 0x11A8), 0xAC01);
+ok(getComposite(0xADF8, 0x11AF), 0xAE00);
+
+sub uprops {
+ my $uv = shift;
+ my $r = "";
+ $r .= isExclusion($uv) ? 'X' : 'x';
+ $r .= isSingleton($uv) ? 'S' : 's';
+ $r .= isNonStDecomp($uv) ? 'N' : 'n'; # Non-Starter Decomposition
+ $r .= isComp_Ex($uv) ? 'F' : 'f'; # Full exclusion (X + S + N)
+ $r .= isComp2nd($uv) ? 'B' : 'b'; # B = M = Y
+ $r .= isNFD_NO($uv) ? 'D' : 'd';
+ $r .= isNFC_MAYBE($uv) ? 'M' : 'm'; # Maybe
+ $r .= isNFC_NO($uv) ? 'C' : 'c';
+ $r .= isNFKD_NO($uv) ? 'K' : 'k';
+ $r .= isNFKC_MAYBE($uv) ? 'Y' : 'y'; # maYbe
+ $r .= isNFKC_NO($uv) ? 'G' : 'g';
+ return $r;
+}
+
+ok(uprops(0x0000), 'xsnfbdmckyg');
+ok(uprops(0x0029), 'xsnfbdmckyg');
+ok(uprops(0x0041), 'xsnfbdmckyg');
+ok(uprops(0x00A0), 'xsnfbdmcKyG'); # NO-BREAK SPACE
+ok(uprops(0x00C0), 'xsnfbDmcKyg'); # LATIN CAPITAL LETTER A WITH GRAVE
+ok(uprops(0x0300), 'xsnfBdMckYg'); # COMBINING GRAVE ACCENT
+ok(uprops(0x0344), 'xsNFbDmCKyG'); # COMBINING GREEK DIALYTIKA TONOS
+ok(uprops(0x0387), 'xSnFbDmCKyG'); # GREEK ANO TELEIA
+ok(uprops(0x0958), 'XsnFbDmCKyG'); # DEVANAGARI LETTER QA
+ok(uprops(0x0F43), 'XsnFbDmCKyG'); # TIBETAN LETTER GHA
+ok(uprops(0x1100), 'xsnfbdmckyg'); # HANGUL CHOSEONG KIYEOK
+ok(uprops(0x1161), 'xsnfBdMckYg'); # HANGUL JUNGSEONG A
+ok(uprops(0x11AF), 'xsnfBdMckYg'); # HANGUL JONGSEONG RIEU
+ok(uprops(0x212B), 'xSnFbDmCKyG'); # ANGSTROM SIGN
+ok(uprops(0xAC00), 'xsnfbDmcKyg'); # HANGUL SYLLABLE GA
+ok(uprops(0xF900), 'xSnFbDmCKyG'); # CJK COMPATIBILITY IDEOGRAPH-F900
+ok(uprops(0xFB4E), 'XsnFbDmCKyG'); # HEBREW LETTER PE WITH RAFE
+ok(uprops(0xFF71), 'xsnfbdmcKyG'); # HALFWIDTH KATAKANA LETTER A
+
+ok(decompose(""), "");
+ok(decompose("A"), "A");
+ok(decompose("", 1), "");
+ok(decompose("A", 1), "A");
+
+ok(decompose(hexU("1E14 AC01")), hexU("0045 0304 0300 1100 1161 11A8"));
+ok(decompose(hexU("AC00 AE00")), hexU("1100 1161 1100 1173 11AF"));
+ok(decompose(hexU("304C FF76")), hexU("304B 3099 FF76"));
+
+ok(decompose(hexU("1E14 AC01"), 1), hexU("0045 0304 0300 1100 1161 11A8"));
+ok(decompose(hexU("AC00 AE00"), 1), hexU("1100 1161 1100 1173 11AF"));
+ok(decompose(hexU("304C FF76"), 1), hexU("304B 3099 30AB"));
+
+# don't modify the source
+my $sDec = "\x{FA19}";
+ok(decompose($sDec), "\x{795E}");
+ok($sDec, "\x{FA19}");
+
+ok(reorder(""), "");
+ok(reorder("A"), "A");
+ok(reorder(hexU("0041 0300 0315 0313 031b 0061")),
+ hexU("0041 031b 0300 0313 0315 0061"));
+ok(reorder(hexU("00C1 0300 0315 0313 031b 0061 309A 3099")),
+ hexU("00C1 031b 0300 0313 0315 0061 309A 3099"));
+
+# don't modify the source
+my $sReord = "\x{3000}\x{300}\x{31b}";
+ok(reorder($sReord), "\x{3000}\x{31b}\x{300}");
+ok($sReord, "\x{3000}\x{300}\x{31b}");
+
+ok(compose(""), "");
+ok(compose("A"), "A");
+ok(compose(hexU("0061 0300")), hexU("00E0"));
+ok(compose(hexU("0061 0300 031B")), hexU("00E0 031B"));
+ok(compose(hexU("0061 0300 0315")), hexU("00E0 0315"));
+ok(compose(hexU("0061 0300 0313")), hexU("00E0 0313"));
+ok(compose(hexU("0061 031B 0300")), hexU("00E0 031B"));
+ok(compose(hexU("0061 0315 0300")), hexU("0061 0315 0300"));
+ok(compose(hexU("0061 0313 0300")), hexU("0061 0313 0300"));
+
+# don't modify the source
+my $sCom = "\x{304B}\x{3099}";
+ok(compose($sCom), "\x{304C}");
+ok($sCom, "\x{304B}\x{3099}");
+
+ok(composeContiguous(""), "");
+ok(composeContiguous("A"), "A");
+ok(composeContiguous(hexU("0061 0300")), hexU("00E0"));
+ok(composeContiguous(hexU("0061 0300 031B")), hexU("00E0 031B"));
+ok(composeContiguous(hexU("0061 0300 0315")), hexU("00E0 0315"));
+ok(composeContiguous(hexU("0061 0300 0313")), hexU("00E0 0313"));
+ok(composeContiguous(hexU("0061 031B 0300")), hexU("0061 031B 0300"));
+ok(composeContiguous(hexU("0061 0315 0300")), hexU("0061 0315 0300"));
+ok(composeContiguous(hexU("0061 0313 0300")), hexU("0061 0313 0300"));
+
+# don't modify the source
+my $sCtg = "\x{30DB}\x{309A}";
+ok(composeContiguous($sCtg), "\x{30DD}");
+ok($sCtg, "\x{30DB}\x{309A}");
sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" }
-print answer(checkNFD("")) eq "YES"
- && answer(checkNFC("")) eq "YES"
- && answer(checkNFKD("")) eq "YES"
- && answer(checkNFKC("")) eq "YES"
- && answer(check("NFD", "")) eq "YES"
- && answer(check("NFC", "")) eq "YES"
- && answer(check("NFKD","")) eq "YES"
- && answer(check("NFKC","")) eq "YES"
+ok(answer(checkNFD("")), "YES");
+ok(answer(checkNFC("")), "YES");
+ok(answer(checkNFKD("")), "YES");
+ok(answer(checkNFKC("")), "YES");
+ok(answer(check("NFD", "")), "YES");
+ok(answer(check("NFC", "")), "YES");
+ok(answer(check("NFKD","")), "YES");
+ok(answer(check("NFKC","")), "YES");
+
# U+0000 to U+007F are prenormalized in all the normalization forms.
- && answer(checkNFD("AZaz\t12!#`")) eq "YES"
- && answer(checkNFC("AZaz\t12!#`")) eq "YES"
- && answer(checkNFKD("AZaz\t12!#`")) eq "YES"
- && answer(checkNFKC("AZaz\t12!#`")) eq "YES"
- && answer(check("D", "AZaz\t12!#`")) eq "YES"
- && answer(check("C", "AZaz\t12!#`")) eq "YES"
- && answer(check("KD","AZaz\t12!#`")) eq "YES"
- && answer(check("KC","AZaz\t12!#`")) eq "YES"
- ? "ok" : "not ok", " 9\n";
-
-print 1
- && answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))) eq "YES"
- && answer(checkNFD(_pack_U(0x20, 0xC1, 0x1100, 0x1173, 0x11AF))) eq "NO"
- && answer(checkNFC(_pack_U(0x20, 0xC1, 0x1173, 0x11AF))) eq "MAYBE"
- && answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100))) eq "YES"
- && answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100, 0x300))) eq "MAYBE"
- && answer(checkNFC(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))) eq "YES"
- && answer(check("NFC", _pack_U(0x20, 0xC1, 0x212B, 0x300))) eq "NO"
- && answer(checkNFKD(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))) eq "NO"
- && answer(checkNFKC(_pack_U(0x20, 0xC1, 0xAE00, 0x2025))) eq "NO"
- ? "ok" : "not ok", " 10\n";
+ok(answer(checkNFD("AZaz\t12!#`")), "YES");
+ok(answer(checkNFC("AZaz\t12!#`")), "YES");
+ok(answer(checkNFKD("AZaz\t12!#`")), "YES");
+ok(answer(checkNFKC("AZaz\t12!#`")), "YES");
+ok(answer(check("D", "AZaz\t12!#`")), "YES");
+ok(answer(check("C", "AZaz\t12!#`")), "YES");
+ok(answer(check("KD","AZaz\t12!#`")), "YES");
+ok(answer(check("KC","AZaz\t12!#`")), "YES");
+
+ok(answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))), "YES");
+ok(answer(checkNFD(_pack_U(0x20, 0xC1, 0x1100, 0x1173, 0x11AF))), "NO");
+ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0x1173, 0x11AF))), "MAYBE");
+ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100))), "YES");
+ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0xAE00, 0x1100, 0x300))), "MAYBE");
+ok(answer(checkNFC(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))), "YES");
+ok(answer(check("NFC", _pack_U(0x20, 0xC1, 0x212B, 0x300))), "NO");
+ok(answer(checkNFKD(_pack_U(0x20, 0xC1, 0xFF71, 0x2025))), "NO");
+ok(answer(checkNFKC(_pack_U(0x20, 0xC1, 0xAE00, 0x2025))), "NO");
"012ABC" =~ /(\d+)(\w+)/;
-print "012" eq NFC $1 && "ABC" eq NFC $2
- ? "ok" : "not ok", " 11\n";
+ok("012" eq NFC $1 && "ABC" eq NFC $2);
-print "012" eq normalize('C', $1) && "ABC" eq normalize('C', $2)
- ? "ok" : "not ok", " 12\n";
+ok(normalize('C', $1), "012");
+ok(normalize('C', $2), "ABC");
-print "012" eq normalize('NFC', $1) && "ABC" eq normalize('NFC', $2)
- ? "ok" : "not ok", " 13\n";
+ok(normalize('NFC', $1), "012");
+ok(normalize('NFC', $2), "ABC");
# s/^NF// in normalize() must not prevent using $1, $&, etc.
+# a string with a leading zero should be treated as a decimal number
+
+# LATIN CAPITAL LETTER A WITH GRAVE
+ok(getCombinClass("0192"), 0);
+ok(getCanon ("0192"), _pack_U(0x41, 0x300));
+ok(getCompat("0192"), _pack_U(0x41, 0x300));
+ok(getComposite("065", "0768"), 192);
+ok(isNFD_NO ("0192"));
+ok(isNFKD_NO("0192"));
+
+# DEVANAGARI LETTER QA
+ok(isExclusion("02392"));
+ok(isComp_Ex ("02392"));
+ok(isNFC_NO ("02392"));
+ok(isNFKC_NO ("02392"));
+ok(isNFD_NO ("02392"));
+ok(isNFKD_NO ("02392"));
+
+# ANGSTROM SIGN
+ok(isSingleton("08491"));
+ok(isComp_Ex ("08491"));
+ok(isNFC_NO ("08491"));
+ok(isNFKC_NO ("08491"));
+ok(isNFD_NO ("08491"));
+ok(isNFKD_NO ("08491"));
+
+# COMBINING GREEK DIALYTIKA TONOS
+ok(isNonStDecomp("0836"));
+ok(isComp_Ex ("0836"));
+ok(isNFC_NO ("0836"));
+ok(isNFKC_NO ("0836"));
+ok(isNFD_NO ("0836"));
+ok(isNFKD_NO ("0836"));
+
+# COMBINING GRAVE ACCENT
+ok(getCombinClass("0768"), 230);
+ok(isComp2nd ("0768"));
+ok(isNFC_MAYBE ("0768"));
+ok(isNFKC_MAYBE("0768"));
+
+# HANGUL SYLLABLE GA
+ok(getCombinClass("044032"), 0);
+ok(getCanon("044032"), _pack_U(0x1100, 0x1161));
+ok(getCompat("044032"), _pack_U(0x1100, 0x1161));
+ok(getComposite("04352", "04449"), 0xAC00);
+
+# string with 22 combining characters: (0x300..0x315)
+my $str_cc22 = _pack_U(0x3041, 0x300..0x315, 0x3042);
+ok(decompose($str_cc22), $str_cc22);
+ok(reorder($str_cc22), $str_cc22);
+ok(compose($str_cc22), $str_cc22);
+ok(composeContiguous($str_cc22), $str_cc22);
+ok(NFD($str_cc22), $str_cc22);
+ok(NFC($str_cc22), $str_cc22);
+ok(NFKD($str_cc22), $str_cc22);
+ok(NFKC($str_cc22), $str_cc22);
+ok(FCD($str_cc22), $str_cc22);
+ok(FCC($str_cc22), $str_cc22);
+
+# string with 40 combining characters of the same class: (0x300..0x313)x2
+my $str_cc40 = _pack_U(0x3041, 0x300..0x313, 0x300..0x313, 0x3042);
+ok(decompose($str_cc40), $str_cc40);
+ok(reorder($str_cc40), $str_cc40);
+ok(compose($str_cc40), $str_cc40);
+ok(composeContiguous($str_cc40), $str_cc40);
+ok(NFD($str_cc40), $str_cc40);
+ok(NFC($str_cc40), $str_cc40);
+ok(NFKD($str_cc40), $str_cc40);
+ok(NFKC($str_cc40), $str_cc40);
+ok(FCD($str_cc40), $str_cc40);
+ok(FCC($str_cc40), $str_cc40);
+
+my $precomp = hexU("304C 304E 3050 3052 3054");
+my $combseq = hexU("304B 3099 304D 3099 304F 3099 3051 3099 3053 3099");
+ok(decompose($precomp x 5), $combseq x 5);
+ok(decompose($precomp x 10), $combseq x 10);
+ok(decompose($precomp x 20), $combseq x 20);
+
+my $hangsyl = hexU("AC00 B098 B2E4 B77C B9C8");
+my $jamoseq = hexU("1100 1161 1102 1161 1103 1161 1105 1161 1106 1161");
+ok(decompose($hangsyl x 5), $jamoseq x 5);
+ok(decompose($hangsyl x 10), $jamoseq x 10);
+ok(decompose($hangsyl x 20), $jamoseq x 20);
+
+my $notcomp = hexU("304B 304D 304F 3051 3053");
+ok(decompose($precomp . $notcomp), $combseq . $notcomp);
+ok(decompose($precomp . $notcomp x 5), $combseq . $notcomp x 5);
+ok(decompose($precomp . $notcomp x10), $combseq . $notcomp x10);
+
+
diff --git a/ext/Unicode/Normalize/t/norm.t b/ext/Unicode/Normalize/t/norm.t
index a9399075ba..5d93747965 100644
--- a/ext/Unicode/Normalize/t/norm.t
+++ b/ext/Unicode/Normalize/t/norm.t
@@ -19,7 +19,7 @@ BEGIN {
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 29 };
+BEGIN { plan tests => 64 };
use Unicode::Normalize qw(normalize);
ok(1); # If we made it this far, we're ok.
@@ -28,8 +28,42 @@ sub _unpack_U { Unicode::Normalize::unpack_U(@_) }
#########################
-ok(normalize('C', ""), "");
ok(normalize('D', ""), "");
+ok(normalize('C', ""), "");
+ok(normalize('KD',""), "");
+ok(normalize('KC',""), "");
+
+ok(normalize('D', "A"), "A");
+ok(normalize('C', "A"), "A");
+ok(normalize('KD',"A"), "A");
+ok(normalize('KC',"A"), "A");
+
+ok(normalize('NFD', ""), "");
+ok(normalize('NFC', ""), "");
+ok(normalize('NFKD',""), "");
+ok(normalize('NFKC',""), "");
+
+ok(normalize('NFD', "A"), "A");
+ok(normalize('NFC', "A"), "A");
+ok(normalize('NFKD',"A"), "A");
+ok(normalize('NFKC',"A"), "A");
+
+# don't modify the source
+my $sNFD = "\x{FA19}";
+ok(normalize('NFD', $sNFD), "\x{795E}");
+ok($sNFD, "\x{FA19}");
+
+my $sNFC = "\x{FA1B}";
+ok(normalize('NFC', $sNFC), "\x{798F}");
+ok($sNFC, "\x{FA1B}");
+
+my $sNFKD = "\x{FA1E}";
+ok(normalize('NFKD', $sNFKD), "\x{7FBD}");
+ok($sNFKD, "\x{FA1E}");
+
+my $sNFKC = "\x{FA26}";
+ok(normalize('NFKC', $sNFKC), "\x{90FD}");
+ok($sNFKC, "\x{FA26}");
sub hexNFC {
join " ", map sprintf("%04X", $_),
@@ -40,6 +74,9 @@ sub hexNFD {
_unpack_U normalize 'D', _pack_U map hex, split ' ', shift;
}
+ok(hexNFD("1E14 AC01"), "0045 0304 0300 1100 1161 11A8");
+ok(hexNFD("AC00 AE00"), "1100 1161 1100 1173 11AF");
+
ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062");
ok(hexNFC("00E0 05AE 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062");
ok(hexNFC("0061 05AE 0300 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062");
@@ -73,3 +110,16 @@ ok(hexNFC("1100 1161 0300"), "AC00 0300");
ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327");
ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327");
+
+ok(hexNFC("0300 0041"), "0300 0041");
+ok(hexNFC("0300 0301 0041"), "0300 0301 0041");
+ok(hexNFC("0301 0300 0041"), "0301 0300 0041");
+ok(hexNFC("0000 0300 0000 0301"), "0000 0300 0000 0301");
+ok(hexNFC("0000 0301 0000 0300"), "0000 0301 0000 0300");
+
+ok(hexNFC("0327 0061 0300"), "0327 00E0");
+ok(hexNFC("0301 0061 0300"), "0301 00E0");
+ok(hexNFC("0315 0061 0300"), "0315 00E0");
+ok(hexNFC("0000 0327 0061 0300"), "0000 0327 00E0");
+ok(hexNFC("0000 0301 0061 0300"), "0000 0301 00E0");
+ok(hexNFC("0000 0315 0061 0300"), "0000 0315 00E0");
diff --git a/ext/Unicode/Normalize/t/null.t b/ext/Unicode/Normalize/t/null.t
index ae75752480..6067da4775 100644
--- a/ext/Unicode/Normalize/t/null.t
+++ b/ext/Unicode/Normalize/t/null.t
@@ -20,7 +20,7 @@ use strict;
use warnings;
use Unicode::Normalize qw(:all);
-print "1..8\n";
+print "1..24\n";
print "ok 1\n";
@@ -47,3 +47,51 @@ print /c$/ ? "ok" : "not ok", " 7\n";
$_ = NFKC('abc');
print /c$/ ? "ok" : "not ok", " 8\n";
+$_ = FCC('abc');
+print /c$/ ? "ok" : "not ok", " 9\n";
+
+$_ = decompose("\x{304C}abc");
+print /c$/ ? "ok" : "not ok", " 10\n";
+
+$_ = decompose("\x{304B}\x{3099}abc");
+print /c$/ ? "ok" : "not ok", " 11\n";
+
+$_ = reorder("\x{304C}abc");
+print /c$/ ? "ok" : "not ok", " 12\n";
+
+$_ = reorder("\x{304B}\x{3099}abc");
+print /c$/ ? "ok" : "not ok", " 13\n";
+
+$_ = compose("\x{304C}abc");
+print /c$/ ? "ok" : "not ok", " 14\n";
+
+$_ = compose("\x{304B}\x{3099}abc");
+print /c$/ ? "ok" : "not ok", " 15\n";
+
+$_ = NFD("\x{304C}abc");
+print /c$/ ? "ok" : "not ok", " 16\n";
+
+$_ = NFC("\x{304C}abc");
+print /c$/ ? "ok" : "not ok", " 17\n";
+
+$_ = NFKD("\x{304C}abc");
+print /c$/ ? "ok" : "not ok", " 18\n";
+
+$_ = NFKC("\x{304C}abc");
+print /c$/ ? "ok" : "not ok", " 19\n";
+
+$_ = FCC("\x{304C}abc");
+print /c$/ ? "ok" : "not ok", " 20\n";
+
+$_ = getCanon(0x100);
+print s/.$// ? "ok" : "not ok", " 21\n";
+
+$_ = getCompat(0x100);
+print s/.$// ? "ok" : "not ok", " 22\n";
+
+$_ = getCanon(0xAC00);
+print s/.$// ? "ok" : "not ok", " 23\n";
+
+$_ = getCompat(0xAC00);
+print s/.$// ? "ok" : "not ok", " 24\n";
+
diff --git a/ext/Unicode/Normalize/t/test.t b/ext/Unicode/Normalize/t/test.t
index 8e3369f58a..e07f6f0925 100644
--- a/ext/Unicode/Normalize/t/test.t
+++ b/ext/Unicode/Normalize/t/test.t
@@ -19,7 +19,7 @@ BEGIN {
use Test;
use strict;
use warnings;
-BEGIN { plan tests => 31 };
+BEGIN { plan tests => 58 };
use Unicode::Normalize;
ok(1); # If we made it this far, we're ok.
@@ -28,8 +28,34 @@ sub _unpack_U { Unicode::Normalize::unpack_U(@_) }
#########################
-ok(NFC(""), "");
ok(NFD(""), "");
+ok(NFC(""), "");
+ok(NFKD(""), "");
+ok(NFKC(""), "");
+
+ok(NFD("A"), "A");
+ok(NFC("A"), "A");
+ok(NFKD("A"), "A");
+ok(NFKC("A"), "A");
+
+# don't modify the source
+# (each function must leave its argument string unchanged)
+my $sNFD = "\x{FA19}";
+ok(NFD($sNFD), "\x{795E}");
+ok($sNFD, "\x{FA19}");
+
+my $sNFC = "\x{FA1B}";
+ok(NFC($sNFC), "\x{798F}");
+ok($sNFC, "\x{FA1B}");
+
+my $sNFKD = "\x{FA1E}";
+ok(NFKD($sNFKD), "\x{7FBD}");
+ok($sNFKD, "\x{FA1E}");
+
+my $sNFKC = "\x{FA26}";
+ok(NFKC($sNFKC), "\x{90FD}");
+ok($sNFKC, "\x{FA26}");
+
sub hexNFC {
join " ", map sprintf("%04X", $_),
@@ -40,6 +66,9 @@ sub hexNFD {
_unpack_U NFD _pack_U map hex, split ' ', shift;
}
+ok(hexNFD("1E14 AC01"), "0045 0304 0300 1100 1161 11A8");
+ok(hexNFD("AC00 AE00"), "1100 1161 1100 1173 11AF");
+
ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062");
ok(hexNFC("00E0 05AE 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062");
ok(hexNFC("0061 05AE 0300 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062");
@@ -67,13 +96,24 @@ ok(hexNFC("AC00 11C3"), "AC00 11C3");
# cf. http://www.unicode.org/review/pr-29.html
ok(hexNFC("0B47 0300 0B3E"), "0B47 0300 0B3E");
ok(hexNFC("1100 0300 1161"), "1100 0300 1161");
-
ok(hexNFC("0B47 0B3E 0300"), "0B4B 0300");
ok(hexNFC("1100 1161 0300"), "AC00 0300");
-
ok(hexNFC("0B47 0300 0B3E 0327"), "0B47 0300 0B3E 0327");
ok(hexNFC("1100 0300 1161 0327"), "1100 0300 1161 0327");
+ok(hexNFC("0300 0041"), "0300 0041");
+ok(hexNFC("0300 0301 0041"), "0300 0301 0041");
+ok(hexNFC("0301 0300 0041"), "0301 0300 0041");
+ok(hexNFC("0000 0300 0000 0301"), "0000 0300 0000 0301");
+ok(hexNFC("0000 0301 0000 0300"), "0000 0301 0000 0300");
+
+ok(hexNFC("0327 0061 0300"), "0327 00E0");
+ok(hexNFC("0301 0061 0300"), "0301 00E0");
+ok(hexNFC("0315 0061 0300"), "0315 00E0");
+ok(hexNFC("0000 0327 0061 0300"), "0000 0327 00E0");
+ok(hexNFC("0000 0301 0061 0300"), "0000 0301 00E0");
+ok(hexNFC("0000 0315 0061 0300"), "0000 0315 00E0");
+
# NFC() should be unary.
my $str11 = _pack_U(0x41, 0x0302, 0x0301, 0x62);
my $str12 = _pack_U(0x1EA4, 0x62);