summaryrefslogtreecommitdiff
path: root/ext/Unicode/Normalize
diff options
context:
space:
mode:
Diffstat (limited to 'ext/Unicode/Normalize')
-rw-r--r--ext/Unicode/Normalize/Changes38
-rw-r--r--ext/Unicode/Normalize/Makefile.PL15
-rw-r--r--ext/Unicode/Normalize/Normalize.pm45
-rw-r--r--ext/Unicode/Normalize/Normalize.pod89
-rw-r--r--ext/Unicode/Normalize/Normalize.xs378
-rw-r--r--ext/Unicode/Normalize/README66
-rw-r--r--ext/Unicode/Normalize/mkheader284
-rw-r--r--ext/Unicode/Normalize/t/func.t69
-rw-r--r--ext/Unicode/Normalize/t/norm.t43
-rw-r--r--ext/Unicode/Normalize/t/test.t43
10 files changed, 1070 insertions, 0 deletions
diff --git a/ext/Unicode/Normalize/Changes b/ext/Unicode/Normalize/Changes
new file mode 100644
index 0000000000..bf17449ab2
--- /dev/null
+++ b/ext/Unicode/Normalize/Changes
@@ -0,0 +1,38 @@
+Revision history for Perl extension Unicode::Normalize.
+
+0.10 Sat Nov 03 16:30:20 2001
+ - The XS version is now independent of Lingua::KO::Hangul::Util.
+ (though the Non-XS version still requires that.)
+
+0.09 Fri Nov 02 22:39:30 2001
+ - remove pTHX_.
+
+0.08 Thu Nov 01 23:20:42 2001
+ - use Lingua::KO::Hangul::Util 0.06 and remove "hangul.h".
+
+0.07 Wed Oct 31 22:06:42 2001
+ - modify internal. decompose() - reorder() - compose().
+
+0.06 Sun Oct 28 14:28:46 2001
+ - an XS version.
+ (but the Non-XS version is also supported.)
+
+0.05 Wed Oct 10 22:02:15 2001 (not released)
+ - %Compos contains unnecessary singletons
+ (though it did not cause any bug, only useless).
+ They will not be stored.
+
+0.04 Wed Aug 15 19:02:41 2001
+ - fix: NFD("") and NFKD("") must return "", not but undef.
+
+0.03 Fri Aug 10 22:44:18 2001
+ - rename the module name to Unicode::Normalize.
+ - normalize takes two arguments.
+
+0.02 Thu Aug 9 22:56:36 2001
+ - add function normalize
+
+0.01 Mon Aug 6 21:45:11 2001
+ - original version; created by h2xs 1.21 with options
+ -A -X -n Text::Unicode::Normalize
+
diff --git a/ext/Unicode/Normalize/Makefile.PL b/ext/Unicode/Normalize/Makefile.PL
new file mode 100644
index 0000000000..88ab9b7b63
--- /dev/null
+++ b/ext/Unicode/Normalize/Makefile.PL
@@ -0,0 +1,15 @@
+use ExtUtils::MakeMaker;
+
+# This is not the CPAN Unicode::Normalize makefile
+# that can handle XS-NOXS installing. We do just XS.
+
+do "mkheader";
+
+WriteMakefile(
+ 'NAME' => 'Unicode::Normalize',
+ 'VERSION_FROM' => 'Normalize.pm', # finds $VERSION
+ ($] >= 5.005 ? ## Add these new keywords supported since 5.005
+ (ABSTRACT_FROM => 'Normalize.pod', # retrieve abstract from module
+ AUTHOR => 'SADAHIRO Tomoyuki <SADAHIRO@cpan.org>') : ()),
+ clean => {FILES=> 'unfcan.h unfcmb.h unfcmp.h unfcpt.h unfexc.h'},
+);
diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm
new file mode 100644
index 0000000000..a583425a3b
--- /dev/null
+++ b/ext/Unicode/Normalize/Normalize.pm
@@ -0,0 +1,45 @@
+package Unicode::Normalize;
+
+use 5.006;
+use strict;
+use warnings;
+use Carp;
+
+our $VERSION = '0.10';
+our $PACKAGE = __PACKAGE__;
+
+require Exporter;
+require DynaLoader;
+require AutoLoader;
+
+our @ISA = qw(Exporter DynaLoader);
+our @EXPORT = qw( NFC NFD NFKC NFKD );
+our @EXPORT_OK = qw( normalize decompose reorder compose
+ getCanon getCompat getComposite getCombinClass getExclusion);
+our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
+
+bootstrap Unicode::Normalize $VERSION;
+
+use constant CANON => 0;
+use constant COMPAT => 1;
+
+sub NFD ($) { reorder(decompose($_[0], CANON)) }
+
+sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
+
+sub NFC ($) { compose(reorder(decompose($_[0], CANON))) }
+
+sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
+
+sub normalize($$)
+{
+ my $form = shift;
+ $form eq 'D' || $form eq 'NFD' ? NFD ($_[0]) :
+ $form eq 'C' || $form eq 'NFC' ? NFC ($_[0]) :
+ $form eq 'KD' || $form eq 'NFKD' ? NFKD($_[0]) :
+ $form eq 'KC' || $form eq 'NFKC' ? NFKC($_[0]) :
+ croak $PACKAGE."::normalize: invalid form name: $form";
+}
+
+1;
+__END__
diff --git a/ext/Unicode/Normalize/Normalize.pod b/ext/Unicode/Normalize/Normalize.pod
new file mode 100644
index 0000000000..4ac8966a83
--- /dev/null
+++ b/ext/Unicode/Normalize/Normalize.pod
@@ -0,0 +1,89 @@
+
+=head1 NAME
+
+Unicode::Normalize - normalized forms of Unicode text
+
+=head1 SYNOPSIS
+
+ use Unicode::Normalize;
+
+ $string_NFD = NFD($raw_string); # Normalization Form D
+ $string_NFC = NFC($raw_string); # Normalization Form C
+ $string_NFKD = NFKD($raw_string); # Normalization Form KD
+ $string_NFKC = NFKC($raw_string); # Normalization Form KC
+
+ or
+
+ use Unicode::Normalize 'normalize';
+
+ $string_NFD = normalize('D', $raw_string); # Normalization Form D
+ $string_NFC = normalize('C', $raw_string); # Normalization Form C
+ $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
+ $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
+
+=head1 DESCRIPTION
+
+=over 4
+
+=item C<$string_NFD = NFD($raw_string)>
+
+returns the Normalization Form D (formed by canonical decomposition).
+
+
+=item C<$string_NFC = NFC($raw_string)>
+
+returns the Normalization Form C (formed by canonical decomposition
+followed by canonical composition).
+
+=item C<$string_NFKD = NFKD($raw_string)>
+
+returns the Normalization Form KD (formed by compatibility decomposition).
+
+=item C<$string_NFKC = NFKC($raw_string)>
+
+returns the Normalization Form KC (formed by compatibility decomposition
+followed by B<canonical> composition).
+
+=item C<$normalized_string = normalize($form_name, $raw_string)>
+
+As C<$form_name>, one of the following names must be given.
+
+ 'C' or 'NFC' for Normalization Form C
+ 'D' or 'NFD' for Normalization Form D
+ 'KC' or 'NFKC' for Normalization Form KC
+ 'KD' or 'NFKD' for Normalization Form KD
+
+=back
+
+=head2 EXPORT
+
+C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
+
+C<normalize>: on request.
+
+=head1 AUTHOR
+
+SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
+
+ http://homepage1.nifty.com/nomenclator/perl/
+
+ Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+=over 4
+
+=item L<Lingua::KO::Hangul::Util>
+
+utility functions for Hangul Syllables
+
+=item http://www.unicode.org/unicode/reports/tr15/
+
+Unicode Normalization Forms - UAX #15
+
+=back
+
+=cut
diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs
new file mode 100644
index 0000000000..aca08538fb
--- /dev/null
+++ b/ext/Unicode/Normalize/Normalize.xs
@@ -0,0 +1,378 @@
+
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+
+/* These 5 files are prepared by mkheader */
+#include "unfcmb.h"
+#include "unfcan.h"
+#include "unfcpt.h"
+#include "unfcmp.h"
+#include "unfexc.h"
+
+/* Perl 5.6.1 ? */
+#ifndef uvuni_to_utf8
+#define uvuni_to_utf8 uv_to_utf8
+#endif /* uvuni_to_utf8 */
+
+/* Perl 5.6.1 ? */
+#ifndef utf8n_to_uvchr
+#define utf8n_to_uvchr utf8_to_uv
+#endif /* utf8n_to_uvchr */
+
+/* At present, char > 0x10ffff are unaffected without complaint, right? */
+#define VALID_UTF_MAX (0x10ffff)
+#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
+
+/* HANGUL_H */
+#define Hangul_SBase 0xAC00
+#define Hangul_SFinal 0xD7A3
+#define Hangul_SCount 11172
+
+#define Hangul_NCount 588
+
+#define Hangul_LBase 0x1100
+#define Hangul_LFinal 0x1112
+#define Hangul_LCount 19
+
+#define Hangul_VBase 0x1161
+#define Hangul_VFinal 0x1175
+#define Hangul_VCount 21
+
+#define Hangul_TBase 0x11A7
+#define Hangul_TFinal 0x11C2
+#define Hangul_TCount 28
+
+#define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
+#define Hangul_IsN(u) (! (((u) - Hangul_SBase) % Hangul_TCount))
+#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
+#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
+#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
+#define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
+/* HANGUL_H */
+
+/* this is used for canonical ordering of combining characters (c.c.). */
+typedef struct {
+ U8 cc; /* combining class */
+ UV uv; /* codepoint */
+ STRLEN pos; /* position */
+} UNF_cc;
+
+int compare_cc(const void *a, const void *b)
+{
+ int ret_cc;
+ ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc;
+ if(ret_cc) return ret_cc;
+ return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos;
+}
+
+U8* dec_canonical (UV uv)
+{
+ U8 ***plane, **row;
+ if(OVER_UTF_MAX(uv)) return NULL;
+ plane = (U8***)UNF_canon[uv >> 16];
+ if(! plane) return NULL;
+ row = plane[(uv >> 8) & 0xff];
+ return row ? row[uv & 0xff] : NULL;
+}
+
+U8* dec_compat (UV uv)
+{
+ U8 ***plane, **row;
+ if(OVER_UTF_MAX(uv)) return NULL;
+ plane = (U8***)UNF_compat[uv >> 16];
+ if(! plane) return NULL;
+ row = plane[(uv >> 8) & 0xff];
+ return row ? row[uv & 0xff] : NULL;
+}
+
+UV getComposite (UV uv, UV uv2)
+{
+ UNF_complist ***plane, **row, *cell, *i;
+
+ if(! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) return 0;
+
+ if(Hangul_IsL(uv) && Hangul_IsV(uv2)) {
+ uv -= Hangul_LBase; /* lindex */
+ uv2 -= Hangul_VBase; /* vindex */
+ return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
+ }
+ if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
+ uv2 -= Hangul_TBase; /* tindex */
+ return (uv + uv2);
+ }
+ plane = UNF_compos[uv >> 16];
+ if(! plane) return 0;
+ row = plane[(uv >> 8) & 0xff];
+ if(! row) return 0;
+ cell = row[uv & 0xff];
+ if(! cell) return 0;
+ for(i = cell; i->nextchar; i++) {
+ if(uv2 == i->nextchar) return i->composite;
+ }
+ return 0;
+}
+
+U8 getCombinClass (UV uv)
+{
+ U8 **plane, *row;
+ if(OVER_UTF_MAX(uv)) return 0;
+ plane = (U8**)UNF_combin[uv >> 16];
+ if(! plane) return 0;
+ row = plane[(uv >> 8) & 0xff];
+ return row ? row[uv & 0xff] : 0;
+}
+
+void sv_cat_decompHangul (SV* sv, UV uv)
+{
+ UV sindex, lindex, vindex, tindex;
+ U8 *t, temp[3 * UTF8_MAXLEN + 1];
+
+ if(! Hangul_IsS(uv)) return;
+
+ sindex = uv - Hangul_SBase;
+ lindex = sindex / Hangul_NCount;
+ vindex = (sindex % Hangul_NCount) / Hangul_TCount;
+ tindex = sindex % Hangul_TCount;
+
+ t = temp;
+ t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
+ t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
+ if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
+ *t = '\0';
+ sv_catpvn(sv, (char *)temp, strlen((char *)temp));
+}
+
+MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
+
+
+SV*
+decompose(arg, compat)
+ SV * arg
+ SV * compat
+ PROTOTYPE: $
+ PREINIT:
+ SV *src, *dst;
+ STRLEN srclen, dstlen, retlen;
+ U8 *s, *e, *p, *d, *r;
+ UV uv;
+ bool iscompat;
+ CODE:
+ if(SvUTF8(arg)) {
+ src = arg;
+ } else {
+ src = sv_mortalcopy(arg);
+ sv_utf8_upgrade(src);
+ }
+
+ iscompat = SvTRUE(compat);
+
+ dst = newSV(1);
+ (void)SvPOK_only(dst);
+ SvUTF8_on(dst);
+
+ s = (U8*)SvPV(src,srclen);
+ e = s + srclen;
+ for(p = s; p < e;){
+ uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
+ p += retlen;
+ if(Hangul_IsS(uv)) sv_cat_decompHangul(dst, uv);
+ else {
+ r = iscompat ? dec_compat(uv) : dec_canonical(uv);
+ if(r) sv_catpv(dst, (char *)r);
+ else sv_catpvn(dst, (char *)p - retlen, retlen);
+ }
+ }
+ RETVAL = dst;
+ OUTPUT:
+ RETVAL
+
+
+
+SV*
+reorder(arg)
+ SV * arg
+ PROTOTYPE: $
+ PREINIT:
+ SV *src;
+ STRLEN srclen, retlen, stk_cc_max;
+ U8 *s, *e, *p, curCC;
+ UV uv;
+ UNF_cc * stk_cc;
+ CODE:
+ src = newSVsv(arg);
+ if(! SvUTF8(arg)) sv_utf8_upgrade(src);
+
+ stk_cc_max = 10; /* enough as an initial value? */
+ New(0, stk_cc, stk_cc_max, UNF_cc);
+
+ s = (U8*)SvPV(src,srclen);
+ e = s + srclen;
+ for(p = s; p < e;){
+ U8 *cc_in;
+ STRLEN cc_len, cc_iter, cc_pos;
+
+ uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
+ p += retlen;
+ cc_pos = 0;
+ curCC = getCombinClass(uv);
+ if(! (curCC && p < e)) continue; else cc_in = p - retlen;
+
+ stk_cc[cc_pos].cc = curCC;
+ stk_cc[cc_pos].uv = uv;
+ stk_cc[cc_pos].pos = cc_pos;
+
+ while(p < e) {
+ uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
+ curCC = getCombinClass(uv);
+ if(!curCC) break;
+ p += retlen;
+ cc_pos++;
+ if(stk_cc_max <= cc_pos) { /* extend if need */
+ stk_cc_max = cc_pos + 1;
+ Renew(stk_cc, stk_cc_max, UNF_cc);
+ }
+ stk_cc[cc_pos].cc = curCC;
+ stk_cc[cc_pos].uv = uv;
+ stk_cc[cc_pos].pos = cc_pos;
+ }
+
+ /* only one c.c. in cc_len from cc_in, no need of reordering */
+ if(!cc_pos) continue;
+
+ qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
+
+ cc_len = p - cc_in;
+ p = cc_in;
+ for(cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
+ p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
+ }
+ }
+ Safefree(stk_cc);
+ RETVAL = src;
+ OUTPUT:
+ RETVAL
+
+
+
+void
+compose(arg)
+ SV * arg
+ PROTOTYPE: $
+ PREINIT:
+ SV *src, *dst, *tmp;
+ U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
+ UV uv, uvS, uvComp;
+ STRLEN srclen, dstlen, tmplen, dstcur, retlen;
+ bool beginning = TRUE;
+ PPCODE:
+ if(SvUTF8(arg)) {
+ src = arg;
+ } else {
+ src = sv_mortalcopy(arg);
+ sv_utf8_upgrade(src);
+ }
+ s = (U8*)SvPV(src, srclen);
+ e = s + srclen;
+ dstlen = srclen + 1; /* equal or shorter, XXX */
+ dst = sv_2mortal(newSV(dstlen));
+ (void)SvPOK_only(dst);
+ SvUTF8_on(dst);
+ d = (U8*)SvPVX(dst);
+
+ /* for uncomposed combining char */
+ tmp = sv_2mortal(newSV(dstlen));
+ (void)SvPOK_only(tmp);
+ SvUTF8_on(tmp);
+
+ for(p = s; p < e;){
+ if(beginning) {
+ uvS = utf8n_to_uvchr(p, e - p, &retlen, 0);
+ p += retlen;
+
+ if (getCombinClass(uvS)){ /* no Starter found yet */
+ d = uvuni_to_utf8(d, uvS);
+ continue;
+ }
+ beginning = FALSE;
+ }
+
+ /* Starter */
+ t = tmp_start = (U8*)SvPVX(tmp);
+ preCC = 0;
+
+ /* to the next Starter */
+ while(p < e) {
+ uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
+ p += retlen;
+ curCC = getCombinClass(uv);
+
+ if(preCC && preCC == curCC) {
+ preCC = curCC;
+ t = uvuni_to_utf8(t, uv);
+ } else {
+ uvComp = getComposite(uvS, uv);
+
+ /* S + C + S => S-S + C would be also blocked. */
+ if( uvComp && ! getExclusion(uvComp) && preCC <= curCC)
+ {
+ /* preCC not changed to curCC */
+ uvS = uvComp;
+ } else if (! curCC && p < e) { /* blocked */
+ break;
+ } else {
+ preCC = curCC;
+ t = uvuni_to_utf8(t, uv);
+ }
+ }
+ }
+ d = uvuni_to_utf8(d, uvS); /* composed char */
+ if(tmplen = t - tmp_start) { /* uncomposed combining char */
+ t = (U8*)SvPVX(tmp);
+ while(tmplen--) *d++ = *t++;
+ }
+ uvS = uv;
+ } /* for */
+ dstcur = d - (U8*)SvPVX(dst);
+ SvCUR_set(dst, dstcur);
+ XPUSHs(dst);
+
+
+
+U8
+getCombinClass(uv)
+ UV uv
+
+bool
+getExclusion(uv)
+ UV uv
+
+UV
+getComposite(uv, uv2)
+ UV uv
+ UV uv2
+
+SV*
+getCanon(uv)
+ UV uv
+ PROTOTYPE: $
+ ALIAS:
+ getCompat = 1
+ PREINIT:
+ U8 * rstr;
+ CODE:
+ if(Hangul_IsS(uv)) {
+ SV * dst;
+ dst = newSV(1);
+ (void)SvPOK_only(dst);
+ sv_cat_decompHangul(dst, uv);
+ RETVAL = dst;
+ } else {
+ rstr = ix ? dec_compat(uv) : dec_canonical(uv);
+ if(!rstr) XSRETURN_UNDEF;
+ RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
+ }
+ SvUTF8_on(RETVAL);
+ OUTPUT:
+ RETVAL
+
diff --git a/ext/Unicode/Normalize/README b/ext/Unicode/Normalize/README
new file mode 100644
index 0000000000..3f0c4240fe
--- /dev/null
+++ b/ext/Unicode/Normalize/README
@@ -0,0 +1,66 @@
+Unicode/Normalize version 0.10
+===================================
+
+Unicode::Normalize - normalized forms of Unicode text
+
+SYNOPSIS
+
+ use Unicode::Normalize;
+
+ $string_NFD = NFD($string); # Normalization Form D
+ $string_NFC = NFC($string); # Normalization Form C
+ $string_NFKD = NFKD($string); # Normalization Form KD
+ $string_NFKC = NFKC($string); # Normalization Form KC
+
+ or
+
+ use Unicode::Normalize 'normalize';
+
+ $string_NFD = normalize('D', $string); # Normalization Form D
+ $string_NFC = normalize('C', $string); # Normalization Form C
+ $string_NFKD = normalize('KD', $string); # Normalization Form KD
+ $string_NFKC = normalize('KC', $string); # Normalization Form KC
+
+INSTALLATION
+
+Perl 5.006 or later
+
+To install this module type the following:
+
+ perl Makefile.PL
+ make
+ make test
+ make install
+
+If you have a C compiler and want to use the XS version,
+type the following:
+
+ perl Makefile.PL xs
+ make
+ make test
+ make install
+
+DEPENDENCIES
+
+This module requires these other modules and libraries:
+
+Carp
+Exporter
+File::Copy
+File::Spec
+Lingua::KO::Hangul::Util 0.06
+unicore/CombiningClass.pl or unicode/CombiningClass.pl
+unicore/Decomposition.pl or unicode/Decomposition.pl
+unicore/CompExcl.txt or unicode/CompExcl.txt
+
+COPYRIGHT AND LICENCE
+
+SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
+
+ http://homepage1.nifty.com/nomenclator/perl/
+
+ Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the same terms as Perl itself.
+
diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader
new file mode 100644
index 0000000000..85d2b90e62
--- /dev/null
+++ b/ext/Unicode/Normalize/mkheader
@@ -0,0 +1,284 @@
+#!perl
+#
+# This script generates "unfcan.h", "unfcpt.h", "unfcmb.h",
+# "unfcmp.h", and "unfexc.h"
+# from CombiningClass.pl, Decomposition.pl, CompExcl.txt
+# in lib/unicore or unicode directory
+# for Unicode::Normalize.xs. (cf. Makefile.PL)
+#
+use 5.006;
+use strict;
+use warnings;
+use Carp;
+
+our $PACKAGE = 'Unicode::Normalize, mkheader';
+
+our $Combin = do "unicore/CombiningClass.pl"
+ || do "unicode/CombiningClass.pl"
+ || croak "$PACKAGE: CombiningClass.pl not found";
+
+our $Decomp = do "unicore/Decomposition.pl"
+ || do "unicode/Decomposition.pl"
+ || croak "$PACKAGE: Decomposition.pl not found";
+
+our %Combin; # $codepoint => $number : combination class
+our %Canon; # $codepoint => $hexstring : canonical decomp.
+our %Compat; # $codepoint => $hexstring : compat. decomp.
+our %Compos; # $string => $codepoint : composite
+
+our %Exclus; # $codepoint => 1 : composition exclusions
+
+{
+ my($f, $fh);
+ foreach my $d (@INC) {
+ use File::Spec;
+ $f = File::Spec->catfile($d, "unicore", "CompExcl.txt");
+ last if open($fh, $f);
+ $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
+ last if open($fh, $f);
+ $f = undef;
+ }
+ croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f;
+ while(<$fh>) {
+ next if /^#/ or /^$/;
+ s/#.*//;
+ $Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/;
+ }
+ close $fh;
+}
+
+while($Combin =~ /(.+)/g) {
+ my @tab = split /\t/, $1;
+ my $ini = hex $tab[0];
+ if($tab[1] eq '') {
+ $Combin{ $ini } = $tab[2];
+ } else {
+ $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
+ }
+}
+
+while($Decomp =~ /(.+)/g) {
+ my @tab = split /\t/, $1;
+ my $compat = $tab[2] =~ s/<[^>]+>//;
+ my $dec = [ _getHexArray($tab[2]) ]; # decomposition
+ my $com = pack('U*', @$dec); # composable sequence
+ my $ini = hex($tab[0]);
+ if($tab[1] eq '') {
+ $Compat{ $ini } = $dec;
+ if(! $compat) {
+ $Canon{ $ini } = $dec;
+ $Compos{ $com } = $ini if @$dec > 1;
+ }
+ } else {
+ foreach my $u ($ini .. hex($tab[1])){
+ $Compat{ $u } = $dec;
+ if(! $compat){
+ $Canon{ $u } = $dec;
+ $Compos{ $com } = $ini if @$dec > 1;
+ }
+ }
+ }
+}
+
+# exhaustive decomposition
+foreach my $key (keys %Canon) {
+ $Canon{$key} = [ getCanonList($key) ];
+}
+
+# exhaustive decomposition
+foreach my $key (keys %Compat) {
+ $Compat{$key} = [ getCompatList($key) ];
+}
+
+sub getCanonList {
+ my @src = @_;
+ my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src;
+ join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
+ # condition @src == @dec is not ok.
+}
+
+sub getCompatList {
+ my @src = @_;
+ my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src;
+ join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
+ # condition @src == @dec is not ok.
+}
+
+sub _getHexArray {
+ my $str = shift;
+ map hex(), $str =~ /([0-9A-Fa-f]+)/g;
+}
+
+sub _U_stringify {
+ sprintf '"%s"', join '',
+ map sprintf("\\x%2x", $_), unpack 'C*', pack 'U*', @_;
+}
+
+foreach my $hash (\%Canon, \%Compat) {
+ foreach my $key (keys %$hash) {
+ $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
+ }
+}
+
+sub utf8len {
+ my $uv = shift;
+ return $uv < 0x80 ? 1 :
+ $uv < 0x800 ? 2 :
+ $uv < 0x10000 ? 3 :
+ $uv < 0x110000 ? 4 :
+ croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff.";
+}
+
+my $prefix = "UNF_";
+
+my $structname = "${prefix}complist";
+
+our (%Comp1st, %CompList);
+
+foreach(sort keys %Compos) {
+ my @a = unpack('U*', $_);
+ my $val = $Compos{$_};
+ my $name = sprintf "${structname}_%06x", $a[0];
+ $Comp1st{ $a[0] } = $name;
+ $CompList{ $name }{ $a[1] } = $val;
+
+ if( utf8len($a[0]) + utf8len($a[1]) < utf8len($val) ) {
+ croak "$PACKAGE: "
+ . "composable pair is longer than the composite in bytes!\n"
+ . sprintf("%d + %d => %d", $a[0], $a[1], $val);
+ }
+}
+
+my $compinit =
+ "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
+
+foreach my $i (sort keys %CompList) {
+ $compinit .= "$structname $i [] = {\n";
+ $compinit .= join ",\n",
+ map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
+ sort {$a <=> $b } keys %{ $CompList{$i} };
+ $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
+}
+
+####################################
+
+my @Exclus = sort {$a <=> $b} keys %Exclus;
+
+my $file = "unfexc.h";
+open FH, ">$file" or croak "$PACKAGE: $file can't be made";
+binmode FH; select FH;
+
+print "bool getExclusion (UV uv) \n{\nreturn\n\t";
+
+while(@Exclus) {
+ my $cur = shift @Exclus;
+ if(@Exclus && $cur + 1 == $Exclus[0]) {
+ print "$cur <= uv && uv <= ";
+ while(@Exclus && $cur + 1 == $Exclus[0]) {
+ $cur = shift @Exclus;
+ }
+ print $cur;
+ print "\n\t|| " if @Exclus;
+ } else {
+ print "uv == $cur";
+ print "\n\t|| " if @Exclus;
+ }
+}
+
+print "\n\t? TRUE : FALSE;\n}\n\n";
+close FH;
+
+####################################
+
+my @tripletable = (
+ {
+ file => "unfcmb",
+ name => "combin",
+ type => "char",
+ hash => \%Combin,
+ null => 0,
+ },
+ {
+ file => "unfcan",
+ name => "canon",
+ type => "char*",
+ hash => \%Canon,
+ null => "NULL",
+ },
+ {
+ file => "unfcpt",
+ name => "compat",
+ type => "char*",
+ hash => \%Compat,
+ null => "NULL",
+ },
+ {
+ file => "unfcmp",
+ name => "compos",
+ type => "$structname *",
+ hash => \%Comp1st,
+ null => "NULL",
+ init => $compinit,
+ },
+);
+
+foreach my $tbl (@tripletable) {
+ my $file = "$tbl->{file}.h";
+ my $head = "${prefix}$tbl->{name}";
+ my $type = $tbl->{type};
+ my $hash = $tbl->{hash};
+ my $null = $tbl->{null};
+ my $init = $tbl->{init};
+
+ open FH, ">$file" or croak "$PACKAGE: $file can't be made";
+ binmode FH; select FH;
+ my %val;
+
+ print FH << 'EOF';
+/*
+ * This file is auto-generated by mkheader.
+ * Any changes here will be lost!
+ */
+EOF
+
+ print $init if defined $init;
+
+ foreach my $uv (keys %$hash) {
+ my @c = unpack 'CCCC', pack 'N', $uv;
+ $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
+ }
+
+ foreach my $p (sort { $a <=> $b } keys %val) {
+ next if ! $val{ $p };
+ for(my $r = 0; $r < 256; $r++){
+ next if ! $val{ $p }{ $r };
+ printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r;
+ for(my $c = 0; $c < 256; $c++){
+ print "\t", defined $val{$p}{$r}{$c} ? $val{$p}{$r}{$c} : $null;
+ print ',' if $c != 255;
+ print "\n" if $c % 8 == 7;
+ }
+ print "};\n\n";
+ }
+ }
+ foreach my $p (sort { $a <=> $b } keys %val) {
+ next if ! $val{ $p };
+ printf "$type* ${head}_%02x [256] = {\n", $p;
+ for(my $r = 0; $r < 256; $r++){
+ print $val{ $p }{ $r } ? sprintf("${head}_%02x_%02x", $p, $r) : "NULL";
+ print ',' if $r != 255;
+ print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
+ }
+ print "};\n\n";
+ }
+ print "$type** $head [] = {\n";
+ for(my $p = 0; $p <= 0x10; $p++){
+ print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
+ print ',' if $p != 0x10;
+ print "\n";
+ }
+ print "};\n\n";
+ close FH;
+}
+
+__END__
diff --git a/ext/Unicode/Normalize/t/func.t b/ext/Unicode/Normalize/t/func.t
new file mode 100644
index 0000000000..8907634c47
--- /dev/null
+++ b/ext/Unicode/Normalize/t/func.t
@@ -0,0 +1,69 @@
+# Before `make install' is performed this script should be runnable with
+# `make test'. After `make install' it should work as `perl test.pl'
+
+#########################
+
+use Test;
+use strict;
+use warnings;
+BEGIN { plan tests => 6 };
+use Unicode::Normalize qw(:all);
+ok(1); # If we made it this far, we're ok.
+
+#########################
+
+print getCombinClass( 0) == 0
+ && getCombinClass( 768) == 230
+ && getCombinClass(1809) == 36
+# && getCombinClass(119143) == 1
+ ? "ok" : "not ok", " 2\n";
+
+print ! defined getCanon( 0)
+ && ! defined getCanon(41)
+ && getCanon(0x00C0) eq pack('U*', 0x0041, 0x0300)
+ && getCanon(0x00EF) eq pack('U*', 0x0069, 0x0308)
+ && getCanon(0x304C) eq pack('U*', 0x304B, 0x3099)
+ && getCanon(0x1EA4) eq pack('U*', 0x0041, 0x0302, 0x0301)
+ && getCanon(0x1FAF) eq pack('U*', 0x03A9, 0x0314, 0x0342, 0x0345)
+ && getCanon(0xAC00) eq pack('U*', 0x1100, 0x1161)
+ && getCanon(0xAE00) eq pack('U*', 0x1100, 0x1173, 0x11AF)
+ && ! defined getCanon(0x212C)
+ && ! defined getCanon(0x3243)
+ && getCanon(0xFA2D) eq pack('U*', 0x9DB4)
+ ? "ok" : "not ok", " 3\n";
+
+print ! defined getCompat( 0)
+ && ! defined getCompat(41)
+ && getCompat(0x00C0) eq pack('U*', 0x0041, 0x0300)
+ && getCompat(0x00EF) eq pack('U*', 0x0069, 0x0308)
+ && getCompat(0x304C) eq pack('U*', 0x304B, 0x3099)
+ && getCompat(0x1EA4) eq pack('U*', 0x0041, 0x0302, 0x0301)
+ && getCompat(0x1FAF) eq pack('U*', 0x03A9, 0x0314, 0x0342, 0x0345)
+ && getCompat(0x212C) eq pack('U*', 0x0042)
+ && getCompat(0x3243) eq pack('U*', 0x0028, 0x81F3, 0x0029)
+ && getCompat(0xAC00) eq pack('U*', 0x1100, 0x1161)
+ && getCompat(0xAE00) eq pack('U*', 0x1100, 0x1173, 0x11AF)
+ && getCompat(0xFA2D) eq pack('U*', 0x9DB4)
+ ? "ok" : "not ok", " 4\n";
+
+print ! getComposite( 0, 0)
+ && ! getComposite( 0, 41)
+ && ! getComposite(41, 0)
+ && ! getComposite(41, 41)
+ && ! getComposite(12, 0x0300)
+ && ! getComposite(0x0055, 0xFF00)
+ && 0x00D9 == getComposite(0x0055, 0x0300)
+ && 0x1E14 == getComposite(0x0112, 0x0300)
+ && 0xAC00 == getComposite(0x1100, 0x1161)
+ && 0xADF8 == getComposite(0x1100, 0x1173)
+ && ! getComposite(0x1100, 0x11AF)
+ && ! getComposite(0x1173, 0x11AF)
+ && 0xAE00 == getComposite(0xADF8, 0x11AF)
+ ? "ok" : "not ok", " 5\n";
+
+print ! getExclusion( 0)
+ && ! getExclusion(41)
+ && getExclusion(2392)
+ && getExclusion(3907)
+ && getExclusion(64334)
+ ? "ok" : "not ok", " 6\n";
diff --git a/ext/Unicode/Normalize/t/norm.t b/ext/Unicode/Normalize/t/norm.t
new file mode 100644
index 0000000000..1de2e7fcb8
--- /dev/null
+++ b/ext/Unicode/Normalize/t/norm.t
@@ -0,0 +1,43 @@
+# Before `make install' is performed this script should be runnable with
+# `make test'. After `make install' it should work as `perl test.pl'
+
+#########################
+
+use Test;
+use strict;
+use warnings;
+BEGIN { plan tests => 18 };
+use Unicode::Normalize qw(normalize);
+ok(1); # If we made it this far, we're ok.
+
+#########################
+
+ok(normalize('C', ""), "");
+ok(normalize('D', ""), "");
+
+sub hexNFC {
+ join " ", map sprintf("%04X", $_),
+ unpack 'U*', normalize 'C', pack 'U*', map hex(), split ' ', shift;
+}
+sub hexNFD {
+ join " ", map sprintf("%04X", $_),
+ unpack 'U*', normalize 'D', pack 'U*', map hex(), split ' ', shift;
+}
+
+ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062");
+ok(hexNFC("00E0 05AE 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062");
+ok(hexNFC("0061 05AE 0300 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062");
+ok(hexNFC("0045 0304 0300 AC00 11A8"), "1E14 AC01");
+ok(hexNFC("1100 1161 1100 1173 11AF"), "AC00 AE00");
+ok(hexNFC("1100 0300 1161 1173 11AF"), "1100 0300 1161 1173 11AF");
+
+ok(hexNFD("0061 0315 0300 05AE 05C4 0062"), "0061 05AE 0300 05C4 0315 0062");
+ok(hexNFD("00E0 05AE 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062");
+ok(hexNFD("0061 05AE 0300 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062");
+ok(hexNFC("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062");
+ok(hexNFC("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062");
+ok(hexNFD("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062");
+ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062");
+ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000");
+ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000");
+
diff --git a/ext/Unicode/Normalize/t/test.t b/ext/Unicode/Normalize/t/test.t
new file mode 100644
index 0000000000..5544a3b13b
--- /dev/null
+++ b/ext/Unicode/Normalize/t/test.t
@@ -0,0 +1,43 @@
+# Before `make install' is performed this script should be runnable with
+# `make test'. After `make install' it should work as `perl test.pl'
+
+#########################
+
+use Test;
+use strict;
+use warnings;
+BEGIN { plan tests => 18 };
+use Unicode::Normalize;
+ok(1); # If we made it this far, we're ok.
+
+#########################
+
+ok(NFC(""), "");
+ok(NFD(""), "");
+
+sub hexNFC {
+ join " ", map sprintf("%04X", $_),
+ unpack 'U*', NFC pack 'U*', map hex(), split ' ', shift;
+}
+sub hexNFD {
+ join " ", map sprintf("%04X", $_),
+ unpack 'U*', NFD pack 'U*', map hex(), split ' ', shift;
+}
+
+ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062");
+ok(hexNFC("00E0 05AE 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062");
+ok(hexNFC("0061 05AE 0300 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062");
+ok(hexNFC("0045 0304 0300 AC00 11A8"), "1E14 AC01");
+ok(hexNFC("1100 1161 1100 1173 11AF"), "AC00 AE00");
+ok(hexNFC("1100 0300 1161 1173 11AF"), "1100 0300 1161 1173 11AF");
+
+ok(hexNFD("0061 0315 0300 05AE 05C4 0062"), "0061 05AE 0300 05C4 0315 0062");
+ok(hexNFD("00E0 05AE 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062");
+ok(hexNFD("0061 05AE 0300 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062");
+ok(hexNFC("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062");
+ok(hexNFC("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062");
+ok(hexNFD("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062");
+ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062");
+ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000");
+ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000");
+