diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2001-11-09 00:23:40 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2001-11-09 00:23:40 +0000 |
commit | ac5ea53171baa7dab1a92df1eacf8d2fe19cbdbb (patch) | |
tree | 5ce2221e6a7335594861f38233d4d665313a7c82 /ext | |
parent | 61a515a61510e728f2014674d12cb94cb5a90834 (diff) | |
download | perl-ac5ea53171baa7dab1a92df1eacf8d2fe19cbdbb.tar.gz |
Upgrade to Unicode::Normalize 0.10, now in XS.
The CPAN distribution has both pm and XS implementations,
and for performance reasons we choose the XS.
Another reason to choose the XS is that it doesn't
require Lingua::KO::Hangul::Util, which means that
we can delete that-- which in turn means that Unicode::UCD
cannot expect that: support it, but don't expect.
Ditto Unicode::Collate.
Note that Unicode::Normalize Makefile.PL and
Normalize.xs have been modified from the CPAN 0.10
versions: the first one to be simpler (no pm) and
clean up the generated unf*.h files, the second one
to quench compiler grumblings. Must notify Sadahiro
about these changes.
p4raw-id: //depot/perl@12909
Diffstat (limited to 'ext')
-rw-r--r-- | ext/Unicode/Normalize/Changes | 38 | ||||
-rw-r--r-- | ext/Unicode/Normalize/Makefile.PL | 15 | ||||
-rw-r--r-- | ext/Unicode/Normalize/Normalize.pm | 45 | ||||
-rw-r--r-- | ext/Unicode/Normalize/Normalize.pod | 89 | ||||
-rw-r--r-- | ext/Unicode/Normalize/Normalize.xs | 378 | ||||
-rw-r--r-- | ext/Unicode/Normalize/README | 66 | ||||
-rw-r--r-- | ext/Unicode/Normalize/mkheader | 284 | ||||
-rw-r--r-- | ext/Unicode/Normalize/t/func.t | 69 | ||||
-rw-r--r-- | ext/Unicode/Normalize/t/norm.t | 43 | ||||
-rw-r--r-- | ext/Unicode/Normalize/t/test.t | 43 |
10 files changed, 1070 insertions, 0 deletions
diff --git a/ext/Unicode/Normalize/Changes b/ext/Unicode/Normalize/Changes new file mode 100644 index 0000000000..bf17449ab2 --- /dev/null +++ b/ext/Unicode/Normalize/Changes @@ -0,0 +1,38 @@ +Revision history for Perl extension Unicode::Normalize. + +0.10 Sat Nov 03 16:30:20 2001 + - The XS version is now independent of Lingua::KO::Hangul::Util. + (though the Non-XS version still requires that.) + +0.09 Fri Nov 02 22:39:30 2001 + - remove pTHX_. + +0.08 Thu Nov 01 23:20:42 2001 + - use Lingua::KO::Hangul::Util 0.06 and remove "hangul.h". + +0.07 Wed Oct 31 22:06:42 2001 + - modify internal. decompose() - reorder() - compose(). + +0.06 Sun Oct 28 14:28:46 2001 + - an XS version. + (but the Non-XS version is also supported.) + +0.05 Wed Oct 10 22:02:15 2001 (not released) + - %Compos contains unnecessary singletons + (though it did not cause any bug, only useless). + They will not be stored. + +0.04 Wed Aug 15 19:02:41 2001 + - fix: NFD("") and NFKD("") must return "", not but undef. + +0.03 Fri Aug 10 22:44:18 2001 + - rename the module name to Unicode::Normalize. + - normalize takes two arguments. + +0.02 Thu Aug 9 22:56:36 2001 + - add function normalize + +0.01 Mon Aug 6 21:45:11 2001 + - original version; created by h2xs 1.21 with options + -A -X -n Text::Unicode::Normalize + diff --git a/ext/Unicode/Normalize/Makefile.PL b/ext/Unicode/Normalize/Makefile.PL new file mode 100644 index 0000000000..88ab9b7b63 --- /dev/null +++ b/ext/Unicode/Normalize/Makefile.PL @@ -0,0 +1,15 @@ +use ExtUtils::MakeMaker; + +# This is not the CPAN Unicode::Normalize makefile +# that can handle XS-NOXS installing. We do just XS. + +do "mkheader"; + +WriteMakefile( + 'NAME' => 'Unicode::Normalize', + 'VERSION_FROM' => 'Normalize.pm', # finds $VERSION + ($] >= 5.005 ? ## Add these new keywords supported since 5.005 + (ABSTRACT_FROM => 'Normalize.pod', # retrieve abstract from module + AUTHOR => 'SADAHIRO Tomoyuki <SADAHIRO@cpan.org>') : ()), + clean => {FILES=> 'unfcan.h unfcmb.h unfcmp.h unfcpt.h unfexc.h'}, +); diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm new file mode 100644 index 0000000000..a583425a3b --- /dev/null +++ b/ext/Unicode/Normalize/Normalize.pm @@ -0,0 +1,45 @@ +package Unicode::Normalize; + +use 5.006; +use strict; +use warnings; +use Carp; + +our $VERSION = '0.10'; +our $PACKAGE = __PACKAGE__; + +require Exporter; +require DynaLoader; +require AutoLoader; + +our @ISA = qw(Exporter DynaLoader); +our @EXPORT = qw( NFC NFD NFKC NFKD ); +our @EXPORT_OK = qw( normalize decompose reorder compose + getCanon getCompat getComposite getCombinClass getExclusion); +our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] ); + +bootstrap Unicode::Normalize $VERSION; + +use constant CANON => 0; +use constant COMPAT => 1; + +sub NFD ($) { reorder(decompose($_[0], CANON)) } + +sub NFKD ($) { reorder(decompose($_[0], COMPAT)) } + +sub NFC ($) { compose(reorder(decompose($_[0], CANON))) } + +sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) } + +sub normalize($$) +{ + my $form = shift; + $form eq 'D' || $form eq 'NFD' ? NFD ($_[0]) : + $form eq 'C' || $form eq 'NFC' ? NFC ($_[0]) : + $form eq 'KD' || $form eq 'NFKD' ? NFKD($_[0]) : + $form eq 'KC' || $form eq 'NFKC' ? NFKC($_[0]) : + croak $PACKAGE."::normalize: invalid form name: $form"; +} + +1; +__END__ diff --git a/ext/Unicode/Normalize/Normalize.pod b/ext/Unicode/Normalize/Normalize.pod new file mode 100644 index 0000000000..4ac8966a83 --- /dev/null +++ b/ext/Unicode/Normalize/Normalize.pod @@ -0,0 +1,89 @@ + +=head1 NAME + +Unicode::Normalize - normalized forms of Unicode text + +=head1 SYNOPSIS + + use Unicode::Normalize; + + $string_NFD = NFD($raw_string); # Normalization Form D + $string_NFC = NFC($raw_string); # Normalization Form C + $string_NFKD = NFKD($raw_string); # Normalization Form KD + $string_NFKC = NFKC($raw_string); # Normalization Form KC + + or + + use Unicode::Normalize 'normalize'; + + $string_NFD = normalize('D', $raw_string); # Normalization Form D + $string_NFC = normalize('C', $raw_string); # Normalization Form C + $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD + $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC + +=head1 DESCRIPTION + +=over 4 + +=item C<$string_NFD = NFD($raw_string)> + +returns the Normalization Form D (formed by canonical decomposition). + + +=item C<$string_NFC = NFC($raw_string)> + +returns the Normalization Form C (formed by canonical decomposition +followed by canonical composition). + +=item C<$string_NFKD = NFKD($raw_string)> + +returns the Normalization Form KD (formed by compatibility decomposition). + +=item C<$string_NFKC = NFKC($raw_string)> + +returns the Normalization Form KC (formed by compatibility decomposition +followed by B<canonical> composition). + +=item C<$normalized_string = normalize($form_name, $raw_string)> + +As C<$form_name>, one of the following names must be given. + + 'C' or 'NFC' for Normalization Form C + 'D' or 'NFD' for Normalization Form D + 'KC' or 'NFKC' for Normalization Form KC + 'KD' or 'NFKD' for Normalization Form KD + +=back + +=head2 EXPORT + +C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. + +C<normalize>: on request. + +=head1 AUTHOR + +SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt> + + http://homepage1.nifty.com/nomenclator/perl/ + + Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved. + + This program is free software; you can redistribute it and/or + modify it under the same terms as Perl itself. + +=head1 SEE ALSO + +=over 4 + +=item L<Lingua::KO::Hangul::Util> + +utility functions for Hangul Syllables + +=item http://www.unicode.org/unicode/reports/tr15/ + +Unicode Normalization Forms - UAX #15 + +=back + +=cut diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs new file mode 100644 index 0000000000..aca08538fb --- /dev/null +++ b/ext/Unicode/Normalize/Normalize.xs @@ -0,0 +1,378 @@ + +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" + +/* These 5 files are prepared by mkheader */ +#include "unfcmb.h" +#include "unfcan.h" +#include "unfcpt.h" +#include "unfcmp.h" +#include "unfexc.h" + +/* Perl 5.6.1 ? */ +#ifndef uvuni_to_utf8 +#define uvuni_to_utf8 uv_to_utf8 +#endif /* uvuni_to_utf8 */ + +/* Perl 5.6.1 ? */ +#ifndef utf8n_to_uvchr +#define utf8n_to_uvchr utf8_to_uv +#endif /* utf8n_to_uvchr */ + +/* At present, char > 0x10ffff are unaffected without complaint, right? */ +#define VALID_UTF_MAX (0x10ffff) +#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) + +/* HANGUL_H */ +#define Hangul_SBase 0xAC00 +#define Hangul_SFinal 0xD7A3 +#define Hangul_SCount 11172 + +#define Hangul_NCount 588 + +#define Hangul_LBase 0x1100 +#define Hangul_LFinal 0x1112 +#define Hangul_LCount 19 + +#define Hangul_VBase 0x1161 +#define Hangul_VFinal 0x1175 +#define Hangul_VCount 21 + +#define Hangul_TBase 0x11A7 +#define Hangul_TFinal 0x11C2 +#define Hangul_TCount 28 + +#define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) +#define Hangul_IsN(u) (! (((u) - Hangul_SBase) % Hangul_TCount)) +#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) +#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) +#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) +#define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) +/* HANGUL_H */ + +/* this is used for canonical ordering of combining characters (c.c.). */ +typedef struct { + U8 cc; /* combining class */ + UV uv; /* codepoint */ + STRLEN pos; /* position */ +} UNF_cc; + +int compare_cc(const void *a, const void *b) +{ + int ret_cc; + ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc; + if(ret_cc) return ret_cc; + return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos; +} + +U8* dec_canonical (UV uv) +{ + U8 ***plane, **row; + if(OVER_UTF_MAX(uv)) return NULL; + plane = (U8***)UNF_canon[uv >> 16]; + if(! plane) return NULL; + row = plane[(uv >> 8) & 0xff]; + return row ? row[uv & 0xff] : NULL; +} + +U8* dec_compat (UV uv) +{ + U8 ***plane, **row; + if(OVER_UTF_MAX(uv)) return NULL; + plane = (U8***)UNF_compat[uv >> 16]; + if(! plane) return NULL; + row = plane[(uv >> 8) & 0xff]; + return row ? row[uv & 0xff] : NULL; +} + +UV getComposite (UV uv, UV uv2) +{ + UNF_complist ***plane, **row, *cell, *i; + + if(! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) return 0; + + if(Hangul_IsL(uv) && Hangul_IsV(uv2)) { + uv -= Hangul_LBase; /* lindex */ + uv2 -= Hangul_VBase; /* vindex */ + return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount); + } + if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) { + uv2 -= Hangul_TBase; /* tindex */ + return (uv + uv2); + } + plane = UNF_compos[uv >> 16]; + if(! plane) return 0; + row = plane[(uv >> 8) & 0xff]; + if(! row) return 0; + cell = row[uv & 0xff]; + if(! cell) return 0; + for(i = cell; i->nextchar; i++) { + if(uv2 == i->nextchar) return i->composite; + } + return 0; +} + +U8 getCombinClass (UV uv) +{ + U8 **plane, *row; + if(OVER_UTF_MAX(uv)) return 0; + plane = (U8**)UNF_combin[uv >> 16]; + if(! plane) return 0; + row = plane[(uv >> 8) & 0xff]; + return row ? row[uv & 0xff] : 0; +} + +void sv_cat_decompHangul (SV* sv, UV uv) +{ + UV sindex, lindex, vindex, tindex; + U8 *t, temp[3 * UTF8_MAXLEN + 1]; + + if(! Hangul_IsS(uv)) return; + + sindex = uv - Hangul_SBase; + lindex = sindex / Hangul_NCount; + vindex = (sindex % Hangul_NCount) / Hangul_TCount; + tindex = sindex % Hangul_TCount; + + t = temp; + t = uvuni_to_utf8(t, (lindex + Hangul_LBase)); + t = uvuni_to_utf8(t, (vindex + Hangul_VBase)); + if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase)); + *t = '\0'; + sv_catpvn(sv, (char *)temp, strlen((char *)temp)); +} + +MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize + + +SV* +decompose(arg, compat) + SV * arg + SV * compat + PROTOTYPE: $ + PREINIT: + SV *src, *dst; + STRLEN srclen, dstlen, retlen; + U8 *s, *e, *p, *d, *r; + UV uv; + bool iscompat; + CODE: + if(SvUTF8(arg)) { + src = arg; + } else { + src = sv_mortalcopy(arg); + sv_utf8_upgrade(src); + } + + iscompat = SvTRUE(compat); + + dst = newSV(1); + (void)SvPOK_only(dst); + SvUTF8_on(dst); + + s = (U8*)SvPV(src,srclen); + e = s + srclen; + for(p = s; p < e;){ + uv = utf8n_to_uvchr(p, e - p, &retlen, 0); + p += retlen; + if(Hangul_IsS(uv)) sv_cat_decompHangul(dst, uv); + else { + r = iscompat ? dec_compat(uv) : dec_canonical(uv); + if(r) sv_catpv(dst, (char *)r); + else sv_catpvn(dst, (char *)p - retlen, retlen); + } + } + RETVAL = dst; + OUTPUT: + RETVAL + + + +SV* +reorder(arg) + SV * arg + PROTOTYPE: $ + PREINIT: + SV *src; + STRLEN srclen, retlen, stk_cc_max; + U8 *s, *e, *p, curCC; + UV uv; + UNF_cc * stk_cc; + CODE: + src = newSVsv(arg); + if(! SvUTF8(arg)) sv_utf8_upgrade(src); + + stk_cc_max = 10; /* enough as an initial value? */ + New(0, stk_cc, stk_cc_max, UNF_cc); + + s = (U8*)SvPV(src,srclen); + e = s + srclen; + for(p = s; p < e;){ + U8 *cc_in; + STRLEN cc_len, cc_iter, cc_pos; + + uv = utf8n_to_uvchr(p, e - p, &retlen, 0); + p += retlen; + cc_pos = 0; + curCC = getCombinClass(uv); + if(! (curCC && p < e)) continue; else cc_in = p - retlen; + + stk_cc[cc_pos].cc = curCC; + stk_cc[cc_pos].uv = uv; + stk_cc[cc_pos].pos = cc_pos; + + while(p < e) { + uv = utf8n_to_uvchr(p, e - p, &retlen, 0); + curCC = getCombinClass(uv); + if(!curCC) break; + p += retlen; + cc_pos++; + if(stk_cc_max <= cc_pos) { /* extend if need */ + stk_cc_max = cc_pos + 1; + Renew(stk_cc, stk_cc_max, UNF_cc); + } + stk_cc[cc_pos].cc = curCC; + stk_cc[cc_pos].uv = uv; + stk_cc[cc_pos].pos = cc_pos; + } + + /* only one c.c. in cc_len from cc_in, no need of reordering */ + if(!cc_pos) continue; + + qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc); + + cc_len = p - cc_in; + p = cc_in; + for(cc_iter = 0; cc_iter <= cc_pos; cc_iter++) { + p = uvuni_to_utf8(p, stk_cc[cc_iter].uv); + } + } + Safefree(stk_cc); + RETVAL = src; + OUTPUT: + RETVAL + + + +void +compose(arg) + SV * arg + PROTOTYPE: $ + PREINIT: + SV *src, *dst, *tmp; + U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC; + UV uv, uvS, uvComp; + STRLEN srclen, dstlen, tmplen, dstcur, retlen; + bool beginning = TRUE; + PPCODE: + if(SvUTF8(arg)) { + src = arg; + } else { + src = sv_mortalcopy(arg); + sv_utf8_upgrade(src); + } + s = (U8*)SvPV(src, srclen); + e = s + srclen; + dstlen = srclen + 1; /* equal or shorter, XXX */ + dst = sv_2mortal(newSV(dstlen)); + (void)SvPOK_only(dst); + SvUTF8_on(dst); + d = (U8*)SvPVX(dst); + + /* for uncomposed combining char */ + tmp = sv_2mortal(newSV(dstlen)); + (void)SvPOK_only(tmp); + SvUTF8_on(tmp); + + for(p = s; p < e;){ + if(beginning) { + uvS = utf8n_to_uvchr(p, e - p, &retlen, 0); + p += retlen; + + if (getCombinClass(uvS)){ /* no Starter found yet */ + d = uvuni_to_utf8(d, uvS); + continue; + } + beginning = FALSE; + } + + /* Starter */ + t = tmp_start = (U8*)SvPVX(tmp); + preCC = 0; + + /* to the next Starter */ + while(p < e) { + uv = utf8n_to_uvchr(p, e - p, &retlen, 0); + p += retlen; + curCC = getCombinClass(uv); + + if(preCC && preCC == curCC) { + preCC = curCC; + t = uvuni_to_utf8(t, uv); + } else { + uvComp = getComposite(uvS, uv); + + /* S + C + S => S-S + C would be also blocked. */ + if( uvComp && ! getExclusion(uvComp) && preCC <= curCC) + { + /* preCC not changed to curCC */ + uvS = uvComp; + } else if (! curCC && p < e) { /* blocked */ + break; + } else { + preCC = curCC; + t = uvuni_to_utf8(t, uv); + } + } + } + d = uvuni_to_utf8(d, uvS); /* composed char */ + if(tmplen = t - tmp_start) { /* uncomposed combining char */ + t = (U8*)SvPVX(tmp); + while(tmplen--) *d++ = *t++; + } + uvS = uv; + } /* for */ + dstcur = d - (U8*)SvPVX(dst); + SvCUR_set(dst, dstcur); + XPUSHs(dst); + + + +U8 +getCombinClass(uv) + UV uv + +bool +getExclusion(uv) + UV uv + +UV +getComposite(uv, uv2) + UV uv + UV uv2 + +SV* +getCanon(uv) + UV uv + PROTOTYPE: $ + ALIAS: + getCompat = 1 + PREINIT: + U8 * rstr; + CODE: + if(Hangul_IsS(uv)) { + SV * dst; + dst = newSV(1); + (void)SvPOK_only(dst); + sv_cat_decompHangul(dst, uv); + RETVAL = dst; + } else { + rstr = ix ? dec_compat(uv) : dec_canonical(uv); + if(!rstr) XSRETURN_UNDEF; + RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); + } + SvUTF8_on(RETVAL); + OUTPUT: + RETVAL + diff --git a/ext/Unicode/Normalize/README b/ext/Unicode/Normalize/README new file mode 100644 index 0000000000..3f0c4240fe --- /dev/null +++ b/ext/Unicode/Normalize/README @@ -0,0 +1,66 @@ +Unicode/Normalize version 0.10 +=================================== + +Unicode::Normalize - normalized forms of Unicode text + +SYNOPSIS + + use Unicode::Normalize; + + $string_NFD = NFD($string); # Normalization Form D + $string_NFC = NFC($string); # Normalization Form C + $string_NFKD = NFKD($string); # Normalization Form KD + $string_NFKC = NFKC($string); # Normalization Form KC + + or + + use Unicode::Normalize 'normalize'; + + $string_NFD = normalize('D', $string); # Normalization Form D + $string_NFC = normalize('C', $string); # Normalization Form C + $string_NFKD = normalize('KD', $string); # Normalization Form KD + $string_NFKC = normalize('KC', $string); # Normalization Form KC + +INSTALLATION + +Perl 5.006 or later + +To install this module type the following: + + perl Makefile.PL + make + make test + make install + +If you have a C compiler and want to use the XS version, +type the following: + + perl Makefile.PL xs + make + make test + make install + +DEPENDENCIES + +This module requires these other modules and libraries: + +Carp +Exporter +File::Copy +File::Spec +Lingua::KO::Hangul::Util 0.06 +unicore/CombiningClass.pl or unicode/CombiningClass.pl +unicore/Decomposition.pl or unicode/Decomposition.pl +unicore/CompExcl.txt or unicode/CompExcl.txt + +COPYRIGHT AND LICENCE + +SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt> + + http://homepage1.nifty.com/nomenclator/perl/ + + Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved. + + This program is free software; you can redistribute it and/or + modify it under the same terms as Perl itself. + diff --git a/ext/Unicode/Normalize/mkheader b/ext/Unicode/Normalize/mkheader new file mode 100644 index 0000000000..85d2b90e62 --- /dev/null +++ b/ext/Unicode/Normalize/mkheader @@ -0,0 +1,284 @@ +#!perl +# +# This script generates "unfcan.h", "unfcpt.h", "unfcmb.h", +# "unfcmp.h", and "unfexc.h" +# from CombiningClass.pl, Decomposition.pl, CompExcl.txt +# in lib/unicore or unicode directory +# for Unicode::Normalize.xs. (cf. Makefile.PL) +# +use 5.006; +use strict; +use warnings; +use Carp; + +our $PACKAGE = 'Unicode::Normalize, mkheader'; + +our $Combin = do "unicore/CombiningClass.pl" + || do "unicode/CombiningClass.pl" + || croak "$PACKAGE: CombiningClass.pl not found"; + +our $Decomp = do "unicore/Decomposition.pl" + || do "unicode/Decomposition.pl" + || croak "$PACKAGE: Decomposition.pl not found"; + +our %Combin; # $codepoint => $number : combination class +our %Canon; # $codepoint => $hexstring : canonical decomp. +our %Compat; # $codepoint => $hexstring : compat. decomp. +our %Compos; # $string => $codepoint : composite + +our %Exclus; # $codepoint => 1 : composition exclusions + +{ + my($f, $fh); + foreach my $d (@INC) { + use File::Spec; + $f = File::Spec->catfile($d, "unicore", "CompExcl.txt"); + last if open($fh, $f); + $f = File::Spec->catfile($d, "unicode", "CompExcl.txt"); + last if open($fh, $f); + $f = undef; + } + croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f; + while(<$fh>) { + next if /^#/ or /^$/; + s/#.*//; + $Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/; + } + close $fh; +} + +while($Combin =~ /(.+)/g) { + my @tab = split /\t/, $1; + my $ini = hex $tab[0]; + if($tab[1] eq '') { + $Combin{ $ini } = $tab[2]; + } else { + $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]); + } +} + +while($Decomp =~ /(.+)/g) { + my @tab = split /\t/, $1; + my $compat = $tab[2] =~ s/<[^>]+>//; + my $dec = [ _getHexArray($tab[2]) ]; # decomposition + my $com = pack('U*', @$dec); # composable sequence + my $ini = hex($tab[0]); + if($tab[1] eq '') { + $Compat{ $ini } = $dec; + if(! $compat) { + $Canon{ $ini } = $dec; + $Compos{ $com } = $ini if @$dec > 1; + } + } else { + foreach my $u ($ini .. hex($tab[1])){ + $Compat{ $u } = $dec; + if(! $compat){ + $Canon{ $u } = $dec; + $Compos{ $com } = $ini if @$dec > 1; + } + } + } +} + +# exhaustive decomposition +foreach my $key (keys %Canon) { + $Canon{$key} = [ getCanonList($key) ]; +} + +# exhaustive decomposition +foreach my $key (keys %Compat) { + $Compat{$key} = [ getCompatList($key) ]; +} + +sub getCanonList { + my @src = @_; + my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src; + join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); + # condition @src == @dec is not ok. +} + +sub getCompatList { + my @src = @_; + my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src; + join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); + # condition @src == @dec is not ok. +} + +sub _getHexArray { + my $str = shift; + map hex(), $str =~ /([0-9A-Fa-f]+)/g; +} + +sub _U_stringify { + sprintf '"%s"', join '', + map sprintf("\\x%2x", $_), unpack 'C*', pack 'U*', @_; +} + +foreach my $hash (\%Canon, \%Compat) { + foreach my $key (keys %$hash) { + $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); + } +} + +sub utf8len { + my $uv = shift; + return $uv < 0x80 ? 1 : + $uv < 0x800 ? 2 : + $uv < 0x10000 ? 3 : + $uv < 0x110000 ? 4 : + croak "$PACKAGE: illegal char in the composite. utf-8 max is 0x10ffff."; +} + +my $prefix = "UNF_"; + +my $structname = "${prefix}complist"; + +our (%Comp1st, %CompList); + +foreach(sort keys %Compos) { + my @a = unpack('U*', $_); + my $val = $Compos{$_}; + my $name = sprintf "${structname}_%06x", $a[0]; + $Comp1st{ $a[0] } = $name; + $CompList{ $name }{ $a[1] } = $val; + + if( utf8len($a[0]) + utf8len($a[1]) < utf8len($val) ) { + croak "$PACKAGE: " + . "composable pair is longer than the composite in bytes!\n" + . sprintf("%d + %d => %d", $a[0], $a[1], $val); + } +} + +my $compinit = + "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; + +foreach my $i (sort keys %CompList) { + $compinit .= "$structname $i [] = {\n"; + $compinit .= join ",\n", + map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), + sort {$a <=> $b } keys %{ $CompList{$i} }; + $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel +} + +#################################### + +my @Exclus = sort {$a <=> $b} keys %Exclus; + +my $file = "unfexc.h"; +open FH, ">$file" or croak "$PACKAGE: $file can't be made"; +binmode FH; select FH; + +print "bool getExclusion (UV uv) \n{\nreturn\n\t"; + +while(@Exclus) { + my $cur = shift @Exclus; + if(@Exclus && $cur + 1 == $Exclus[0]) { + print "$cur <= uv && uv <= "; + while(@Exclus && $cur + 1 == $Exclus[0]) { + $cur = shift @Exclus; + } + print $cur; + print "\n\t|| " if @Exclus; + } else { + print "uv == $cur"; + print "\n\t|| " if @Exclus; + } +} + +print "\n\t? TRUE : FALSE;\n}\n\n"; +close FH; + +#################################### + +my @tripletable = ( + { + file => "unfcmb", + name => "combin", + type => "char", + hash => \%Combin, + null => 0, + }, + { + file => "unfcan", + name => "canon", + type => "char*", + hash => \%Canon, + null => "NULL", + }, + { + file => "unfcpt", + name => "compat", + type => "char*", + hash => \%Compat, + null => "NULL", + }, + { + file => "unfcmp", + name => "compos", + type => "$structname *", + hash => \%Comp1st, + null => "NULL", + init => $compinit, + }, +); + +foreach my $tbl (@tripletable) { + my $file = "$tbl->{file}.h"; + my $head = "${prefix}$tbl->{name}"; + my $type = $tbl->{type}; + my $hash = $tbl->{hash}; + my $null = $tbl->{null}; + my $init = $tbl->{init}; + + open FH, ">$file" or croak "$PACKAGE: $file can't be made"; + binmode FH; select FH; + my %val; + + print FH << 'EOF'; +/* + * This file is auto-generated by mkheader. + * Any changes here will be lost! + */ +EOF + + print $init if defined $init; + + foreach my $uv (keys %$hash) { + my @c = unpack 'CCCC', pack 'N', $uv; + $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; + } + + foreach my $p (sort { $a <=> $b } keys %val) { + next if ! $val{ $p }; + for(my $r = 0; $r < 256; $r++){ + next if ! $val{ $p }{ $r }; + printf "$type ${head}_%02x_%02x [256] = {\n", $p, $r; + for(my $c = 0; $c < 256; $c++){ + print "\t", defined $val{$p}{$r}{$c} ? $val{$p}{$r}{$c} : $null; + print ',' if $c != 255; + print "\n" if $c % 8 == 7; + } + print "};\n\n"; + } + } + foreach my $p (sort { $a <=> $b } keys %val) { + next if ! $val{ $p }; + printf "$type* ${head}_%02x [256] = {\n", $p; + for(my $r = 0; $r < 256; $r++){ + print $val{ $p }{ $r } ? sprintf("${head}_%02x_%02x", $p, $r) : "NULL"; + print ',' if $r != 255; + print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; + } + print "};\n\n"; + } + print "$type** $head [] = {\n"; + for(my $p = 0; $p <= 0x10; $p++){ + print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; + print ',' if $p != 0x10; + print "\n"; + } + print "};\n\n"; + close FH; +} + +__END__ diff --git a/ext/Unicode/Normalize/t/func.t b/ext/Unicode/Normalize/t/func.t new file mode 100644 index 0000000000..8907634c47 --- /dev/null +++ b/ext/Unicode/Normalize/t/func.t @@ -0,0 +1,69 @@ +# Before `make install' is performed this script should be runnable with +# `make test'. After `make install' it should work as `perl test.pl' + +######################### + +use Test; +use strict; +use warnings; +BEGIN { plan tests => 6 }; +use Unicode::Normalize qw(:all); +ok(1); # If we made it this far, we're ok. + +######################### + +print getCombinClass( 0) == 0 + && getCombinClass( 768) == 230 + && getCombinClass(1809) == 36 +# && getCombinClass(119143) == 1 + ? "ok" : "not ok", " 2\n"; + +print ! defined getCanon( 0) + && ! defined getCanon(41) + && getCanon(0x00C0) eq pack('U*', 0x0041, 0x0300) + && getCanon(0x00EF) eq pack('U*', 0x0069, 0x0308) + && getCanon(0x304C) eq pack('U*', 0x304B, 0x3099) + && getCanon(0x1EA4) eq pack('U*', 0x0041, 0x0302, 0x0301) + && getCanon(0x1FAF) eq pack('U*', 0x03A9, 0x0314, 0x0342, 0x0345) + && getCanon(0xAC00) eq pack('U*', 0x1100, 0x1161) + && getCanon(0xAE00) eq pack('U*', 0x1100, 0x1173, 0x11AF) + && ! defined getCanon(0x212C) + && ! defined getCanon(0x3243) + && getCanon(0xFA2D) eq pack('U*', 0x9DB4) + ? "ok" : "not ok", " 3\n"; + +print ! defined getCompat( 0) + && ! defined getCompat(41) + && getCompat(0x00C0) eq pack('U*', 0x0041, 0x0300) + && getCompat(0x00EF) eq pack('U*', 0x0069, 0x0308) + && getCompat(0x304C) eq pack('U*', 0x304B, 0x3099) + && getCompat(0x1EA4) eq pack('U*', 0x0041, 0x0302, 0x0301) + && getCompat(0x1FAF) eq pack('U*', 0x03A9, 0x0314, 0x0342, 0x0345) + && getCompat(0x212C) eq pack('U*', 0x0042) + && getCompat(0x3243) eq pack('U*', 0x0028, 0x81F3, 0x0029) + && getCompat(0xAC00) eq pack('U*', 0x1100, 0x1161) + && getCompat(0xAE00) eq pack('U*', 0x1100, 0x1173, 0x11AF) + && getCompat(0xFA2D) eq pack('U*', 0x9DB4) + ? "ok" : "not ok", " 4\n"; + +print ! getComposite( 0, 0) + && ! getComposite( 0, 41) + && ! getComposite(41, 0) + && ! getComposite(41, 41) + && ! getComposite(12, 0x0300) + && ! getComposite(0x0055, 0xFF00) + && 0x00D9 == getComposite(0x0055, 0x0300) + && 0x1E14 == getComposite(0x0112, 0x0300) + && 0xAC00 == getComposite(0x1100, 0x1161) + && 0xADF8 == getComposite(0x1100, 0x1173) + && ! getComposite(0x1100, 0x11AF) + && ! getComposite(0x1173, 0x11AF) + && 0xAE00 == getComposite(0xADF8, 0x11AF) + ? "ok" : "not ok", " 5\n"; + +print ! getExclusion( 0) + && ! getExclusion(41) + && getExclusion(2392) + && getExclusion(3907) + && getExclusion(64334) + ? "ok" : "not ok", " 6\n"; diff --git a/ext/Unicode/Normalize/t/norm.t b/ext/Unicode/Normalize/t/norm.t new file mode 100644 index 0000000000..1de2e7fcb8 --- /dev/null +++ b/ext/Unicode/Normalize/t/norm.t @@ -0,0 +1,43 @@ +# Before `make install' is performed this script should be runnable with +# `make test'. After `make install' it should work as `perl test.pl' + +######################### + +use Test; +use strict; +use warnings; +BEGIN { plan tests => 18 }; +use Unicode::Normalize qw(normalize); +ok(1); # If we made it this far, we're ok. + +######################### + +ok(normalize('C', ""), ""); +ok(normalize('D', ""), ""); + +sub hexNFC { + join " ", map sprintf("%04X", $_), + unpack 'U*', normalize 'C', pack 'U*', map hex(), split ' ', shift; +} +sub hexNFD { + join " ", map sprintf("%04X", $_), + unpack 'U*', normalize 'D', pack 'U*', map hex(), split ' ', shift; +} + +ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062"); +ok(hexNFC("00E0 05AE 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); +ok(hexNFC("0061 05AE 0300 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); +ok(hexNFC("0045 0304 0300 AC00 11A8"), "1E14 AC01"); +ok(hexNFC("1100 1161 1100 1173 11AF"), "AC00 AE00"); +ok(hexNFC("1100 0300 1161 1173 11AF"), "1100 0300 1161 1173 11AF"); + +ok(hexNFD("0061 0315 0300 05AE 05C4 0062"), "0061 05AE 0300 05C4 0315 0062"); +ok(hexNFD("00E0 05AE 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062"); +ok(hexNFD("0061 05AE 0300 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062"); +ok(hexNFC("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFC("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFD("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000"); +ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000"); + diff --git a/ext/Unicode/Normalize/t/test.t b/ext/Unicode/Normalize/t/test.t new file mode 100644 index 0000000000..5544a3b13b --- /dev/null +++ b/ext/Unicode/Normalize/t/test.t @@ -0,0 +1,43 @@ +# Before `make install' is performed this script should be runnable with +# `make test'. After `make install' it should work as `perl test.pl' + +######################### + +use Test; +use strict; +use warnings; +BEGIN { plan tests => 18 }; +use Unicode::Normalize; +ok(1); # If we made it this far, we're ok. + +######################### + +ok(NFC(""), ""); +ok(NFD(""), ""); + +sub hexNFC { + join " ", map sprintf("%04X", $_), + unpack 'U*', NFC pack 'U*', map hex(), split ' ', shift; +} +sub hexNFD { + join " ", map sprintf("%04X", $_), + unpack 'U*', NFD pack 'U*', map hex(), split ' ', shift; +} + +ok(hexNFC("0061 0315 0300 05AE 05C4 0062"), "00E0 05AE 05C4 0315 0062"); +ok(hexNFC("00E0 05AE 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); +ok(hexNFC("0061 05AE 0300 05C4 0315 0062"), "00E0 05AE 05C4 0315 0062"); +ok(hexNFC("0045 0304 0300 AC00 11A8"), "1E14 AC01"); +ok(hexNFC("1100 1161 1100 1173 11AF"), "AC00 AE00"); +ok(hexNFC("1100 0300 1161 1173 11AF"), "1100 0300 1161 1173 11AF"); + +ok(hexNFD("0061 0315 0300 05AE 05C4 0062"), "0061 05AE 0300 05C4 0315 0062"); +ok(hexNFD("00E0 05AE 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062"); +ok(hexNFD("0061 05AE 0300 05C4 0315 0062"), "0061 05AE 0300 05C4 0315 0062"); +ok(hexNFC("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFC("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFD("0061 05C4 0315 0300 05AE 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFD("0061 05AE 05C4 0300 0315 0062"), "0061 05AE 05C4 0300 0315 0062"); +ok(hexNFC("0000 0041 0000 0000"), "0000 0041 0000 0000"); +ok(hexNFD("0000 0041 0000 0000"), "0000 0041 0000 0000"); + |