summary | refs | log | tree | commit | diff
path: root/cpan
diff options
context:
space:
mode:
author: Chris 'BinGOs' Williams <chris@bingosnet.co.uk> 2011-01-16 19:13:28 +0000
committer: Chris 'BinGOs' Williams <chris@bingosnet.co.uk> 2011-01-16 19:14:52 +0000
commit: 788ba0f4016360d17406fdc02d8996b2757aeb6f (patch)
tree: 46359b3cee66a8b61b0036a2069f8b52aa3e2426 /cpan
parent: 4d220a7d395d394ec773c0d3eda665f350b3eab0 (diff)
download: perl-788ba0f4016360d17406fdc02d8996b2757aeb6f.tar.gz
Update Unicode-Normalize to CPAN version 1.10
[DELTA]
1.10 Sun Jan 16 21:00:34 2011
 - XSUB: reorder() and compose() treat with growing the string.
 - XSUB: provision against UTF8_ALLOW_* flags to be undefined in future.
 - doc: about perl 5.13.x and Unicode 6.0.0
 - doc and comments: [perl #81876] Fix typos by Peter J. Acklam.
Diffstat (limited to 'cpan')
-rw-r--r-- cpan/Unicode-Normalize/Changes 8
-rw-r--r-- cpan/Unicode-Normalize/Normalize.pm 7
-rw-r--r-- cpan/Unicode-Normalize/Normalize.xs 184
-rw-r--r-- cpan/Unicode-Normalize/README 4
-rw-r--r-- cpan/Unicode-Normalize/mkheader 31
5 files changed, 107 insertions, 127 deletions
diff --git a/cpan/Unicode-Normalize/Changes b/cpan/Unicode-Normalize/Changes
index 893a604214..f872619dbc 100644
--- a/cpan/Unicode-Normalize/Changes
+++ b/cpan/Unicode-Normalize/Changes
@@ -1,5 +1,11 @@
Revision history for Perl extension Unicode::Normalize.
+1.10 Sun Jan 16 21:00:34 2011
+ - XSUB: reorder() and compose() treat with growing the string.
+ - XSUB: provision against UTF8_ALLOW_* flags to be undefined in future.
+ - doc: about perl 5.13.x and Unicode 6.0.0
+ - doc and comments: [perl #81876] Fix typos by Peter J. Acklam.
+
1.07 Mon Sep 20 20:20:02 2010
- doc: about perl 5.12.x and Unicode 5.2.0
- test: prototype of normalize_partial() and cousins in proto.t.
@@ -60,7 +66,7 @@ Revision history for Perl extension Unicode::Normalize.
- XSUB: even if string contains a malformed, "short" Unicode character,
decompose() and reorder() will be safe. Garbage will be no longer added.
- added null.t and short.t.
- - now truely added illegal.t (in 0.27, forgot to change MANIFEST).
+ - now truly added illegal.t (in 0.27, forgot to change MANIFEST).
0.27 Sun Nov 16 13:16:21 2003
- Illegal code points (surrogate and noncharacter) will be allowed
diff --git a/cpan/Unicode-Normalize/Normalize.pm b/cpan/Unicode-Normalize/Normalize.pm
index ab895aa36a..3c21e9dffd 100644
--- a/cpan/Unicode-Normalize/Normalize.pm
+++ b/cpan/Unicode-Normalize/Normalize.pm
@@ -13,7 +13,7 @@ use Carp;
no warnings 'utf8';
-our $VERSION = '1.08';
+our $VERSION = '1.10';
our $PACKAGE = __PACKAGE__;
our @EXPORT = qw( NFC NFD NFKC NFKD );
@@ -34,7 +34,7 @@ our %EXPORT_TAGS = (
);
##
-## utilites for tests
+## utilities for tests
##
sub pack_U {
@@ -549,6 +549,7 @@ normalization implemented by this module depends on your perl's version.
5.10.0 5.0.0
5.8.9, 5.10.1 5.1.0
5.12.0-5.12.2 5.2.0
+ (5.13.7-5.13.8) 6.0.0
=item Correction of decomposition mapping
@@ -576,7 +577,7 @@ lower than 4.1.0.
SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
-Copyright(C) 2001-2010, SADAHIRO Tomoyuki. Japan. All rights reserved.
+Copyright(C) 2001-2011, SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.
diff --git a/cpan/Unicode-Normalize/Normalize.xs b/cpan/Unicode-Normalize/Normalize.xs
index 5f5357cbd8..b760dff0b3 100644
--- a/cpan/Unicode-Normalize/Normalize.xs
+++ b/cpan/Unicode-Normalize/Normalize.xs
@@ -17,32 +17,43 @@
/* Perl 5.6.1 ? */
#ifndef utf8n_to_uvuni
-#define utf8n_to_uvuni utf8_to_uv
+#define utf8n_to_uvuni utf8_to_uv
#endif /* utf8n_to_uvuni */
/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
-#ifdef UTF8_ALLOW_BOM
-#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
-#else
-#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
-#endif
+#ifndef UTF8_ALLOW_BOM
+#define UTF8_ALLOW_BOM (0)
+#endif /* UTF8_ALLOW_BOM */
+
+#ifndef UTF8_ALLOW_SURROGATE
+#define UTF8_ALLOW_SURROGATE (0)
+#endif /* UTF8_ALLOW_SURROGATE */
+
+#ifndef UTF8_ALLOW_FE_FF
+#define UTF8_ALLOW_FE_FF (0)
+#endif /* UTF8_ALLOW_FE_FF */
+
+#ifndef UTF8_ALLOW_FFFF
+#define UTF8_ALLOW_FFFF (0)
+#endif /* UTF8_ALLOW_FFFF */
+
+#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
+
+/* check if the string buffer is enough before uvuni_to_utf8(). */
+/* dstart, d, and dlen should be defined outside before. */
+#define Renew_d_if_not_enough_to(need) STRLEN curlen = d - dstart; \
+ if (dlen < curlen + (need)) { \
+ dlen += (need); \
+ Renew(dstart, dlen+1, U8); \
+ d = dstart + curlen; \
+ }
-/* if utf8n_to_uvuni() sets retlen to 0 (?) */
+/* if utf8n_to_uvuni() sets retlen to 0 (if broken?) */
#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
/* utf8_hop() hops back before start. Maybe broken UTF-8 */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
-/* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC.
- If Unicode would add a new composition of A + B to C
- where bytes::length(A) + bytes::length(B) < bytes::length(C),
- this code should be fixed.
- In this case, mkheader will prevent Unicode::Normalize from building. */
-#define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source"
-
-/* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */
-#define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough"
-
/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
@@ -216,13 +227,7 @@ U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
p += retlen;
if (Hangul_IsS(uv)) {
- STRLEN cur = d - dstart;
-
- if (dlen < cur + UTF8_MAXLEN * 3) {
- dlen += UTF8_MAXLEN * 3;
- Renew(dstart, dlen+1, U8);
- d = dstart + cur;
- }
+ Renew_d_if_not_enough_to(UTF8_MAXLEN * 3)
d = pv_cat_decompHangul(d, uv);
}
else {
@@ -230,23 +235,12 @@ U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
if (r) {
STRLEN len = (STRLEN)strlen((char *)r);
- STRLEN cur = d - dstart;
- if (dlen < cur + len) {
- dlen += len;
- Renew(dstart, dlen+1, U8);
- d = dstart + cur;
- }
+ Renew_d_if_not_enough_to(len)
while (len--)
*d++ = *r++;
}
else {
- STRLEN cur = d - dstart;
-
- if (dlen < cur + UTF8_MAXLEN) {
- dlen += UTF8_MAXLEN;
- Renew(dstart, dlen+1, U8);
- d = dstart + cur;
- }
+ Renew_d_if_not_enough_to(UTF8_MAXLEN)
d = uvuni_to_utf8(d, uv);
}
}
@@ -256,11 +250,12 @@ U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
}
static
-U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen)
+U8* pv_utf8_reorder(U8* s, STRLEN slen, U8** dp, STRLEN dlen)
{
U8* p = s;
U8* e = s + slen;
- U8* dend = d + dlen;
+ U8* dstart = *dp;
+ U8* d = dstart;
UNF_cc seq_ary[CC_SEQ_SIZE];
UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
@@ -268,10 +263,6 @@ U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen)
STRLEN seq_max = CC_SEQ_SIZE;
STRLEN cc_pos = 0;
- if (dlen < slen || dlen < slen + UTF8_MAXLEN)
- croak(ErrTargetNotEnough, "reorder");
- dend -= UTF8_MAXLEN; /* safety */
-
while (p < e) {
U8 curCC;
STRLEN retlen;
@@ -306,6 +297,7 @@ U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen)
continue;
}
+ /* output */
if (cc_pos) {
STRLEN i;
@@ -313,30 +305,30 @@ U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen)
qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
for (i = 0; i < cc_pos; i++) {
+ Renew_d_if_not_enough_to(UTF8_MAXLEN)
d = uvuni_to_utf8(d, seq_ptr[i].uv);
- if (dend < d) /* real end is dend + UTF8_MAXLEN */
- croak(ErrLongerThanSrc, "reorder");
}
cc_pos = 0;
}
if (curCC == 0) {
+ Renew_d_if_not_enough_to(UTF8_MAXLEN)
d = uvuni_to_utf8(d, uv);
- if (dend < d) /* real end is dend + UTF8_MAXLEN */
- croak(ErrLongerThanSrc, "reorder");
}
}
if (seq_ext)
Safefree(seq_ext);
+ *dp = dstart;
return d;
}
static
-U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig)
+U8* pv_utf8_compose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig)
{
U8* p = s;
U8* e = s + slen;
- U8* dend = d + dlen;
+ U8* dstart = *dp;
+ U8* d = dstart;
UV uvS = 0; /* code point of the starter */
bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
@@ -348,10 +340,6 @@ U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig)
STRLEN seq_max = CC_SEQ_SIZE;
STRLEN cc_pos = 0;
- if (dlen < slen || dlen < slen + UTF8_MAXLEN)
- croak(ErrTargetNotEnough, "compose");
- dend -= UTF8_MAXLEN; /* safety */
-
while (p < e) {
U8 curCC;
STRLEN retlen;
@@ -370,9 +358,8 @@ U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig)
continue;
}
else {
+ Renew_d_if_not_enough_to(UTF8_MAXLEN)
d = uvuni_to_utf8(d, uv);
- if (dend < d) /* real end is dend + UTF8_MAXLEN */
- croak(ErrLongerThanSrc, "compose");
continue;
}
}
@@ -382,7 +369,7 @@ U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig)
/* blocked */
if ((iscontig && cc_pos) || /* discontiguous combination */
(curCC != 0 && preCC == curCC) || /* blocked by same CC */
- preCC > curCC) /* blocked by higher CC: revised D2 */
+ (preCC > curCC)) /* blocked by higher CC: revised D2 */
composed = FALSE;
/* not blocked:
@@ -428,17 +415,18 @@ U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig)
}
}
- d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
- if (dend < d) /* real end is dend + UTF8_MAXLEN */
- croak(ErrLongerThanSrc, "compose");
+ /* output */
+ {
+ Renew_d_if_not_enough_to(UTF8_MAXLEN)
+ d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
+ }
if (cc_pos) {
STRLEN i;
for (i = 0; i < cc_pos; i++) {
+ Renew_d_if_not_enough_to(UTF8_MAXLEN)
d = uvuni_to_utf8(d, seq_ptr[i]);
- if (dend < d) /* real end is dend + UTF8_MAXLEN */
- croak(ErrLongerThanSrc, "compose");
}
cc_pos = 0;
}
@@ -447,6 +435,7 @@ U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig)
}
if (seq_ext)
Safefree(seq_ext);
+ *dp = dstart;
return d;
}
@@ -474,6 +463,7 @@ decompose(src, compat = &PL_sv_no)
OUTPUT:
RETVAL
+
SV*
reorder(src)
SV * src
@@ -485,16 +475,17 @@ reorder(src)
CODE:
s = (U8*)sv_2pvunicode(src,&slen);
dst = newSVpvn("", 0);
- dlen = slen + UTF8_MAXLEN;
- d = (U8*)SvGROW(dst,dlen+1);
+ dlen = slen;
+ New(0, d, dlen+1, U8);
+ dend = pv_utf8_reorder(s, slen, &d, dlen);
+ sv_setpvn(dst, (char *)d, dend - d);
SvUTF8_on(dst);
- dend = pv_utf8_reorder(s, slen, d, dlen);
- *dend = '\0';
- SvCUR_set(dst, dend - d);
+ Safefree(d);
RETVAL = dst;
OUTPUT:
RETVAL
+
SV*
compose(src)
SV * src
@@ -508,16 +499,17 @@ compose(src)
CODE:
s = (U8*)sv_2pvunicode(src,&slen);
dst = newSVpvn("", 0);
- dlen = slen + UTF8_MAXLEN;
- d = (U8*)SvGROW(dst,dlen+1);
+ dlen = slen;
+ New(0, d, dlen+1, U8);
+ dend = pv_utf8_compose(s, slen, &d, dlen, (bool)ix);
+ sv_setpvn(dst, (char *)d, dend - d);
SvUTF8_on(dst);
- dend = pv_utf8_compose(s, slen, d, dlen, (bool)ix);
- *dend = '\0';
- SvCUR_set(dst, dend - d);
+ Safefree(d);
RETVAL = dst;
OUTPUT:
RETVAL
+
SV*
NFD(src)
SV * src
@@ -529,29 +521,34 @@ NFD(src)
U8 *s, *t, *tend, *d, *dend;
STRLEN slen, tlen, dlen;
CODE:
- /* decompose */
s = (U8*)sv_2pvunicode(src,&slen);
+
+ /* decompose */
tlen = slen;
New(0, t, tlen+1, U8);
- tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)ix);
+ tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix==1));
*tend = '\0';
- tlen = tend - t; /* no longer know real tlen */
+ tlen = tend - t; /* no longer know real size of t */
/* reorder */
- dst = newSVpvn("", 0);
- dlen = tlen + UTF8_MAXLEN;
- d = (U8*)SvGROW(dst,dlen+1);
- SvUTF8_on(dst);
- dend = pv_utf8_reorder(t, tlen, d, dlen);
+ dlen = tlen;
+ New(0, d, dlen+1, U8);
+ dend = pv_utf8_reorder(t, tlen, &d, dlen);
*dend = '\0';
- SvCUR_set(dst, dend - d);
+ dlen = dend - d; /* no longer know real size of d */
/* return */
+ dst = newSVpvn("", 0);
+ sv_setpvn(dst, (char *)d, dlen);
+ SvUTF8_on(dst);
+
Safefree(t);
+ Safefree(d);
RETVAL = dst;
OUTPUT:
RETVAL
+
SV*
NFC(src)
SV * src
@@ -564,37 +561,42 @@ NFC(src)
U8 *s, *t, *tend, *u, *uend, *d, *dend;
STRLEN slen, tlen, ulen, dlen;
CODE:
- /* decompose */
s = (U8*)sv_2pvunicode(src,&slen);
+
+ /* decompose */
tlen = slen;
New(0, t, tlen+1, U8);
tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix==1));
*tend = '\0';
- tlen = tend - t; /* no longer know real tlen */
+ tlen = tend - t; /* no longer know real size of t */
/* reorder */
- ulen = tlen + UTF8_MAXLEN;
+ ulen = tlen;
New(0, u, ulen+1, U8);
- uend = pv_utf8_reorder(t, tlen, u, ulen);
+ uend = pv_utf8_reorder(t, tlen, &u, ulen);
*uend = '\0';
- ulen = uend - u;
+ ulen = uend - u; /* no longer know real size of u */
/* compose */
- dst = newSVpvn("", 0);
- dlen = ulen + UTF8_MAXLEN;
- d = (U8*)SvGROW(dst,dlen+1);
- SvUTF8_on(dst);
- dend = pv_utf8_compose(u, ulen, d, dlen, (bool)(ix==2));
+ dlen = ulen;
+ New(0, d, dlen+1, U8);
+ dend = pv_utf8_compose(u, ulen, &d, dlen, (bool)(ix==2));
*dend = '\0';
- SvCUR_set(dst, dend - d);
+ dlen = dend - d; /* no longer know real size of d */
/* return */
+ dst = newSVpvn("", 0);
+ sv_setpvn(dst, (char *)d, dlen);
+ SvUTF8_on(dst);
+
Safefree(t);
Safefree(u);
+ Safefree(d);
RETVAL = dst;
OUTPUT:
RETVAL
+
SV*
checkNFD(src)
SV * src
diff --git a/cpan/Unicode-Normalize/README b/cpan/Unicode-Normalize/README
index a18f7cd198..8cddf34869 100644
--- a/cpan/Unicode-Normalize/README
+++ b/cpan/Unicode-Normalize/README
@@ -1,4 +1,4 @@
-Unicode/Normalize version 1.07
+Unicode/Normalize version 1.10
===================================
Unicode::Normalize - Unicode Normalization Forms
@@ -83,7 +83,7 @@ COPYRIGHT AND LICENSE
SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
-Copyright(C) 2001-2010, SADAHIRO Tomoyuki. Japan. All rights reserved.
+Copyright(C) 2001-2011, SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
diff --git a/cpan/Unicode-Normalize/mkheader b/cpan/Unicode-Normalize/mkheader
index b6d153c552..acc7eaff77 100644
--- a/cpan/Unicode-Normalize/mkheader
+++ b/cpan/Unicode-Normalize/mkheader
@@ -90,7 +90,7 @@ sub decomposeHangul {
return wantarray ? @ret : pack_U(@ret);
}
-########## getting full decomposion ##########
+########## getting full decomposition ##########
{
my($f, $fh);
foreach my $d (@INC) {
@@ -194,28 +194,6 @@ foreach my $key (keys %Compat) {
##### The above part is common to mkheader and PP #####
-sub utf8len {
- my $uv = shift;
- return $uv < 0x80 ? 1 :
- $uv < 0x800 ? 2 :
- $uv < 0x10000 ? 3 :
- $uv < 0x110000 ? 4 :
- croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff.";
-}
-
-sub utfelen {
- my $uv = shift;
- return $uv < 0xA0 ? 1 :
- $uv < 0x400 ? 2 :
- $uv < 0x4000 ? 3 :
- $uv < 0x40000 ? 4 :
- $uv < 0x110000 ? 5 :
- croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff.";
-}
-
-my $errExpand = "$PACKAGE: Composition to U+%04X (from U+%04X and U+%04X) " .
- "needs growing the string in %s! Quit. Please inform the author...";
-
foreach my $comp1st (keys %Compos) {
my $listname = sprintf("${structname}_%06x", $comp1st);
# %04x is bad since it'd place _3046 after _1d157.
@@ -225,13 +203,6 @@ foreach my $comp1st (keys %Compos) {
foreach my $comp2nd (keys %$rh1st) {
my $uc = $rh1st->{$comp2nd};
$CompList{$listname}{$comp2nd} = $uc;
-
- if (utf8len($comp1st) + utf8len($comp2nd) < utf8len($uc)) {
- croak sprintf $errExpand, $uc, $comp1st, $comp2nd, "utf-8";
- }
- if (utfelen($comp1st) + utfelen($comp2nd) < utfelen($uc)) {
- croak sprintf $errExpand, $uc, $comp1st, $comp2nd, "utf-ebcdic";
- }
}
}