diff options
author | Chris 'BinGOs' Williams <chris@bingosnet.co.uk> | 2011-01-16 19:13:28 +0000 |
---|---|---|
committer | Chris 'BinGOs' Williams <chris@bingosnet.co.uk> | 2011-01-16 19:14:52 +0000 |
commit | 788ba0f4016360d17406fdc02d8996b2757aeb6f (patch) | |
tree | 46359b3cee66a8b61b0036a2069f8b52aa3e2426 /cpan | |
parent | 4d220a7d395d394ec773c0d3eda665f350b3eab0 (diff) | |
download | perl-788ba0f4016360d17406fdc02d8996b2757aeb6f.tar.gz |
Update Unicode-Normalize to CPAN version 1.10
[DELTA]
1.10 Sun Jan 16 21:00:34 2011
- XSUB: reorder() and compose() now handle growth of the string.
- XSUB: guard against UTF8_ALLOW_* flags being undefined in the future.
- doc: about perl 5.13.x and Unicode 6.0.0
- doc and comments: [perl #81876] Fix typos by Peter J. Acklam.
Diffstat (limited to 'cpan')
-rw-r--r-- | cpan/Unicode-Normalize/Changes | 8 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/Normalize.pm | 7 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/Normalize.xs | 184 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/README | 4 | ||||
-rw-r--r-- | cpan/Unicode-Normalize/mkheader | 31 |
5 files changed, 107 insertions, 127 deletions
diff --git a/cpan/Unicode-Normalize/Changes b/cpan/Unicode-Normalize/Changes index 893a604214..f872619dbc 100644 --- a/cpan/Unicode-Normalize/Changes +++ b/cpan/Unicode-Normalize/Changes @@ -1,5 +1,11 @@ Revision history for Perl extension Unicode::Normalize. +1.10 Sun Jan 16 21:00:34 2011 + - XSUB: reorder() and compose() treat with growing the string. + - XSUB: provision against UTF8_ALLOW_* flags to be undefined in future. + - doc: about perl 5.13.x and Unicode 6.0.0 + - doc and comments: [perl #81876] Fix typos by Peter J. Acklam. + 1.07 Mon Sep 20 20:20:02 2010 - doc: about perl 5.12.x and Unicode 5.2.0 - test: prototype of normalize_partial() and cousins in proto.t. @@ -60,7 +66,7 @@ Revision history for Perl extension Unicode::Normalize. - XSUB: even if string contains a malformed, "short" Unicode character, decompose() and reorder() will be safe. Garbage will be no longer added. - added null.t and short.t. - - now truely added illegal.t (in 0.27, forgot to change MANIFEST). + - now truly added illegal.t (in 0.27, forgot to change MANIFEST). 0.27 Sun Nov 16 13:16:21 2003 - Illegal code points (surrogate and noncharacter) will be allowed diff --git a/cpan/Unicode-Normalize/Normalize.pm b/cpan/Unicode-Normalize/Normalize.pm index ab895aa36a..3c21e9dffd 100644 --- a/cpan/Unicode-Normalize/Normalize.pm +++ b/cpan/Unicode-Normalize/Normalize.pm @@ -13,7 +13,7 @@ use Carp; no warnings 'utf8'; -our $VERSION = '1.08'; +our $VERSION = '1.10'; our $PACKAGE = __PACKAGE__; our @EXPORT = qw( NFC NFD NFKC NFKD ); @@ -34,7 +34,7 @@ our %EXPORT_TAGS = ( ); ## -## utilites for tests +## utilities for tests ## sub pack_U { @@ -549,6 +549,7 @@ normalization implemented by this module depends on your perl's version. 5.10.0 5.0.0 5.8.9, 5.10.1 5.1.0 5.12.0-5.12.2 5.2.0 + (5.13.7-5.13.8) 6.0.0 =item Correction of decomposition mapping @@ -576,7 +577,7 @@ lower than 4.1.0. SADAHIRO Tomoyuki <SADAHIRO@cpan.org> -Copyright(C) 2001-2010, SADAHIRO Tomoyuki. Japan. All rights reserved. 
+Copyright(C) 2001-2011, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. diff --git a/cpan/Unicode-Normalize/Normalize.xs b/cpan/Unicode-Normalize/Normalize.xs index 5f5357cbd8..b760dff0b3 100644 --- a/cpan/Unicode-Normalize/Normalize.xs +++ b/cpan/Unicode-Normalize/Normalize.xs @@ -17,32 +17,43 @@ /* Perl 5.6.1 ? */ #ifndef utf8n_to_uvuni -#define utf8n_to_uvuni utf8_to_uv +#define utf8n_to_uvuni utf8_to_uv #endif /* utf8n_to_uvuni */ /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ -#ifdef UTF8_ALLOW_BOM -#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF) -#else -#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) -#endif +#ifndef UTF8_ALLOW_BOM +#define UTF8_ALLOW_BOM (0) +#endif /* UTF8_ALLOW_BOM */ + +#ifndef UTF8_ALLOW_SURROGATE +#define UTF8_ALLOW_SURROGATE (0) +#endif /* UTF8_ALLOW_SURROGATE */ + +#ifndef UTF8_ALLOW_FE_FF +#define UTF8_ALLOW_FE_FF (0) +#endif /* UTF8_ALLOW_FE_FF */ + +#ifndef UTF8_ALLOW_FFFF +#define UTF8_ALLOW_FFFF (0) +#endif /* UTF8_ALLOW_FFFF */ + +#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF) + +/* check if the string buffer is enough before uvuni_to_utf8(). */ +/* dstart, d, and dlen should be defined outside before. */ +#define Renew_d_if_not_enough_to(need) STRLEN curlen = d - dstart; \ + if (dlen < curlen + (need)) { \ + dlen += (need); \ + Renew(dstart, dlen+1, U8); \ + d = dstart + curlen; \ + } -/* if utf8n_to_uvuni() sets retlen to 0 (?) */ +/* if utf8n_to_uvuni() sets retlen to 0 (if broken?) */ #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" /* utf8_hop() hops back before start. Maybe broken UTF-8 */ #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" -/* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC. 
- If Unicode would add a new composition of A + B to C - where bytes::length(A) + bytes::length(B) < bytes::length(C), - this code should be fixed. - In this case, mkheader will prevent Unicode::Normalize from building. */ -#define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source" - -/* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */ -#define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough" - /* At present, char > 0x10ffff are unaffected without complaint, right? */ #define VALID_UTF_MAX (0x10ffff) #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) @@ -216,13 +227,7 @@ U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) p += retlen; if (Hangul_IsS(uv)) { - STRLEN cur = d - dstart; - - if (dlen < cur + UTF8_MAXLEN * 3) { - dlen += UTF8_MAXLEN * 3; - Renew(dstart, dlen+1, U8); - d = dstart + cur; - } + Renew_d_if_not_enough_to(UTF8_MAXLEN * 3) d = pv_cat_decompHangul(d, uv); } else { @@ -230,23 +235,12 @@ U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) if (r) { STRLEN len = (STRLEN)strlen((char *)r); - STRLEN cur = d - dstart; - if (dlen < cur + len) { - dlen += len; - Renew(dstart, dlen+1, U8); - d = dstart + cur; - } + Renew_d_if_not_enough_to(len) while (len--) *d++ = *r++; } else { - STRLEN cur = d - dstart; - - if (dlen < cur + UTF8_MAXLEN) { - dlen += UTF8_MAXLEN; - Renew(dstart, dlen+1, U8); - d = dstart + cur; - } + Renew_d_if_not_enough_to(UTF8_MAXLEN) d = uvuni_to_utf8(d, uv); } } @@ -256,11 +250,12 @@ U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) } static -U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen) +U8* pv_utf8_reorder(U8* s, STRLEN slen, U8** dp, STRLEN dlen) { U8* p = s; U8* e = s + slen; - U8* dend = d + dlen; + U8* dstart = *dp; + U8* d = dstart; UNF_cc seq_ary[CC_SEQ_SIZE]; UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ @@ -268,10 +263,6 @@ U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, 
STRLEN dlen) STRLEN seq_max = CC_SEQ_SIZE; STRLEN cc_pos = 0; - if (dlen < slen || dlen < slen + UTF8_MAXLEN) - croak(ErrTargetNotEnough, "reorder"); - dend -= UTF8_MAXLEN; /* safety */ - while (p < e) { U8 curCC; STRLEN retlen; @@ -306,6 +297,7 @@ U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen) continue; } + /* output */ if (cc_pos) { STRLEN i; @@ -313,30 +305,30 @@ U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen) qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); for (i = 0; i < cc_pos; i++) { + Renew_d_if_not_enough_to(UTF8_MAXLEN) d = uvuni_to_utf8(d, seq_ptr[i].uv); - if (dend < d) /* real end is dend + UTF8_MAXLEN */ - croak(ErrLongerThanSrc, "reorder"); } cc_pos = 0; } if (curCC == 0) { + Renew_d_if_not_enough_to(UTF8_MAXLEN) d = uvuni_to_utf8(d, uv); - if (dend < d) /* real end is dend + UTF8_MAXLEN */ - croak(ErrLongerThanSrc, "reorder"); } } if (seq_ext) Safefree(seq_ext); + *dp = dstart; return d; } static -U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig) +U8* pv_utf8_compose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig) { U8* p = s; U8* e = s + slen; - U8* dend = d + dlen; + U8* dstart = *dp; + U8* d = dstart; UV uvS = 0; /* code point of the starter */ bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ @@ -348,10 +340,6 @@ U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig) STRLEN seq_max = CC_SEQ_SIZE; STRLEN cc_pos = 0; - if (dlen < slen || dlen < slen + UTF8_MAXLEN) - croak(ErrTargetNotEnough, "compose"); - dend -= UTF8_MAXLEN; /* safety */ - while (p < e) { U8 curCC; STRLEN retlen; @@ -370,9 +358,8 @@ U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig) continue; } else { + Renew_d_if_not_enough_to(UTF8_MAXLEN) d = uvuni_to_utf8(d, uv); - if (dend < d) /* real end is dend + UTF8_MAXLEN */ - croak(ErrLongerThanSrc, "compose"); continue; } } @@ -382,7 +369,7 @@ U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool 
iscontig) /* blocked */ if ((iscontig && cc_pos) || /* discontiguous combination */ (curCC != 0 && preCC == curCC) || /* blocked by same CC */ - preCC > curCC) /* blocked by higher CC: revised D2 */ + (preCC > curCC)) /* blocked by higher CC: revised D2 */ composed = FALSE; /* not blocked: @@ -428,17 +415,18 @@ U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig) } } - d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ - if (dend < d) /* real end is dend + UTF8_MAXLEN */ - croak(ErrLongerThanSrc, "compose"); + /* output */ + { + Renew_d_if_not_enough_to(UTF8_MAXLEN) + d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ + } if (cc_pos) { STRLEN i; for (i = 0; i < cc_pos; i++) { + Renew_d_if_not_enough_to(UTF8_MAXLEN) d = uvuni_to_utf8(d, seq_ptr[i]); - if (dend < d) /* real end is dend + UTF8_MAXLEN */ - croak(ErrLongerThanSrc, "compose"); } cc_pos = 0; } @@ -447,6 +435,7 @@ U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig) } if (seq_ext) Safefree(seq_ext); + *dp = dstart; return d; } @@ -474,6 +463,7 @@ decompose(src, compat = &PL_sv_no) OUTPUT: RETVAL + SV* reorder(src) SV * src @@ -485,16 +475,17 @@ reorder(src) CODE: s = (U8*)sv_2pvunicode(src,&slen); dst = newSVpvn("", 0); - dlen = slen + UTF8_MAXLEN; - d = (U8*)SvGROW(dst,dlen+1); + dlen = slen; + New(0, d, dlen+1, U8); + dend = pv_utf8_reorder(s, slen, &d, dlen); + sv_setpvn(dst, (char *)d, dend - d); SvUTF8_on(dst); - dend = pv_utf8_reorder(s, slen, d, dlen); - *dend = '\0'; - SvCUR_set(dst, dend - d); + Safefree(d); RETVAL = dst; OUTPUT: RETVAL + SV* compose(src) SV * src @@ -508,16 +499,17 @@ compose(src) CODE: s = (U8*)sv_2pvunicode(src,&slen); dst = newSVpvn("", 0); - dlen = slen + UTF8_MAXLEN; - d = (U8*)SvGROW(dst,dlen+1); + dlen = slen; + New(0, d, dlen+1, U8); + dend = pv_utf8_compose(s, slen, &d, dlen, (bool)ix); + sv_setpvn(dst, (char *)d, dend - d); SvUTF8_on(dst); - dend = pv_utf8_compose(s, slen, d, dlen, (bool)ix); - *dend = '\0'; - 
SvCUR_set(dst, dend - d); + Safefree(d); RETVAL = dst; OUTPUT: RETVAL + SV* NFD(src) SV * src @@ -529,29 +521,34 @@ NFD(src) U8 *s, *t, *tend, *d, *dend; STRLEN slen, tlen, dlen; CODE: - /* decompose */ s = (U8*)sv_2pvunicode(src,&slen); + + /* decompose */ tlen = slen; New(0, t, tlen+1, U8); - tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)ix); + tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix==1)); *tend = '\0'; - tlen = tend - t; /* no longer know real tlen */ + tlen = tend - t; /* no longer know real size of t */ /* reorder */ - dst = newSVpvn("", 0); - dlen = tlen + UTF8_MAXLEN; - d = (U8*)SvGROW(dst,dlen+1); - SvUTF8_on(dst); - dend = pv_utf8_reorder(t, tlen, d, dlen); + dlen = tlen; + New(0, d, dlen+1, U8); + dend = pv_utf8_reorder(t, tlen, &d, dlen); *dend = '\0'; - SvCUR_set(dst, dend - d); + dlen = dend - d; /* no longer know real size of d */ /* return */ + dst = newSVpvn("", 0); + sv_setpvn(dst, (char *)d, dlen); + SvUTF8_on(dst); + Safefree(t); + Safefree(d); RETVAL = dst; OUTPUT: RETVAL + SV* NFC(src) SV * src @@ -564,37 +561,42 @@ NFC(src) U8 *s, *t, *tend, *u, *uend, *d, *dend; STRLEN slen, tlen, ulen, dlen; CODE: - /* decompose */ s = (U8*)sv_2pvunicode(src,&slen); + + /* decompose */ tlen = slen; New(0, t, tlen+1, U8); tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix==1)); *tend = '\0'; - tlen = tend - t; /* no longer know real tlen */ + tlen = tend - t; /* no longer know real size of t */ /* reorder */ - ulen = tlen + UTF8_MAXLEN; + ulen = tlen; New(0, u, ulen+1, U8); - uend = pv_utf8_reorder(t, tlen, u, ulen); + uend = pv_utf8_reorder(t, tlen, &u, ulen); *uend = '\0'; - ulen = uend - u; + ulen = uend - u; /* no longer know real size of u */ /* compose */ - dst = newSVpvn("", 0); - dlen = ulen + UTF8_MAXLEN; - d = (U8*)SvGROW(dst,dlen+1); - SvUTF8_on(dst); - dend = pv_utf8_compose(u, ulen, d, dlen, (bool)(ix==2)); + dlen = ulen; + New(0, d, dlen+1, U8); + dend = pv_utf8_compose(u, ulen, &d, dlen, (bool)(ix==2)); *dend = '\0'; - 
SvCUR_set(dst, dend - d); + dlen = dend - d; /* no longer know real size of d */ /* return */ + dst = newSVpvn("", 0); + sv_setpvn(dst, (char *)d, dlen); + SvUTF8_on(dst); + Safefree(t); Safefree(u); + Safefree(d); RETVAL = dst; OUTPUT: RETVAL + SV* checkNFD(src) SV * src diff --git a/cpan/Unicode-Normalize/README b/cpan/Unicode-Normalize/README index a18f7cd198..8cddf34869 100644 --- a/cpan/Unicode-Normalize/README +++ b/cpan/Unicode-Normalize/README @@ -1,4 +1,4 @@ -Unicode/Normalize version 1.07 +Unicode/Normalize version 1.10 =================================== Unicode::Normalize - Unicode Normalization Forms @@ -83,7 +83,7 @@ COPYRIGHT AND LICENSE SADAHIRO Tomoyuki <SADAHIRO@cpan.org> -Copyright(C) 2001-2010, SADAHIRO Tomoyuki. Japan. All rights reserved. +Copyright(C) 2001-2011, SADAHIRO Tomoyuki. Japan. All rights reserved. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself. diff --git a/cpan/Unicode-Normalize/mkheader b/cpan/Unicode-Normalize/mkheader index b6d153c552..acc7eaff77 100644 --- a/cpan/Unicode-Normalize/mkheader +++ b/cpan/Unicode-Normalize/mkheader @@ -90,7 +90,7 @@ sub decomposeHangul { return wantarray ? @ret : pack_U(@ret); } -########## getting full decomposion ########## +########## getting full decomposition ########## { my($f, $fh); foreach my $d (@INC) { @@ -194,28 +194,6 @@ foreach my $key (keys %Compat) { ##### The above part is common to mkheader and PP ##### -sub utf8len { - my $uv = shift; - return $uv < 0x80 ? 1 : - $uv < 0x800 ? 2 : - $uv < 0x10000 ? 3 : - $uv < 0x110000 ? 4 : - croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff."; -} - -sub utfelen { - my $uv = shift; - return $uv < 0xA0 ? 1 : - $uv < 0x400 ? 2 : - $uv < 0x4000 ? 3 : - $uv < 0x40000 ? 4 : - $uv < 0x110000 ? 5 : - croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff."; -} - -my $errExpand = "$PACKAGE: Composition to U+%04X (from U+%04X and U+%04X) " . 
- "needs growing the string in %s! Quit. Please inform the author..."; - foreach my $comp1st (keys %Compos) { my $listname = sprintf("${structname}_%06x", $comp1st); # %04x is bad since it'd place _3046 after _1d157. @@ -225,13 +203,6 @@ foreach my $comp1st (keys %Compos) { foreach my $comp2nd (keys %$rh1st) { my $uc = $rh1st->{$comp2nd}; $CompList{$listname}{$comp2nd} = $uc; - - if (utf8len($comp1st) + utf8len($comp2nd) < utf8len($uc)) { - croak sprintf $errExpand, $uc, $comp1st, $comp2nd, "utf-8"; - } - if (utfelen($comp1st) + utfelen($comp2nd) < utfelen($uc)) { - croak sprintf $errExpand, $uc, $comp1st, $comp2nd, "utf-ebcdic"; - } } } |