diff options
author | Steve Hay <steve.m.hay@googlemail.com> | 2016-12-06 08:41:46 +0000 |
---|---|---|
committer | Steve Hay <steve.m.hay@googlemail.com> | 2016-12-06 08:41:46 +0000 |
commit | 15f5e486022b574631307e6a27ca1b961591e561 (patch) | |
tree | d14808feabe2cd097bd8b331234338c37d1d16cd /cpan/Encode | |
parent | 5aa240eab7dbaa91f98c2fee1f04b6c0b5a9b9e3 (diff) | |
download | perl-15f5e486022b574631307e6a27ca1b961591e561.tar.gz |
Upgrade Encode from version 2.86 to 2.88
(Unicode.pm is customized for a version-bump only, to silence
t/porting/cmp_version.t since Unicode.xs has changed.)
Diffstat (limited to 'cpan/Encode')
37 files changed, 1559 insertions, 547 deletions
diff --git a/cpan/Encode/Encode.pm b/cpan/Encode/Encode.pm index bda8e1b316..57b4292279 100644 --- a/cpan/Encode/Encode.pm +++ b/cpan/Encode/Encode.pm @@ -1,10 +1,10 @@ # -# $Id: Encode.pm,v 2.86 2016/08/10 18:08:01 dankogai Exp $ +# $Id: Encode.pm,v 2.88 2016/11/29 23:30:30 dankogai Exp dankogai $ # package Encode; use strict; use warnings; -our $VERSION = sprintf "%d.%02d", q$Revision: 2.86 $ =~ /(\d+)/g; +our $VERSION = sprintf "%d.%02d", q$Revision: 2.88 $ =~ /(\d+)/g; use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; use XSLoader (); XSLoader::load( __PACKAGE__, $VERSION ); @@ -15,7 +15,7 @@ use Exporter 5.57 'import'; our @EXPORT = qw( decode decode_utf8 encode encode_utf8 str2bytes bytes2str - encodings find_encoding clone_encoding + encodings find_encoding find_mime_encoding clone_encoding ); our @FB_FLAGS = qw( DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC @@ -102,6 +102,8 @@ sub define_encoding { sub getEncoding { my ( $class, $name, $skip_external ) = @_; + defined($name) or return; + $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796 ref($name) && $name->can('renew') and return $name; @@ -130,6 +132,14 @@ sub find_encoding($;$) { return __PACKAGE__->getEncoding( $name, $skip_external ); } +sub find_mime_encoding($;$) { + my ( $mime_name, $skip_external ) = @_; + eval { require Encode::MIME::Name; }; + $@ and return; + my $name = Encode::MIME::Name::get_encode_name( $mime_name ); + return find_encoding( $name, $skip_external ); +} + sub resolve_alias($) { my $obj = find_encoding(shift); defined $obj and return $obj->name; @@ -254,6 +264,7 @@ sub from_to($$$;$) { sub encode_utf8($) { my ($str) = @_; + return undef unless defined $str; utf8::encode($str); return $str; } @@ -576,6 +587,20 @@ name of the encoding object. See L<Encode::Encoding> for details. +=head3 find_mime_encoding + + [$obj =] find_mime_encoding(MIME_ENCODING) + +Returns the I<encoding object> corresponding to I<MIME_ENCODING>. Acts +same as C<find_encoding()> but C<mime_name()> of returned object must +match to I<MIME_ENCODING>. So as opposite of C<find_encoding()> +canonical names and aliases are not used when searching for object. + + find_mime_encoding("utf8"); # returns undef because "utf8" is not valid I<MIME_ENCODING> + find_mime_encoding("utf-8"); # returns encode object "utf-8-strict" + find_mime_encoding("UTF-8"); # same as "utf-8" because I<MIME_ENCODING> is case insensitive + find_mime_encoding("utf-8-strict"); returns undef because "utf-8-strict" is not valid I<MIME_ENCODING> + =head3 from_to [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) diff --git a/cpan/Encode/Encode.xs b/cpan/Encode/Encode.xs index 222f39b2ea..b5160d2516 100644 --- a/cpan/Encode/Encode.xs +++ b/cpan/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 2.37 2016/08/10 18:08:45 dankogai Exp dankogai $ + $Id: Encode.xs,v 2.39 2016/11/29 23:29:23 dankogai Exp dankogai $ */ #define PERL_NO_GET_CONTEXT @@ -31,6 +31,10 @@ UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) +#ifndef SvIV_nomg +#define SvIV_nomg SvIV +#endif + #ifdef UTF8_DISALLOW_ILLEGAL_INTERCHANGE # define UTF8_ALLOW_STRICT UTF8_DISALLOW_ILLEGAL_INTERCHANGE #else @@ -76,6 +80,37 @@ call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) PERL_UNUSED_VAR(orig); } +static void +utf8_safe_downgrade(pTHX_ SV ** src, U8 ** s, STRLEN * slen, bool modify) +{ + if (!modify) { + SV *tmp = sv_2mortal(newSVpvn((char *)*s, *slen)); + SvUTF8_on(tmp); + if (SvTAINTED(*src)) + SvTAINTED_on(tmp); + *src = tmp; + *s = (U8 *)SvPVX(*src); + } + if (*slen) { + if (!utf8_to_bytes(*s, slen)) + croak("Wide character"); + SvCUR_set(*src, *slen); + } + SvUTF8_off(*src); +} + +static void +utf8_safe_upgrade(pTHX_ SV ** src, U8 ** s, STRLEN * slen, bool modify) +{ + if (!modify) { + SV *tmp = sv_2mortal(newSVpvn((char *)*s, *slen)); + if (SvTAINTED(*src)) + SvTAINTED_on(tmp); + *src = tmp; + } + sv_utf8_upgrade_nomg(*src); + *s = (U8 *)SvPV_nomg(*src, *slen); +} #define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" #define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" @@ -104,18 +139,16 @@ do_fallback_cb(pTHX_ UV ch, SV *fallback_cb) } static SV * -encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, +encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 * s, STRLEN slen, int check, STRLEN * offset, SV * term, int * retcode, SV *fallback_cb) { - STRLEN slen; - U8 *s = (U8 *) SvPV(src, slen); STRLEN tlen = slen; STRLEN ddone = 0; STRLEN sdone = 0; /* We allocate slen+1. PerlIO dumps core if this value is smaller than this. */ - SV *dst = sv_2mortal(newSV(slen+1)); + SV *dst = newSV(slen+1); U8 *d = (U8 *)SvPVX(dst); STRLEN dlen = SvLEN(dst)-1; int code = 0; @@ -191,10 +224,10 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, if (dir == enc->f_utf8) { STRLEN clen; UV ch = - utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), + utf8n_to_uvuni(s+slen, (tlen-sdone-slen), &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); /* if non-representable multibyte prefix at end of current buffer - break*/ - if (clen > tlen - sdone) break; + if (clen > tlen - sdone - slen) break; if (check & ENCODE_DIE_ON_ERR) { Perl_croak(aTHX_ ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]); @@ -211,7 +244,7 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, SV* subchar = (fallback_cb != &PL_sv_undef) ? do_fallback_cb(aTHX_ ch, fallback_cb) - : newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" : + : newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04" UVxf "}" : check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : "&#x%" UVxf ";", (UV)ch); SvUTF8_off(subchar); /* make sure no decoded string gets in */ @@ -279,6 +312,7 @@ encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, sv_setpvn(src, (char*)s+slen, sdone); } SvCUR_set(src, sdone); + SvSETMAGIC(src); } /* warn("check = 0x%X, code = 0x%d\n", check, code); */ @@ -318,6 +352,62 @@ strict_utf8(pTHX_ SV* sv) return SvTRUE(*svp); } +/* + * https://github.com/dankogai/p5-encode/pull/56#issuecomment-231959126 + */ +#ifndef UNICODE_IS_NONCHAR +#define UNICODE_IS_NONCHAR(c) ((c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE) +#endif + +#ifndef UNICODE_IS_SUPER +#define UNICODE_IS_SUPER(c) (c > PERL_UNICODE_MAX) +#endif + +#define UNICODE_IS_STRICT(c) (!UNICODE_IS_SURROGATE(c) && !UNICODE_IS_NONCHAR(c) && !UNICODE_IS_SUPER(c)) + +#ifndef UTF_ACCUMULATION_OVERFLOW_MASK +#ifndef CHARBITS +#define CHARBITS CHAR_BIT +#endif +#define UTF_ACCUMULATION_OVERFLOW_MASK (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS) - UTF_ACCUMULATION_SHIFT)) +#endif + +/* + * Convert non strict utf8 sequence of len >= 2 to unicode codepoint + */ +static UV +convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen) +{ + UV uv; + U8 *ptr = s; + bool overflowed = 0; + + uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(len); + + len--; + s++; + + while (len--) { + if (!UTF8_IS_CONTINUATION(*s)) { + *rlen = s-ptr; + return 0; + } + if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) + overflowed = 1; + uv = UTF8_ACCUMULATE(uv, *s); + s++; + } + + *rlen = s-ptr; + + if (overflowed || *rlen > (STRLEN)UNISKIP(uv)) { + *rlen = 1; + return 0; + } + + return uv; +} + static U8* process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, bool encode, bool strict, bool stop_at_partial) @@ -336,7 +426,7 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, } else { fallback_cb = &PL_sv_undef; - check = SvIV(check_sv); + check = SvIV_nomg(check_sv); } SvPOK_only(dst); @@ -351,39 +441,30 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, continue; } + ulen = 1; if (UTF8_IS_START(*s)) { U8 skip = UTF8SKIP(s); if ((s + skip) > e) { if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) { const U8 *p = s + 1; for (; p < e; p++) { - if (!UTF8_IS_CONTINUATION(*p)) + if (!UTF8_IS_CONTINUATION(*p)) { + ulen = p-s; goto malformed_byte; + } } break; } + ulen = e-s; goto malformed_byte; } - uv = utf8n_to_uvuni(s, e - s, &ulen, - UTF8_CHECK_ONLY | (strict ? UTF8_ALLOW_STRICT : - UTF8_ALLOW_NONSTRICT) - ); -#if 1 /* perl-5.8.6 and older do not check UTF8_ALLOW_LONG */ - if (strict && uv > PERL_UNICODE_MAX) - ulen = (STRLEN) -1; -#endif - if (ulen == (STRLEN) -1) { - if (strict) { - uv = utf8n_to_uvuni(s, e - s, &ulen, - UTF8_CHECK_ONLY | UTF8_ALLOW_NONSTRICT); - if (ulen == (STRLEN) -1) - goto malformed_byte; - goto malformed; - } + uv = convert_utf8_multi_seq(s, skip, &ulen); + if (uv == 0) goto malformed_byte; - } + else if (strict && !UNICODE_IS_STRICT(uv)) + goto malformed; /* Whole char is good */ @@ -396,7 +477,8 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, /* If we get here there is something wrong with alleged UTF-8 */ malformed_byte: uv = (UV)*s; - ulen = 1; + if (ulen == 0) + ulen = 1; malformed: if (check & ENCODE_DIE_ON_ERR){ @@ -456,10 +538,6 @@ MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ PROTOTYPES: DISABLE -#ifndef SvIsCOW -# define SvIsCOW(sv) (SvREADONLY(sv) && SvFAKE(sv)) -#endif - void Method_decode_xs(obj,src,check_sv = &PL_sv_no) SV * obj @@ -472,23 +550,26 @@ PREINIT: SV *dst; bool renewed = 0; int check; + bool modify; +INIT: + SvGETMAGIC(src); + SvGETMAGIC(check_sv); + check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV_nomg(check_sv); + modify = (check && !(check & ENCODE_LEAVE_SRC)); CODE: { - dSP; ENTER; SAVETMPS; - if (src == &PL_sv_undef || SvROK(src)) src = sv_2mortal(newSV(0)); - check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV(check_sv); - if (!(check & ENCODE_LEAVE_SRC) && SvIsCOW(src)) { - /* - * disassociate from any other scalars before doing - * in-place modifications - */ - sv_force_normal(src); - } - s = (U8 *) SvPV(src, slen); - e = (U8 *) SvEND(src); + dSP; + if (!SvOK(src)) + XSRETURN_UNDEF; + s = modify ? (U8 *)SvPV_force_nomg(src, slen) : (U8 *)SvPV_nomg(src, slen); + if (SvUTF8(src)) + utf8_safe_downgrade(aTHX_ &src, &s, &slen, modify); + e = s+slen; + /* * PerlIO check -- we assume the object is of PerlIO if renewed */ + ENTER; SAVETMPS; PUSHMARK(sp); XPUSHs(obj); PUTBACK; @@ -503,28 +584,17 @@ CODE: FREETMPS; LEAVE; /* end PerlIO check */ - if (SvUTF8(src)) { - s = utf8_to_bytes(s,&slen); - if (s) { - SvCUR_set(src,slen); - SvUTF8_off(src); - e = s+slen; - } - else { - croak("Cannot decode string with wide characters"); - } - } - dst = sv_2mortal(newSV(slen>0?slen:1)); /* newSV() abhors 0 -- inaba */ s = process_utf8(aTHX_ dst, s, e, check_sv, 0, strict_utf8(aTHX_ obj), renewed); /* Clear out translated part of source unless asked not to */ - if (check && !(check & ENCODE_LEAVE_SRC)){ + if (modify) { slen = e-s; if (slen) { sv_setpvn(src, (char*)s, slen); } SvCUR_set(src, slen); + SvSETMAGIC(src); } SvUTF8_on(dst); if (SvTAINTED(src)) SvTAINTED_on(dst); /* propagate taintedness */ @@ -543,12 +613,18 @@ PREINIT: U8 *e; SV *dst; int check; + bool modify; +INIT: + SvGETMAGIC(src); + SvGETMAGIC(check_sv); + check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV_nomg(check_sv); + modify = (check && !(check & ENCODE_LEAVE_SRC)); CODE: { - check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV(check_sv); - if (src == &PL_sv_undef || SvROK(src)) src = sv_2mortal(newSV(0)); - s = (U8 *) SvPV(src, slen); - e = (U8 *) SvEND(src); + if (!SvOK(src)) + XSRETURN_UNDEF; + s = modify ? (U8 *)SvPV_force_nomg(src, slen) : (U8 *)SvPV_nomg(src, slen); + e = s+slen; dst = sv_2mortal(newSV(slen>0?slen:1)); /* newSV() abhors 0 -- inaba */ if (SvUTF8(src)) { /* Already encoded */ @@ -584,12 +660,13 @@ CODE: } /* Clear out translated part of source unless asked not to */ - if (check && !(check & ENCODE_LEAVE_SRC)){ + if (modify) { slen = e-s; if (slen) { sv_setpvn(src, (char*)s, slen); } SvCUR_set(src, slen); + SvSETMAGIC(src); } SvPOK_only(dst); SvUTF8_off(dst); @@ -638,24 +715,35 @@ SV * src SV * off SV * term SV * check_sv -CODE: -{ +PREINIT: int check; - SV *fallback_cb = &PL_sv_undef; - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - STRLEN offset = (STRLEN)SvIV(off); + SV *fallback_cb; + bool modify; + encode_t *enc; + STRLEN offset; int code = 0; - if (SvUTF8(src)) { - sv_utf8_downgrade(src, FALSE); - } - if (SvROK(check_sv)){ - fallback_cb = check_sv; - check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ - }else{ - check = SvIV(check_sv); - } - sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check, - &offset, term, &code, fallback_cb)); + U8 *s; + STRLEN slen; + SV *tmp; +INIT: + SvGETMAGIC(src); + SvGETMAGIC(check_sv); + check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV_nomg(check_sv); + fallback_cb = SvROK(check_sv) ? check_sv : &PL_sv_undef; + modify = (check && !(check & ENCODE_LEAVE_SRC)); + enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + offset = (STRLEN)SvIV(off); +CODE: +{ + if (!SvOK(src)) + XSRETURN_NO; + s = modify ? (U8 *)SvPV_force_nomg(src, slen) : (U8 *)SvPV_nomg(src, slen); + if (SvUTF8(src)) + utf8_safe_downgrade(aTHX_ &src, &s, &slen, modify); + tmp = encode_method(aTHX_ enc, enc->t_utf8, src, s, slen, check, + &offset, term, &code, fallback_cb); + sv_catsv(dst, tmp); + SvREFCNT_dec(tmp); SvIV_set(off, (IV)offset); if (code == ENCODE_FOUND_TERM) { ST(0) = &PL_sv_yes; @@ -665,79 +753,70 @@ CODE: XSRETURN(1); } -void +SV * Method_decode(obj,src,check_sv = &PL_sv_no) SV * obj SV * src SV * check_sv +PREINIT: + int check; + SV *fallback_cb; + bool modify; + encode_t *enc; + U8 *s; + STRLEN slen; +INIT: + SvGETMAGIC(src); + SvGETMAGIC(check_sv); + check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV_nomg(check_sv); + fallback_cb = SvROK(check_sv) ? check_sv : &PL_sv_undef; + modify = (check && !(check & ENCODE_LEAVE_SRC)); + enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); CODE: { - int check; - SV *fallback_cb = &PL_sv_undef; - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - if (SvREADONLY(src) || SvSMAGICAL(src) || SvGMAGICAL(src) || !SvPOK(src)) { - SV *tmp; - tmp = sv_newmortal(); - sv_copypv(tmp, src); - src = tmp; - } - if (SvUTF8(src)) { - sv_utf8_downgrade(src, FALSE); - } - if (SvROK(check_sv)){ - fallback_cb = check_sv; - check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ - }else{ - check = SvIV(check_sv); - } - ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check, + if (!SvOK(src)) + XSRETURN_UNDEF; + s = modify ? (U8 *)SvPV_force_nomg(src, slen) : (U8 *)SvPV_nomg(src, slen); + if (SvUTF8(src)) + utf8_safe_downgrade(aTHX_ &src, &s, &slen, modify); + RETVAL = encode_method(aTHX_ enc, enc->t_utf8, src, s, slen, check, NULL, Nullsv, NULL, fallback_cb); - SvUTF8_on(ST(0)); - XSRETURN(1); + SvUTF8_on(RETVAL); } +OUTPUT: + RETVAL - -#ifndef SvPV_force_nolen -# define SvPV_force_nolen(sv) SvPV_force_flags_nolen(sv, SV_GMAGIC) -#endif - -#ifndef SvPV_force_flags_nolen -# define SvPV_force_flags_nolen(sv, flags) \ - ((SvFLAGS(sv) & (SVf_POK|SVf_THINKFIRST)) == SVf_POK \ - ? SvPVX(sv) : sv_pvn_force_flags(sv, &PL_na, flags)) -#endif - -void +SV * Method_encode(obj,src,check_sv = &PL_sv_no) SV * obj SV * src SV * check_sv +PREINIT: + int check; + SV *fallback_cb; + bool modify; + encode_t *enc; + U8 *s; + STRLEN slen; +INIT: + SvGETMAGIC(src); + SvGETMAGIC(check_sv); + check = SvROK(check_sv) ? ENCODE_PERLQQ|ENCODE_LEAVE_SRC : SvIV_nomg(check_sv); + fallback_cb = SvROK(check_sv) ? check_sv : &PL_sv_undef; + modify = (check && !(check & ENCODE_LEAVE_SRC)); + enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); CODE: { - int check; - SV *fallback_cb = &PL_sv_undef; - encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); - if (SvREADONLY(src) || SvSMAGICAL(src) || SvGMAGICAL(src) || !SvPOK(src)) { - /* - SV *tmp; - tmp = sv_newmortal(); - sv_copypv(tmp, src); - src = tmp; - */ - src = sv_mortalcopy(src); - SvPV_force_nolen(src); - } - sv_utf8_upgrade(src); - if (SvROK(check_sv)){ - fallback_cb = check_sv; - check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ - }else{ - check = SvIV(check_sv); - } - ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check, + if (!SvOK(src)) + XSRETURN_UNDEF; + s = modify ? (U8 *)SvPV_force_nomg(src, slen) : (U8 *)SvPV_nomg(src, slen); + if (!SvUTF8(src)) + utf8_safe_upgrade(aTHX_ &src, &s, &slen, modify); + RETVAL = encode_method(aTHX_ enc, enc->f_utf8, src, s, slen, check, NULL, Nullsv, NULL, fallback_cb); - XSRETURN(1); } +OUTPUT: + RETVAL void Method_needs_lines(obj) @@ -753,6 +832,8 @@ CODE: void Method_perlio_ok(obj) SV * obj +PREINIT: + SV *sv; CODE: { /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ @@ -762,7 +843,8 @@ CODE: eval_pv("require PerlIO::encoding", 0); SPAGAIN; - if (SvTRUE(get_sv("@", 0))) { + sv = get_sv("@", 0); + if (SvTRUE(sv)) { ST(0) = &PL_sv_no; }else{ ST(0) = &PL_sv_yes; @@ -773,6 +855,8 @@ CODE: void Method_mime_name(obj) SV * obj +PREINIT: + SV *sv; CODE: { encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); @@ -780,7 +864,8 @@ CODE: eval_pv("require Encode::MIME::Name", 0); SPAGAIN; - if (SvTRUE(get_sv("@", 0))) { + sv = get_sv("@", 0); + if (SvTRUE(sv)) { ST(0) = &PL_sv_undef; }else{ ENTER; @@ -903,17 +988,16 @@ bool is_utf8(sv, check = 0) SV * sv int check +PREINIT: + char *str; + STRLEN len; CODE: { - if (SvGMAGICAL(sv)) /* it could be $1, for example */ - sv = newSVsv(sv); /* GMAGIG will be done */ + SvGETMAGIC(sv); /* SvGETMAGIC() can modify SvOK flag */ + str = SvOK(sv) ? SvPV_nomg(sv, len) : NULL; /* SvPV() can modify SvUTF8 flag */ RETVAL = SvUTF8(sv) ? TRUE : FALSE; - if (RETVAL && - check && - !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) + if (RETVAL && check && (!str || !is_utf8_string((U8 *)str, len))) RETVAL = FALSE; - if (sv != ST(0)) - SvREFCNT_dec(sv); /* it was a temp copy */ } OUTPUT: RETVAL @@ -923,13 +1007,14 @@ _utf8_on(sv) SV * sv CODE: { - if (SvPOK(sv)) { - SV *rsv = newSViv(SvUTF8(sv)); - RETVAL = rsv; - if (SvIsCOW(sv)) sv_force_normal(sv); - SvUTF8_on(sv); + SvGETMAGIC(sv); + if (!SvTAINTED(sv) && SvPOKp(sv)) { + if (SvTHINKFIRST(sv)) sv_force_normal(sv); + RETVAL = newSViv(SvUTF8(sv)); + SvUTF8_on(sv); + SvSETMAGIC(sv); } else { - RETVAL = &PL_sv_undef; + RETVAL = &PL_sv_undef; } } OUTPUT: @@ -940,124 +1025,38 @@ _utf8_off(sv) SV * sv CODE: { - if (SvPOK(sv)) { - SV *rsv = newSViv(SvUTF8(sv)); - RETVAL = rsv; - if (SvIsCOW(sv)) sv_force_normal(sv); - SvUTF8_off(sv); + SvGETMAGIC(sv); + if (!SvTAINTED(sv) && SvPOKp(sv)) { + if (SvTHINKFIRST(sv)) sv_force_normal(sv); + RETVAL = newSViv(SvUTF8(sv)); + SvUTF8_off(sv); + SvSETMAGIC(sv); } else { - RETVAL = &PL_sv_undef; + RETVAL = &PL_sv_undef; } } OUTPUT: RETVAL -int -DIE_ON_ERR() -CODE: - RETVAL = ENCODE_DIE_ON_ERR; -OUTPUT: - RETVAL - -int -WARN_ON_ERR() -CODE: - RETVAL = ENCODE_WARN_ON_ERR; -OUTPUT: - RETVAL - -int -LEAVE_SRC() -CODE: - RETVAL = ENCODE_LEAVE_SRC; -OUTPUT: - RETVAL - -int -RETURN_ON_ERR() -CODE: - RETVAL = ENCODE_RETURN_ON_ERR; -OUTPUT: - RETVAL - -int -PERLQQ() -CODE: - RETVAL = ENCODE_PERLQQ; -OUTPUT: - RETVAL - -int -HTMLCREF() -CODE: - RETVAL = ENCODE_HTMLCREF; -OUTPUT: - RETVAL - -int -XMLCREF() -CODE: - RETVAL = ENCODE_XMLCREF; -OUTPUT: - RETVAL - -int -STOP_AT_PARTIAL() -CODE: - RETVAL = ENCODE_STOP_AT_PARTIAL; -OUTPUT: - RETVAL - -int -FB_DEFAULT() -CODE: - RETVAL = ENCODE_FB_DEFAULT; -OUTPUT: - RETVAL - -int -FB_CROAK() -CODE: - RETVAL = ENCODE_FB_CROAK; -OUTPUT: - RETVAL - -int -FB_QUIET() -CODE: - RETVAL = ENCODE_FB_QUIET; -OUTPUT: - RETVAL - -int -FB_WARN() -CODE: - RETVAL = ENCODE_FB_WARN; -OUTPUT: - RETVAL - -int -FB_PERLQQ() -CODE: - RETVAL = ENCODE_FB_PERLQQ; -OUTPUT: - RETVAL - -int -FB_HTMLCREF() -CODE: - RETVAL = ENCODE_FB_HTMLCREF; -OUTPUT: - RETVAL - -int -FB_XMLCREF() -CODE: - RETVAL = ENCODE_FB_XMLCREF; -OUTPUT: - RETVAL - BOOT: { + HV *stash = gv_stashpvn("Encode", strlen("Encode"), GV_ADD); + newCONSTSUB(stash, "DIE_ON_ERR", newSViv(ENCODE_DIE_ON_ERR)); + newCONSTSUB(stash, "WARN_ON_ERR", newSViv(ENCODE_WARN_ON_ERR)); + newCONSTSUB(stash, "RETURN_ON_ERR", newSViv(ENCODE_RETURN_ON_ERR)); + newCONSTSUB(stash, "LEAVE_SRC", newSViv(ENCODE_LEAVE_SRC)); + newCONSTSUB(stash, "PERLQQ", newSViv(ENCODE_PERLQQ)); + newCONSTSUB(stash, "HTMLCREF", newSViv(ENCODE_HTMLCREF)); + newCONSTSUB(stash, "XMLCREF", newSViv(ENCODE_XMLCREF)); + newCONSTSUB(stash, "STOP_AT_PARTIAL", newSViv(ENCODE_STOP_AT_PARTIAL)); + newCONSTSUB(stash, "FB_DEFAULT", newSViv(ENCODE_FB_DEFAULT)); + newCONSTSUB(stash, "FB_CROAK", newSViv(ENCODE_FB_CROAK)); + newCONSTSUB(stash, "FB_QUIET", newSViv(ENCODE_FB_QUIET)); + newCONSTSUB(stash, "FB_WARN", newSViv(ENCODE_FB_WARN)); + newCONSTSUB(stash, "FB_PERLQQ", newSViv(ENCODE_FB_PERLQQ)); + newCONSTSUB(stash, "FB_HTMLCREF", newSViv(ENCODE_FB_HTMLCREF)); + newCONSTSUB(stash, "FB_XMLCREF", newSViv(ENCODE_FB_XMLCREF)); +} +{ #include "def_t.exh" } diff --git a/cpan/Encode/Encode/_T.e2x b/cpan/Encode/Encode/_T.e2x index 6cf5f293d5..7b9a67e43d 100644 --- a/cpan/Encode/Encode/_T.e2x +++ b/cpan/Encode/Encode/_T.e2x @@ -2,6 +2,8 @@ use strict; # Adjust the number here! use Test::More tests => 2; -use_ok('Encode'); -use_ok('Encode::$_Name_'); +BEGIN { + use_ok('Encode'); + use_ok('Encode::$_Name_'); +} # Add more test here! diff --git a/cpan/Encode/Makefile.PL b/cpan/Encode/Makefile.PL index c87153bbb3..8203105247 100644 --- a/cpan/Encode/Makefile.PL +++ b/cpan/Encode/Makefile.PL @@ -1,16 +1,26 @@ # -# $Id: Makefile.PL,v 2.17 2016/08/04 03:15:58 dankogai Exp $ +# $Id: Makefile.PL,v 2.18 2016/11/29 23:29:23 dankogai Exp dankogai $ # use 5.007003; use strict; use warnings; use ExtUtils::MakeMaker; use File::Spec; +use Config; # Just for sure :) my %ARGV = map { my @r = split /=/,$_; defined $r[1] or $r[1]=1; @r } @ARGV; $ARGV{DEBUG} and warn "$_ => $ARGV{$_}\n" for sort keys %ARGV; $ENV{PERL_CORE} ||= $ARGV{PERL_CORE} if $ARGV{PERL_CORE}; +# similar strictness as in core +my $ccflags = $Config{ccflags}; +if (!$ENV{PERL_CORE}) { + if ($Config{gccversion}) { + $ccflags .= ' -Werror=declaration-after-statement'; + $ccflags .= ' -Wpointer-sign' unless $Config{d_cplusplus}; + $ccflags .= ' -fpermissive' if $Config{d_cplusplus}; + } +} my %tables = ( @@ -45,6 +55,7 @@ WriteMakefile( SUFFIX => 'gz', DIST_DEFAULT => 'all tardist', }, + CCFLAGS => $ccflags, INC => '-I' . File::Spec->catfile( '.', 'Encode' ), LICENSE => 'perl', PREREQ_PM => { diff --git a/cpan/Encode/Unicode/Makefile.PL b/cpan/Encode/Unicode/Makefile.PL index ce48b7aace..b28d16bb96 100644 --- a/cpan/Encode/Unicode/Makefile.PL +++ b/cpan/Encode/Unicode/Makefile.PL @@ -3,7 +3,7 @@ use strict; use ExtUtils::MakeMaker; WriteMakefile( - INC => "-I../Encode", + INC => "-I../Encode", NAME => 'Encode::Unicode', VERSION_FROM => "Unicode.pm", MAN3PODS => {}, diff --git a/cpan/Encode/Unicode/Unicode.pm b/cpan/Encode/Unicode/Unicode.pm index 7dec3e3815..fc1d3d1382 100644 --- a/cpan/Encode/Unicode/Unicode.pm +++ b/cpan/Encode/Unicode/Unicode.pm @@ -4,7 +4,7 @@ use strict; use warnings; no warnings 'redefine'; -our $VERSION = do { my @r = ( q$Revision: 2.15 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.15_01 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use XSLoader; XSLoader::load( __PACKAGE__, $VERSION ); diff --git a/cpan/Encode/Unicode/Unicode.xs b/cpan/Encode/Unicode/Unicode.xs index 3bad2adae0..117e14d83f 100644 --- a/cpan/Encode/Unicode/Unicode.xs +++ b/cpan/Encode/Unicode/Unicode.xs @@ -1,5 +1,5 @@ /* - $Id: Unicode.xs,v 2.14 2016/01/22 06:33:07 dankogai Exp $ + $Id: Unicode.xs,v 2.15 2016/11/29 23:29:23 dankogai Exp dankogai $ */ #define PERL_NO_GET_CONTEXT @@ -125,8 +125,6 @@ PROTOTYPES: DISABLE #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef) -#define attr_true(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \ - SvTRUE(*hv_fetch((HV *)SvRV(obj),k,l,0)) : FALSE) void decode_xs(obj, str, check = 0) @@ -135,26 +133,54 @@ SV * str IV check CODE: { - U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); - int size = SvIV(attr("size", 4)); + SV *sve = attr("endian", 6); + U8 endian = *((U8 *)SvPV_nolen(sve)); + SV *svs = attr("size", 4); + int size = SvIV(svs); int ucs2 = -1; /* only needed in the event of surrogate pairs */ SV *result = newSVpvn("",0); STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */ STRLEN ulen; STRLEN resultbuflen; U8 *resultbuf; - U8 *s = (U8 *)SvPVbyte(str,ulen); - U8 *e = (U8 *)SvEND(str); + U8 *s; + U8 *e; + bool modify = (check && !(check & ENCODE_LEAVE_SRC)); + bool temp_result; + + SvGETMAGIC(str); + if (!SvOK(str)) + XSRETURN_UNDEF; + s = modify ? (U8 *)SvPV_force_nomg(str, ulen) : (U8 *)SvPV_nomg(str, ulen); + if (SvUTF8(str)) { + if (!modify) { + SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen)); + SvUTF8_on(tmp); + if (SvTAINTED(str)) + SvTAINTED_on(tmp); + str = tmp; + s = (U8 *)SvPVX(str); + } + if (ulen) { + if (!utf8_to_bytes(s, &ulen)) + croak("Wide character"); + SvCUR_set(str, ulen); + } + SvUTF8_off(str); + } + e = s+ulen; + /* Optimise for the common case of being called from PerlIOEncode_fill() with a standard length buffer. In this case the result SV's buffer is only used temporarily, so we can afford to allocate the maximum needed and not care about unused space. */ - const bool temp_result = (ulen == PERLIO_BUFSIZ); + temp_result = (ulen == PERLIO_BUFSIZ); ST(0) = sv_2mortal(result); SvUTF8_on(result); if (!endian && s+size <= e) { + SV *sv; UV bom; endian = (size == 4) ? 'N' : 'n'; bom = enc_unpack(aTHX_ &s,e,size,endian); @@ -183,8 +209,9 @@ CODE: } #if 1 /* Update endian for next sequence */ - if (attr_true("renewed", 7)) { - hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); + sv = attr("renewed", 7); + if (SvTRUE(sv)) { + (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); } #endif } @@ -202,11 +229,12 @@ CODE: U8 *d; if (issurrogate(ord)) { if (ucs2 == -1) { - ucs2 = attr_true("ucs2", 4); + SV *sv = attr("ucs2", 4); + ucs2 = SvTRUE(sv); } if (ucs2 || size == 4) { if (check) { - croak("%"SVf":no surrogates allowed %"UVxf, + croak("%" SVf ":no surrogates allowed %" UVxf, *hv_fetch((HV *)SvRV(obj),"Name",4,0), ord); } @@ -216,7 +244,7 @@ CODE: UV lo; if (!isHiSurrogate(ord)) { if (check) { - croak("%"SVf":Malformed HI surrogate %"UVxf, + croak("%" SVf ":Malformed HI surrogate %" UVxf, *hv_fetch((HV *)SvRV(obj),"Name",4,0), ord); } @@ -231,7 +259,7 @@ CODE: break; } else { - croak("%"SVf":Malformed HI surrogate %"UVxf, + croak("%" SVf ":Malformed HI surrogate %" UVxf, *hv_fetch((HV *)SvRV(obj),"Name",4,0), ord); } @@ -244,7 +272,7 @@ CODE: lo = enc_unpack(aTHX_ &s,e,size,endian); if (!isLoSurrogate(lo)) { if (check) { - croak("%"SVf":Malformed LO surrogate %"UVxf, + croak("%" SVf ":Malformed LO surrogate %" UVxf, *hv_fetch((HV *)SvRV(obj),"Name",4,0), ord); } @@ -262,7 +290,7 @@ CODE: if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { if (check) { - croak("%"SVf":Unicode character %"UVxf" is illegal", + croak("%" SVf ":Unicode character %" UVxf " is illegal", *hv_fetch((HV *)SvRV(obj),"Name",4,0), ord); } else { @@ -295,7 +323,7 @@ CODE: if (s < e) { /* unlikely to happen because it's fixed-length -- dankogai */ if (check & ENCODE_WARN_ON_ERR) { - Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", + Perl_warner(aTHX_ packWARN(WARN_UTF8),"%" SVf ":Partial character", *hv_fetch((HV *)SvRV(obj),"Name",4,0)); } } @@ -308,6 +336,7 @@ CODE: SvCUR_set(str,0); } *SvEND(str) = '\0'; + SvSETMAGIC(str); } if (!temp_result) shrink_buffer(result); @@ -322,19 +351,40 @@ SV * utf8 IV check CODE: { - U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6))); - const int size = SvIV(attr("size", 4)); + SV *sve = attr("endian", 6); + U8 endian = *((U8 *)SvPV_nolen(sve)); + SV *svs = attr("size", 4); + const int size = SvIV(svs); int ucs2 = -1; /* only needed if there is invalid_ucs2 input */ const STRLEN usize = (size > 0 ? size : 1); SV *result = newSVpvn("", 0); STRLEN ulen; - U8 *s = (U8 *) SvPVutf8(utf8, ulen); - const U8 *e = (U8 *) SvEND(utf8); + U8 *s; + U8 *e; + bool modify = (check && !(check & ENCODE_LEAVE_SRC)); + bool temp_result; + + SvGETMAGIC(utf8); + if (!SvOK(utf8)) + XSRETURN_UNDEF; + s = modify ? (U8 *)SvPV_force_nomg(utf8, ulen) : (U8 *)SvPV_nomg(utf8, ulen); + if (!SvUTF8(utf8)) { + if (!modify) { + SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen)); + if (SvTAINTED(utf8)) + SvTAINTED_on(tmp); + utf8 = tmp; + } + sv_utf8_upgrade_nomg(utf8); + s = (U8 *)SvPV_nomg(utf8, ulen); + } + e = s+ulen; + /* Optimise for the common case of being called from PerlIOEncode_flush() with a standard length buffer. In this case the result SV's buffer is only used temporarily, so we can afford to allocate the maximum needed and not care about unused space. */ - const bool temp_result = (ulen == PERLIO_BUFSIZ); + temp_result = (ulen == PERLIO_BUFSIZ); ST(0) = sv_2mortal(result); @@ -344,12 +394,14 @@ CODE: SvGROW(result, ((ulen+1) * usize)); if (!endian) { + SV *sv; endian = (size == 4) ? 'N' : 'n'; enc_pack(aTHX_ result,size,endian,BOM_BE); #if 1 /* Update endian for next sequence */ - if (attr_true("renewed", 7)) { - hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); + sv = attr("renewed", 7); + if (SvTRUE(sv)) { + (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); } #endif } @@ -364,11 +416,12 @@ CODE: if (size != 4 && invalid_ucs2(ord)) { if (!issurrogate(ord)) { if (ucs2 == -1) { - ucs2 = attr_true("ucs2", 4); + SV *sv = attr("ucs2", 4); + ucs2 = SvTRUE(sv); } if (ucs2 || ord > 0x10FFFF) { if (check) { - croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", + croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high", *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); } enc_pack(aTHX_ result,size,endian,FBCHAR); @@ -394,7 +447,7 @@ CODE: But this is critical when you choose to LEAVE_SRC in which case we die */ if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) { - Perl_croak(aTHX_ "%"SVf":partial character is not allowed " + Perl_croak(aTHX_ "%" SVf ":partial character is not allowed " "when CHECK = 0x%" UVuf, *hv_fetch((HV *)SvRV(obj),"Name",4,0), check); } @@ -408,12 +461,11 @@ CODE: SvCUR_set(utf8,0); } *SvEND(utf8) = '\0'; + SvSETMAGIC(utf8); } if (!temp_result) shrink_buffer(result); if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */ - SvSETMAGIC(utf8); - XSRETURN(1); } diff --git a/cpan/Encode/bin/enc2xs b/cpan/Encode/bin/enc2xs index f2a228f68b..bd39639ae8 100644 --- a/cpan/Encode/bin/enc2xs +++ b/cpan/Encode/bin/enc2xs @@ -11,7 +11,7 @@ use warnings; use Getopt::Std; use Config; my @orig_ARGV = @ARGV; -our $VERSION = do { my @r = (q$Revision: 2.19 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 2.20 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; # These may get re-ordered. # RAW is a do_now as inserted by &enter @@ -123,7 +123,10 @@ my %encode_types = (U => \&encode_U, ); # Win32 does not expand globs on command line -eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32'); +if ($^O eq 'MSWin32' and !$ENV{PERL_CORE}) { + eval "\@ARGV = map(glob(\$_),\@ARGV)"; + @ARGV = @orig_ARGV unless @ARGV; +} my %opt; # I think these are: @@ -134,6 +137,8 @@ my %opt; # -o <output> to specify the output file name (else it's the first arg) # -f <inlist> to give a file with a list of input files (else use the args) # -n <name> to name the encoding (else use the basename of the input file. +#Getopt::Long::Configure("bundling"); +#GetOptions(\%opt, qw(C M=s S Q q O o=s f=s n=s v)); getopts('CM:SQqOo:f:n:v',\%opt); $opt{M} and make_makefile_pl($opt{M}, @ARGV); @@ -196,9 +201,9 @@ sub compiler_info { # This really should go first, else the die here causes empty (non-erroneous) # output files to be written. my @encfiles; -if (exists $opt{'f'}) { +if (exists $opt{f}) { # -F is followed by name of file containing list of filenames - my $flist = $opt{'f'}; + my $flist = $opt{f}; open(FLIST,$flist) || die "Cannot open $flist:$!"; chomp(@encfiles = <FLIST>); close(FLIST); @@ -206,9 +211,15 @@ if (exists $opt{'f'}) { @encfiles = @ARGV; } -my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV); +my $cname = $opt{o} ? $opt{o} : shift(@ARGV); +unless ($cname) { #debuging a win32 nmake error-only. works via cmdline + print "\nARGV:"; + print "$_ " for @ARGV; + print "\nopt:"; + print " $_ => ",defined $opt{$_}?$opt{$_}:"undef","\n" for keys %opt; +} chmod(0666,$cname) if -f $cname && !-w $cname; -open(C,">$cname") || die "Cannot open $cname:$!"; +open(C,">", $cname) || die "Cannot open $cname:$!"; my $dname = $cname; my $hname = $cname; @@ -220,10 +231,10 @@ if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARS $doC = 1; $dname =~ s/(\.[^\.]*)?$/.exh/; chmod(0666,$dname) if -f $cname && !-w $dname; - open(D,">$dname") || die "Cannot open $dname:$!"; + open(D,">", $dname) || die "Cannot open $dname:$!"; $hname =~ s/(\.[^\.]*)?$/.h/; chmod(0666,$hname) if -f $cname && !-w $hname; - open(H,">$hname") || die "Cannot open $hname:$!"; + open(H,">", $hname) || die "Cannot open $hname:$!"; foreach my $fh (\*C,\*D,\*H) { @@ -469,7 +480,9 @@ sub compile_ucm $erep = $attr{'subchar'}; $erep =~ s/^\s+//; $erep =~ s/\s+$//; } - print "Reading $name ($cs)\n"; + print "Reading $name ($cs)\n" + unless defined $ENV{MAKEFLAGS} + and $ENV{MAKEFLAGS} =~ /\b(s|silent|quiet)\b/; my $nfb = 0; my $hfb = 0; while (<$fh>) @@ -755,9 +768,17 @@ sub addstrings if ($a->{'Forward'}) { my ($cpp, $static, $sized) = compiler_info(1); - my $var = $static ? 'static const' : 'extern'; my $count = $sized ? scalar(@{$a->{'Entries'}}) : ''; - print $fh "$var encpage_t $name\[$count];\n"; + if ($static) { + # we cannot ask Config for d_plusplus since we can override CC=g++-6 on the cmdline + print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6 + print $fh "extern encpage_t $name\[$count];\n"; + print $fh "#else\n"; + print $fh "static const encpage_t $name\[$count];\n"; + print $fh "#endif\n"; + } else { + print $fh "extern encpage_t $name\[$count];\n"; + } } $a->{'DoneStrings'} = 1; foreach my $b (@{$a->{'Entries'}}) @@ -848,9 +869,16 @@ sub outtable outtable($fh,$t,$bigname) unless $t->{'Done'}; } my ($cpp, $static) = compiler_info(0); - my $var = $static ? 'static const ' : ''; - print $fh "\n${var}encpage_t $name\[", - scalar(@{$a->{'Entries'}}), "] = {\n"; + my $count = scalar(@{$a->{'Entries'}}); + if ($static) { + print $fh "#ifdef __cplusplus\n"; # -fpermissive since g++-6 + print $fh "encpage_t $name\[$count] = {\n"; + print $fh "#else\n"; + print $fh "static const encpage_t $name\[$count] = {\n"; + print $fh "#endif\n"; + } else { + print $fh "\nencpage_t $name\[$count] = {\n"; + } foreach my $b (@{$a->{'Entries'}}) { my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b; @@ -1104,7 +1132,7 @@ sub _print_expand{ if ((my $d = dirname($dst)) ne '.'){ -d $d or mkdir $d, 0755 or die "mkdir $d : $!"; } - open my $out, ">$dst" or die "$!"; + open my $out, ">", $dst or die "$!"; my $asis = 0; while (<$in>){ if (/^#### END_OF_HEADER/){ diff --git a/cpan/Encode/encoding.pm b/cpan/Encode/encoding.pm index 754b3acb03..dc342683ee 100644 --- a/cpan/Encode/encoding.pm +++ b/cpan/Encode/encoding.pm @@ -1,6 +1,6 @@ -# $Id: encoding.pm,v 2.18 2016/08/10 18:08:45 dankogai Exp dankogai $ +# $Id: encoding.pm,v 2.19 2016/11/01 13:30:38 dankogai Exp $ package encoding; -our $VERSION = sprintf "%d.%02d", q$Revision: 2.18 $ =~ /(\d+)/g; +our $VERSION = sprintf "%d.%02d", q$Revision: 2.19 $ =~ /(\d+)/g; use Encode; use strict; diff --git a/cpan/Encode/lib/Encode/Alias.pm b/cpan/Encode/lib/Encode/Alias.pm index 04ad4967c9..0a252560f5 100644 --- a/cpan/Encode/lib/Encode/Alias.pm +++ b/cpan/Encode/lib/Encode/Alias.pm @@ -2,7 +2,7 @@ package Encode::Alias; use strict; use warnings; no warnings 'redefine'; -our $VERSION = do { my @r = ( q$Revision: 2.20 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 2.21 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG}; use Exporter 'import'; @@ -79,8 +79,10 @@ sub find_alias { sub define_alias { while (@_) { - my ( $alias, $name ) = splice( @_, 0, 2 ); - unshift( @Alias, $alias => $name ); # newer one has precedence + my $alias = shift; + my $name = shift; + unshift( @Alias, $alias => $name ) # newer one has precedence + if defined $alias; if ( ref($alias) ) { # clear %Alias cache to allow overrides @@ -96,10 +98,14 @@ sub define_alias { } } } - else { + elsif (defined $alias) { DEBUG and warn "delete \$Alias\{$alias\}"; delete $Alias{$alias}; } + elsif (DEBUG) { + require Carp; + Carp::croak("undef \$alias"); + } } } diff --git a/cpan/Encode/lib/Encode/CN/HZ.pm b/cpan/Encode/lib/Encode/CN/HZ.pm index f035d821f5..4510b0b400 100644 --- a/cpan/Encode/lib/Encode/CN/HZ.pm +++ b/cpan/Encode/lib/Encode/CN/HZ.pm @@ -5,7 +5,7 @@ use warnings; use utf8 (); use vars qw($VERSION); -$VERSION = do { my @r = ( q$Revision: 2.7 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +$VERSION = do { my @r = ( q$Revision: 2.8 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; use Encode qw(:fallbacks); @@ -49,7 +49,8 @@ sub decode ($$;$) { else { # GB mode; the byte ranges are as in RFC 1843. no warnings 'uninitialized'; if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) { - $ret .= $GB->decode( $1, $chk ); + my $prefix = $1; + $ret .= $GB->decode( $prefix, $chk ); } elsif ( $str =~ s/^\x7E\x7D// ) { # '~}' $in_ascii = 1; diff --git a/cpan/Encode/lib/Encode/MIME/Header.pm b/cpan/Encode/lib/Encode/MIME/Header.pm index d74d453b8b..ad14dba374 100644 --- a/cpan/Encode/lib/Encode/MIME/Header.pm +++ b/cpan/Encode/lib/Encode/MIME/Header.pm @@ -1,22 +1,25 @@ package Encode::MIME::Header; use strict; use warnings; -no warnings 'redefine'; -our $VERSION = do { my @r = ( q$Revision: 2.23 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; -use Encode qw(find_encoding encode_utf8 decode_utf8); -use MIME::Base64; -use Carp; +our $VERSION = do { my @r = ( q$Revision: 2.24 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; + +use Carp (); +use Encode (); +use MIME::Base64 (); my %seed = ( - decode_b => '1', # decodes 'B' encoding ? - decode_q => '1', # decodes 'Q' encoding ? - encode => 'B', # encode with 'B' or 'Q' ? - bpl => 75, # bytes per line + decode_b => 1, # decodes 'B' encoding ? + decode_q => 1, # decodes 'Q' encoding ? + encode => 'B', # encode with 'B' or 'Q' ? + charset => 'UTF-8', # encode charset + bpl => 75, # bytes per line ); -$Encode::Encoding{'MIME-Header'} = - bless { %seed, Name => 'MIME-Header', } => __PACKAGE__; +$Encode::Encoding{'MIME-Header'} = bless { + %seed, + Name => 'MIME-Header', +} => __PACKAGE__; $Encode::Encoding{'MIME-B'} = bless { %seed, @@ -37,107 +40,186 @@ sub needs_lines { 1 } sub perlio_ok { 0 } # RFC 2047 and RFC 2231 grammar -my $re_charset = qr/[-0-9A-Za-z_]+/; -my $re_language = qr/[A-Za-z]{1,8}(?:-[A-Za-z]{1,8})*/; +my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/; +my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/; my $re_encoding = qr/[QqBb]/; -my $re_encoded_text = qr/[^\?\s]*/; +my $re_encoded_text = qr/[^\?]*/; my $re_encoded_word = qr/=\?$re_charset(?:\*$re_language)?\?$re_encoding\?$re_encoded_text\?=/; -my $re_capture_encoded_word = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding)\?($re_encoded_text)\?=/; +my $re_capture_encoded_word = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding\?$re_encoded_text)\?=/; +my $re_capture_encoded_word_split = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding)\?($re_encoded_text)\?=/; + +# in strict mode check also for valid base64 characters and also for valid quoted printable codes +my $re_encoding_strict_b = qr/[Bb]/; +my $re_encoding_strict_q = qr/[Qq]/; +my $re_encoded_text_strict_b = qr/[0-9A-Za-z\+\/]*={0,2}/; +my $re_encoded_text_strict_q = qr/(?:[^\?\s=]|=[0-9A-Fa-f]{2})*/; +my $re_encoded_word_strict = qr/=\?$re_charset(?:\*$re_language)?\?(?:$re_encoding_strict_b\?$re_encoded_text_strict_b|$re_encoding_strict_q\?$re_encoded_text_strict_q)\?=/; +my $re_capture_encoded_word_strict = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding_strict_b\?$re_encoded_text_strict_b|$re_encoding_strict_q\?$re_encoded_text_strict_q)\?=/; + +my $re_newline = qr/(?:\r\n|[\r\n])/; + +# in strict mode encoded words must be always separated by spaces or tabs (or folded newline) +# except in comments when separator between words and comment round brackets can be omitted +my $re_word_begin_strict = qr/(?:(?:[ \t]|\A)\(?|(?:[^\\]|\A)\)\()/; +my $re_word_sep_strict = qr/(?:$re_newline?[ \t])+/; +my $re_word_end_strict = qr/(?:\)\(|\)?(?:$re_newline?[ \t]|\z))/; + +my $re_match = qr/()((?:$re_encoded_word\s*)*$re_encoded_word)()/; +my $re_match_strict = qr/($re_word_begin_strict)((?:$re_encoded_word_strict$re_word_sep_strict)*$re_encoded_word_strict)(?=$re_word_end_strict)/; + +my $re_capture = qr/$re_capture_encoded_word(?:\s*)?/; +my $re_capture_strict = qr/$re_capture_encoded_word_strict$re_word_sep_strict?/; our $STRICT_DECODE = 0; sub decode($$;$) { - use utf8; - my ( $obj, $str, $chk ) = @_; + my ($obj, $str, $chk) = @_; - # multi-line header to single line - $str =~ s/(?:\r\n|[\r\n])([ \t])/$1/gos; + my $re_match_decode = $STRICT_DECODE ? $re_match_strict : $re_match; + my $re_capture_decode = $STRICT_DECODE ? $re_capture_strict : $re_capture; - # decode each line separately - my @input = split /(\r\n|\r|\n)/o, $str; + my $stop = 0; my $output = substr($str, 0, 0); # to propagate taintedness - while ( @input ) { + # decode each line separately, match whole continuous folded line at one call + 1 while not $stop and $str =~ s{^((?:[^\r\n]*(?:$re_newline[ \t])?)*)($re_newline)?}{ - my $line = shift @input; - my $sep = shift @input; + my $line = $1; + my $sep = defined $2 ? $2 : ''; - # in strict mode encoded words must be always separated by spaces or tabs - # except in comments when separator between words and comment round brackets can be omitted - my $re_word_begin = $STRICT_DECODE ? qr/(?:[ \t\n]|\A)\(?/ : qr//; - my $re_word_sep = $STRICT_DECODE ? qr/[ \t]+/ : qr/\s*/; - my $re_word_end = $STRICT_DECODE ? qr/\)?(?:[ \t\n]|\z)/ : qr//; + $stop = 1 unless length($line) or length($sep); - # concat consecutive encoded mime words with same charset, language and encoding + # NOTE: this code partially could break $chk support + # in non strict mode concat consecutive encoded mime words with same charset, language and encoding # fixes breaking inside multi-byte characters - 1 while $line =~ s/($re_word_begin)$re_capture_encoded_word$re_word_sep=\?\2\3\?\4\?($re_encoded_text)\?=(?=$re_word_end)/$1=\?$2$3\?$4\?$5$6\?=/; - - $line =~ s{($re_word_begin)((?:$re_encoded_word$re_word_sep)*$re_encoded_word)(?=$re_word_end)}{ - my $begin = $1; - my $words = $2; - $words =~ s{$re_capture_encoded_word$re_word_sep?}{ - if (uc($3) eq 'B') { - $obj->{decode_b} or croak qq(MIME "B" unsupported); - decode_b($1, $4, $chk); - } elsif (uc($3) eq 'Q') { - $obj->{decode_q} or croak qq(MIME "Q" unsupported); - decode_q($1, $4, $chk); + 1 while not $STRICT_DECODE and $line =~ s/$re_capture_encoded_word_split\s*=\?\1\2\?\3\?($re_encoded_text)\?=/=\?$1$2\?$3\?$4$5\?=/so; + + # process sequence of encoded MIME words at once + 1 while not $stop and $line =~ s{^(.*?)$re_match_decode}{ + + my $begin = $1 . $2; + my $words = $3; + + $begin =~ tr/\r\n//d; + $output .= $begin; + + # decode one MIME word + 1 while not $stop and $words =~ s{^(.*?)($re_capture_decode)}{ + + $output .= $1; + my $orig = $2; + my $charset = $3; + my ($mime_enc, $text) = split /\?/, $5; + + $text =~ tr/\r\n//d; + + my $enc = Encode::find_mime_encoding($charset); + + # in non strict mode allow also perl encoding aliases + if ( not defined $enc and not $STRICT_DECODE ) { + # make sure that decoded string will be always strict UTF-8 + $charset = 'UTF-8' if lc($charset) eq 'utf8'; + $enc = Encode::find_encoding($charset); + } + + if ( not defined $enc ) { + Carp::croak qq(Unknown charset "$charset") if not ref $chk and $chk & Encode::DIE_ON_ERR; + Carp::carp qq(Unknown charset "$charset") if not ref $chk and $chk & Encode::WARN_ON_ERR; + $stop = 1 if not ref $chk and $chk & Encode::RETURN_ON_ERR; + $output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace + $stop ? $orig : ''; } else { - croak qq(MIME "$3" encoding is nonexistent!); + if ( uc($mime_enc) eq 'B' and $obj->{decode_b} ) { + my $decoded = _decode_b($enc, $text, $chk); + $stop = 1 if not defined $decoded and not ref $chk and $chk & Encode::RETURN_ON_ERR; + $output .= (defined $decoded ? $decoded : $text) unless $stop; + $stop ? $orig : ''; + } elsif ( uc($mime_enc) eq 'Q' and $obj->{decode_q} ) { + my $decoded = _decode_q($enc, $text, $chk); + $stop = 1 if not defined $decoded and not ref $chk and $chk & Encode::RETURN_ON_ERR; + $output .= (defined $decoded ? $decoded : $text) unless $stop; + $stop ? $orig : ''; + } else { + Carp::croak qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk & Encode::DIE_ON_ERR; + Carp::carp qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk & Encode::WARN_ON_ERR; + $stop = 1 if not ref $chk and $chk & Encode::RETURN_ON_ERR; + $output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace + $stop ? $orig : ''; + } } - }eg; - $begin . $words; - }eg; - $output .= $line; - $output .= $sep if defined $sep; + }se; - } + if ( not $stop ) { + $output .= $words; + $words = ''; + } + + $words; + + }se; + + if ( not $stop ) { + $line =~ tr/\r\n//d; + $output .= $line . $sep; + $line = ''; + $sep = ''; + } + + $line . $sep; - $_[1] = '' if $chk; # empty the input string in the stack so perlio is ok + }se; + + $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC); return $output; } -sub decode_b { - my ( $enc, $b, $chk ) = @_; - my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); - # MIME::Base64::decode_base64 ignores everything after a '=' padding character - # split string after each sequence of padding characters and decode each substring - my $db64 = join('', map { decode_base64($_) } split /(?<==)(?=[^=])/, $b); - return $d->name eq 'utf8' - ? Encode::decode_utf8($db64) - : $d->decode( $db64, $chk || Encode::FB_PERLQQ ); +sub _decode_b { + my ($enc, $text, $chk) = @_; + # MIME::Base64::decode ignores everything after a '=' padding character + # in non strict mode split string after each sequence of padding characters and decode each substring + my $octets = $STRICT_DECODE ? + MIME::Base64::decode($text) : + join('', map { MIME::Base64::decode($_) } split /(?<==)(?=[^=])/, $text); + return _decode_octets($enc, $octets, $chk); +} + +sub _decode_q { + my ($enc, $text, $chk) = @_; + $text =~ s/_/ /go; + $text =~ s/=([0-9A-Fa-f]{2})/pack('C', hex($1))/ego; + return _decode_octets($enc, $text, $chk); } -sub decode_q { - my ( $enc, $q, $chk ) = @_; - my $d = find_encoding($enc) or croak qq(Unknown encoding "$enc"); - $q =~ s/_/ /go; - $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego; - return $d->name eq 'utf8' - ? Encode::decode_utf8($q) - : $d->decode( $q, $chk || Encode::FB_PERLQQ ); +sub _decode_octets { + my ($enc, $octets, $chk) = @_; + $chk &= ~Encode::LEAVE_SRC if not ref $chk and $chk; + local $Carp::CarpLevel = $Carp::CarpLevel + 1; # propagate Carp messages back to caller + my $output = $enc->decode($octets, $chk); + return undef if not ref $chk and $chk and $octets ne ''; + return $output; } sub encode($$;$) { - my ( $obj, $str, $chk ) = @_; - $_[1] = '' if $chk; # empty the input string in the stack so perlio is ok - return $obj->_fold_line($obj->_encode_line($str)); + my ($obj, $str, $chk) = @_; + my $output = $obj->_fold_line($obj->_encode_string($str, $chk)); + $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC); + return $output . substr($str, 0, 0); # to propagate taintedness } sub _fold_line { - my ( $obj, $line ) = @_; + my ($obj, $line) = @_; my $bpl = $obj->{bpl}; - my $output = substr($line, 0, 0); # to propagate taintedness + my $output = ''; - while ( length $line ) { + while ( length($line) ) { if ( $line =~ s/^(.{0,$bpl})(\s|\z)// ) { $output .= $1; - $output .= "\r\n" . $2 if length $line; + $output .= "\r\n" . $2 if length($line); } elsif ( $line =~ s/(\s)(.*)$// ) { $output .= $line; $line = $2; - $output .= "\r\n" . $1 if length $line; + $output .= "\r\n" . $1 if length($line); } else { $output .= $line; last; @@ -147,56 +229,75 @@ sub _fold_line { return $output; } -use constant HEAD => '=?UTF-8?'; -use constant TAIL => '?='; -use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, B_len => \&_encode_b_len, Q_len => \&_encode_q_len }; - -sub _encode_line { - my ( $o, $str ) = @_; - my $enc = $o->{encode}; - my $enc_len = $enc . '_len'; - my $llen = ( $o->{bpl} - length(HEAD) - 2 - length(TAIL) ); - +sub _encode_string { + my ($obj, $str, $chk) = @_; + my $wordlen = $obj->{bpl} > 76 ? 76 : $obj->{bpl}; + my $enc = Encode::find_mime_encoding($obj->{charset}); + my $enc_chk = (not ref $chk and $chk) ? ($chk | Encode::LEAVE_SRC) : $chk; my @result = (); - my $chunk = ''; - while ( length( my $chr = substr( $str, 0, 1, '' ) ) ) { - if ( SINGLE->{$enc_len}($chunk . $chr) > $llen ) { - push @result, SINGLE->{$enc}($chunk); - $chunk = ''; + my $octets = ''; + while ( length( my $chr = substr($str, 0, 1, '') ) ) { + my $seq; + { + local $Carp::CarpLevel = $Carp::CarpLevel + 1; # propagate Carp messages back to caller + $seq = $enc->encode($chr, $enc_chk); } - $chunk .= $chr; + if ( not length($seq) ) { + substr($str, 0, 0, $chr); + last; + } + if ( $obj->_encoded_word_len($octets . $seq) > $wordlen ) { + push @result, $obj->_encode_word($octets); + $octets = ''; + } + $octets .= $seq; } - length($chunk) and push @result, SINGLE->{$enc}($chunk); + length($octets) and push @result, $obj->_encode_word($octets); + $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC); return join(' ', @result); } +sub _encode_word { + my ($obj, $octets) = @_; + my $charset = $obj->{charset}; + my $encode = $obj->{encode}; + my $text = $encode eq 'B' ? _encode_b($octets) : _encode_q($octets); + return "=?$charset?$encode?$text?="; +} + +sub _encoded_word_len { + my ($obj, $octets) = @_; + my $charset = $obj->{charset}; + my $encode = $obj->{encode}; + my $text_len = $encode eq 'B' ? _encoded_b_len($octets) : _encoded_q_len($octets); + return length("=?$charset?$encode??=") + $text_len; +} + sub _encode_b { - HEAD . 'B?' . encode_base64( encode_utf8(shift), '' ) . TAIL; + my ($octets) = @_; + return MIME::Base64::encode($octets, ''); } -sub _encode_b_len { - my ( $chunk ) = @_; - use bytes (); - return bytes::length($chunk) * 4 / 3; +sub _encoded_b_len { + my ($octets) = @_; + return ( length($octets) + 2 ) / 3 * 4; } -my $valid_q_chars = '0-9A-Za-z !*+\-/'; +my $re_invalid_q_char = qr/[^0-9A-Za-z !*+\-\/]/; sub _encode_q { - my ( $chunk ) = @_; - $chunk = encode_utf8($chunk); - $chunk =~ s{([^$valid_q_chars])}{ - join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) + my ($octets) = @_; + $octets =~ s{($re_invalid_q_char)}{ + join('', map { sprintf('=%02X', $_) } unpack('C*', $1)) }egox; - $chunk =~ s/ /_/go; - return HEAD . 'Q?' . $chunk . TAIL; + $octets =~ s/ /_/go; + return $octets; } -sub _encode_q_len { - my ( $chunk ) = @_; - use bytes (); - my $valid_count =()= $chunk =~ /[$valid_q_chars]/sgo; - return ( bytes::length($chunk) - $valid_count ) * 3 + $valid_count; +sub _encoded_q_len { + my ($octets) = @_; + my $invalid_count = () = $octets =~ /$re_invalid_q_char/sgo; + return ( $invalid_count * 3 ) + ( length($octets) - $invalid_count ); } 1; @@ -204,75 +305,119 @@ __END__ =head1 NAME -Encode::MIME::Header -- MIME 'B' and 'Q' encoding for unstructured header +Encode::MIME::Header -- MIME encoding for an unstructured email header =head1 SYNOPSIS - use Encode qw/encode decode/; - $utf8 = decode('MIME-Header', $header); - $header = encode('MIME-Header', $utf8); - -=head1 ABSTRACT - -This module implements RFC 2047 MIME encoding for unstructured header. -It cannot be used for structured headers like From or To. There are 3 -variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>. The -difference is described below + use Encode qw(encode decode); - decode() encode() - ---------------------------------------------- - MIME-Header Both B and Q =?UTF-8?B?....?= - MIME-B B only; Q croaks =?UTF-8?B?....?= - MIME-Q Q only; B croaks =?UTF-8?Q?....?= + my $mime_str = encode("MIME-Header", "Sample:Text \N{U+263A}"); + # $mime_str is "=?UTF-8?B?U2FtcGxlOlRleHQg4pi6?=" -=head1 DESCRIPTION + my $mime_q_str = encode("MIME-Q", "Sample:Text \N{U+263A}"); + # $mime_q_str is "=?UTF-8?Q?Sample=3AText_=E2=98=BA?=" -When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD> -is extracted and decoded for I<X> encoding (B for Base64, Q for -Quoted-Printable). Then the decoded chunk is fed to -decode(I<encoding>). So long as I<encoding> is supported by Encode, -any source encoding is fine. + my $str = decode("MIME-Header", + "=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=\r\n " . + "=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=" + ); + # $str is "If you can read this you understand the example." -When you encode, it just encodes UTF-8 string with I<X> encoding then -quoted with =?UTF-8?I<X>?....?= . The parts that RFC 2047 forbids to -encode are left as is and long lines are folded within 76 bytes per -line. + use Encode qw(decode :fallbacks); + use Encode::MIME::Header; + local $Encode::MIME::Header::STRICT_DECODE = 1; + my $strict_string = decode("MIME-Header", $mime_string, FB_CROAK); + # use strict decoding and croak on errors -=head1 BUGS - -Before version 2.83 this module had broken both decoder and encoder. -Encoder inserted additional spaces, incorrectly encoded input data -and produced invalid MIME strings. Decoder lot of times discarded -white space characters, incorrectly interpreted data or decoded -Base64 string as Quoted-Printable. +=head1 ABSTRACT -As of version 2.83 encoder should be fully compliant of RFC 2047. -Due to bugs in previous versions of encoder, decoder is by default in -less strict compatible mode. It should be able to decode strings -encoded by pre 2.83 version of this module. But this default mode is -not correct according to RFC 2047. +This module implements L<RFC 2047|https://tools.ietf.org/html/rfc2047> MIME +encoding for an unstructured field body of the email header. It can also be +used for L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token. However, +it cannot be used directly for the whole header with the field name or for the +structured header fields like From, To, Cc, Message-Id, etc... There are 3 +encoding names supported by this module: C<MIME-Header>, C<MIME-B> and +C<MIME-Q>. -In default mode decoder try to decode every substring which looks like -MIME encoded data. So it means that MIME data does not need to be -separated by white space. To enforce correct strict mode, set package -variable $Encode::MIME::Header::STRICT_DECODE to 1, e.g. by localizing: +=head1 DESCRIPTION -C<require Encode::MIME::Header; local $Encode::MIME::Header::STRICT_DECODE = 1;> +Decode method takes an unstructured field body of the email header (or +L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token) as its input and +decodes each MIME encoded-word from input string to a sequence of bytes +according to L<RFC 2047|https://tools.ietf.org/html/rfc2047> and +L<RFC 2231|https://tools.ietf.org/html/rfc2231>. Subsequently, each sequence +of bytes with the corresponding MIME charset is decoded with +L<the Encode module|Encode> and finally, one output string is returned. Text +parts of the input string which do not contain MIME encoded-word stay +unmodified in the output string. Folded newlines between two consecutive MIME +encoded-words are discarded, others are preserved in the output string. +C<MIME-B> can decode Base64 variant, C<MIME-Q> can decode Quoted-Printable +variant and C<MIME-Header> can decode both of them. If L<Encode module|Encode> +does not support particular MIME charset or chosen variant then an action based +on L<CHECK flags|Encode/Handling Malformed Data> is performed (by default, the +MIME encoded-word is not decoded). + +Encode method takes a scalar string as its input and uses +L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder for encoding it to UTF-8 +bytes. Then a sequence of UTF-8 bytes is encoded into MIME encoded-words +(C<MIME-Header> and C<MIME-B> use a Base64 variant while C<MIME-Q> uses a +Quoted-Printable variant) where each MIME encoded-word is limited to 75 +characters. MIME encoded-words are separated by C<CRLF SPACE> and joined to +one output string. Output string is suitable for unstructured field body of +the email header. + +Both encode and decode methods propagate +L<CHECK flags|Encode/Handling Malformed Data> when encoding and decoding the +MIME charset. -It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP? -and =?ISO-8859-1?= but that makes the implementation too complicated. -These days major mail agents all support =?UTF-8? so I think it is -just good enough. +=head1 BUGS -Due to popular demand, 'MIME-Header-ISO_2022_JP' was introduced by -Makamaka. Thre are still too many MUAs especially cellular phone -handsets which does not grok UTF-8. +Versions prior to 2.22 (part of Encode 2.83) have a malfunctioning decoder +and encoder. The MIME encoder infamously inserted additional spaces or +discarded white spaces between consecutive MIME encoded-words, which led to +invalid MIME headers produced by this module. The MIME decoder had a tendency +to discard white spaces, incorrectly interpret data or attempt to decode Base64 +MIME encoded-words as Quoted-Printable. These problems were fixed in version +2.22. It is highly recommended not to use any version prior 2.22! + +Versions prior to 2.24 (part of Encode 2.87) ignored +L<CHECK flags|Encode/Handling Malformed Data>. The MIME encoder used +L<not strict utf8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder for input Unicode +strings which could lead to invalid UTF-8 sequences. MIME decoder used also +L<not strict utf8|Encode/UTF-8 vs. utf8 vs. UTF8> decoder and additionally +called the decode method with a C<Encode::FB_PERLQQ> flag (thus user-specified +L<CHECK flags|Encode/Handling Malformed Data> were ignored). Moreover, it +automatically croaked when a MIME encoded-word contained unknown encoding. +Since version 2.24, this module uses +L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder and decoder. And +L<CHECK flags|Encode/Handling Malformed Data> are correctly propagated. + +Since version 2.22 (part of Encode 2.83), the MIME encoder should be fully +compliant to L<RFC 2047|https://tools.ietf.org/html/rfc2047> and +L<RFC 2231|https://tools.ietf.org/html/rfc2231>. Due to the aforementioned +bugs in previous versions of the MIME encoder, there is a I<less strict> +compatible mode for the MIME decoder which is used by default. It should be +able to decode MIME encoded-words encoded by pre 2.22 versions of this module. +However, note that this is not correct according to +L<RFC 2047|https://tools.ietf.org/html/rfc2047>. + +In default I<not strict> mode the MIME decoder attempts to decode every substring +which looks like a MIME encoded-word. Therefore, the MIME encoded-words do not +need to be separated by white space. To enforce a correct I<strict> mode, set +variable C<$Encode::MIME::Header::STRICT_DECODE> to 1 e.g. by localizing: + + use Encode::MIME::Header; + local $Encode::MIME::Header::STRICT_DECODE = 1; + +=head1 AUTHORS + +Pali E<lt>pali@cpan.orgE<gt> =head1 SEE ALSO -L<Encode> - -RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other -locations. +L<Encode>, +L<RFC 822|https://tools.ietf.org/html/rfc822>, +L<RFC 2047|https://tools.ietf.org/html/rfc2047>, +L<RFC 2231|https://tools.ietf.org/html/rfc2231> =cut diff --git a/cpan/Encode/lib/Encode/MIME/Name.pm b/cpan/Encode/lib/Encode/MIME/Name.pm index 10d86a746d..1a8d788aec 100644 --- a/cpan/Encode/lib/Encode/MIME/Name.pm +++ b/cpan/Encode/lib/Encode/MIME/Name.pm @@ -1,8 +1,9 @@ package Encode::MIME::Name; use strict; use warnings; -our $VERSION = do { my @r = ( q$Revision: 1.1 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +our $VERSION = do { my @r = ( q$Revision: 1.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; +# NOTE: This table must be 1:1 mapping our %MIME_NAME_OF = ( 'AdobeStandardEncoding' => 'Adobe-Standard-Encoding', 'AdobeSymbol' => 'Adobe-Symbol-Encoding', @@ -43,7 +44,7 @@ our %MIME_NAME_OF = ( 'hp-roman8' => 'hp-roman8', 'hz' => 'HZ-GB-2312', 'iso-2022-jp' => 'ISO-2022-JP', - 'iso-2022-jp-1' => 'ISO-2022-JP', + 'iso-2022-jp-1' => 'ISO-2022-JP-1', 'iso-2022-kr' => 'ISO-2022-KR', 'iso-8859-1' => 'ISO-8859-1', 'iso-8859-10' => 'ISO-8859-10', @@ -73,13 +74,20 @@ our %MIME_NAME_OF = ( 'UTF-32BE' => 'UTF-32BE', 'UTF-32LE' => 'UTF-32LE', 'UTF-7' => 'UTF-7', - 'utf8' => 'UTF-8', 'utf-8-strict' => 'UTF-8', 'viscii' => 'VISCII', ); +# NOTE: %MIME_NAME_OF is still 1:1 mapping +our %ENCODE_NAME_OF = map { uc $MIME_NAME_OF{$_} => $_ } keys %MIME_NAME_OF; + +# Add additional 1:N mapping +$MIME_NAME_OF{'utf8'} = 'UTF-8'; + sub get_mime_name($) { $MIME_NAME_OF{$_[0]} }; +sub get_encode_name($) { $ENCODE_NAME_OF{uc $_[0]} }; + 1; __END__ diff --git a/cpan/Encode/t/Aliases.t b/cpan/Encode/t/Aliases.t index 2fc14cc114..8d4752bddb 100644 --- a/cpan/Encode/t/Aliases.t +++ b/cpan/Encode/t/Aliases.t @@ -159,7 +159,7 @@ define_alias( sub { return "iso-8859-2" if $enc =~ /hebrew/i; return "does-not-exist" if $enc =~ /arabic/i; # should then use other override alias return "utf-8" if $enc =~ /eight/i; - return; + return "unknown"; }); print "# alias test with alias overrides\n"; diff --git a/cpan/Encode/t/Encode.t b/cpan/Encode/t/Encode.t index d12b2fa232..0536b4b714 100644 --- a/cpan/Encode/t/Encode.t +++ b/cpan/Encode/t/Encode.t @@ -25,7 +25,7 @@ my @character_set = ('0'..'9', 'A'..'Z', 'a'..'z'); my @source = qw(ascii iso8859-1 cp1250); my @destiny = qw(cp1047 cp37 posix-bc); my @ebcdic_sets = qw(cp1047 cp37 posix-bc); -plan tests => 38+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256 + 6 + 5 + 2; +plan tests => 38+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256 + 6 + 3 + 3*8 + 2; my $str = join('',map(chr($_),0x20..0x7E)); my $cpy = $str; @@ -156,15 +156,49 @@ ok(encode(utf8 => Encode::Dummy->new("foobar")), "foobar"); ok(decode_utf8(*1), "*main::1"); # hash keys -my $key = (keys %{{ "whatever\x{100}" => '' }})[0]; -my $kopy = $key; -encode("UTF-16LE", $kopy, Encode::FB_CROAK); -is $key, "whatever\x{100}", 'encode with shared hash key scalars'; -undef $key; -$key = (keys %{{ "whatever" => '' }})[0]; -$kopy = $key; -decode("UTF-16LE", $kopy, Encode::FB_CROAK); -is $key, "whatever", 'decode with shared hash key scalars'; +foreach my $name ("UTF-16LE", "UTF-8", "Latin1") { + my $key = (keys %{{ "whatever\x{CA}" => '' }})[0]; + my $kopy = $key; + encode($name, $kopy, Encode::FB_CROAK); + is $key, "whatever\x{CA}", "encode $name with shared hash key scalars"; + undef $key; + $key = (keys %{{ "whatever\x{CA}" => '' }})[0]; + $kopy = $key; + encode($name, $kopy, Encode::FB_CROAK | Encode::LEAVE_SRC); + is $key, "whatever\x{CA}", "encode $name with LEAVE_SRC and shared hash key scalars"; + undef $key; + $key = (keys %{{ "whatever" => '' }})[0]; + $kopy = $key; + decode($name, $kopy, Encode::FB_CROAK); + is $key, "whatever", "decode $name with shared hash key scalars"; + undef $key; + $key = (keys %{{ "whatever" => '' }})[0]; + $kopy = $key; + decode($name, $kopy, Encode::FB_CROAK | Encode::LEAVE_SRC); + is $key, "whatever", "decode $name with LEAVE_SRC and shared hash key scalars"; + + my $enc = find_encoding($name); + undef $key; + $key = (keys %{{ "whatever\x{CA}" => '' }})[0]; + $kopy = $key; + $enc->encode($kopy, Encode::FB_CROAK); + is $key, "whatever\x{CA}", "encode obj $name with shared hash key scalars"; + undef $key; + $key = (keys %{{ "whatever\x{CA}" => '' }})[0]; + $kopy = $key; + $enc->encode($kopy, Encode::FB_CROAK | Encode::LEAVE_SRC); + is $key, "whatever\x{CA}", "encode obj $name with LEAVE_SRC and shared hash key scalars"; + undef $key; + $key = (keys %{{ "whatever" => '' }})[0]; + $kopy = $key; + $enc->decode($kopy, Encode::FB_CROAK); + is $key, "whatever", "decode obj $name with shared hash key scalars"; + undef $key; + $key = (keys %{{ "whatever" => '' }})[0]; + $kopy = $key; + $enc->decode($kopy, Encode::FB_CROAK | Encode::LEAVE_SRC); + is $key, "whatever", "decode obj $name with LEAVE_SRC and shared hash key scalars"; +} my $latin1 = find_encoding('latin1'); my $orig = "\316"; diff --git a/cpan/Encode/t/at-cn.t b/cpan/Encode/t/at-cn.t index 03ba10955a..c82225ecae 100644 --- a/cpan/Encode/t/at-cn.t +++ b/cpan/Encode/t/at-cn.t @@ -21,7 +21,9 @@ use Encode; no utf8; # we have raw Chinese encodings here -use_ok('Encode::CN'); +BEGIN { + use_ok('Encode::CN'); +} # Since JP.t already tests basic file IO, we will just focus on # internal encode / decode test here. Unfortunately, to test diff --git a/cpan/Encode/t/at-tw.t b/cpan/Encode/t/at-tw.t index e6a559b0a1..203fc34bc9 100644 --- a/cpan/Encode/t/at-tw.t +++ b/cpan/Encode/t/at-tw.t @@ -23,7 +23,9 @@ use Encode; no utf8; # we have raw Chinese encodings here -use_ok('Encode::TW'); +BEGIN { + use_ok('Encode::TW'); +} # Since JP.t already tests basic file IO, we will just focus on # internal encode / decode test here. Unfortunately, to test diff --git a/cpan/Encode/t/decode.t b/cpan/Encode/t/decode.t index 6b24a8fa8c..3995412895 100644 --- a/cpan/Encode/t/decode.t +++ b/cpan/Encode/t/decode.t @@ -1,9 +1,9 @@ # -# $Id: decode.t,v 1.2 2016/08/04 03:15:58 dankogai Exp $ +# $Id: decode.t,v 1.3 2016/10/28 05:03:52 dankogai Exp $ # use strict; use Encode qw(decode_utf8 FB_CROAK find_encoding decode); -use Test::More tests => 5; +use Test::More tests => 17; sub croak_ok(&) { my $code = shift; @@ -32,3 +32,55 @@ SKIP: { *a = $orig; is($latin1->decode(*a), '*main::'.$orig, '[cpan #115168] passing typeglobs to decode'); } + +$orig = "\x80"; +$orig =~ /(.)/; +is($latin1->decode($1), "\N{U+0080}", 'passing magic regex to latin1 decode'); + +$orig = "\x80"; +*a = $orig; +is($latin1->decode(*a), "*main::\N{U+0080}", 'passing typeglob to latin1 decode'); + +$orig = "\N{U+0080}"; +$orig =~ /(.)/; +is($latin1->encode($1), "\x80", 'passing magic regex to latin1 encode'); + +$orig = "\xC3\x80"; +$orig =~ /(..)/; +is(Encode::decode_utf8($1), "\N{U+C0}", 'passing magic regex to Encode::decode_utf8'); + +$orig = "\xC3\x80"; +*a = $orig; +is(Encode::decode_utf8(*a), "*main::\N{U+C0}", 'passing typeglob to Encode::decode_utf8'); + +$orig = "\N{U+C0}"; +$orig =~ /(.)/; +is(Encode::encode_utf8($1), "\xC3\x80", 'passing magic regex to Encode::encode_utf8'); + +$orig = "\xC3\x80"; +$orig =~ /(..)/; +is(Encode::decode('utf-8', $1), "\N{U+C0}", 'passing magic regex to UTF-8 decode'); + +$orig = "\xC3\x80"; +*a = $orig; +is(Encode::decode('utf-8', *a), "*main::\N{U+C0}", 'passing typeglob to UTF-8 decode'); + +$orig = "\N{U+C0}"; +$orig =~ /(.)/; +is(Encode::encode('utf-8', $1), "\xC3\x80", 'passing magic regex to UTF-8 encode'); + +SKIP: { + skip "Perl Version ($]) is older than v5.16", 3 if $] < 5.016; + + $orig = "\N{U+0080}"; + *a = $orig; + is($latin1->encode(*a), "*main::\x80", 'passing typeglob to latin1 encode'); + + $orig = "\N{U+C0}"; + *a = $orig; + is(Encode::encode_utf8(*a), "*main::\xC3\x80", 'passing typeglob to Encode::encode_utf8'); + + $orig = "\N{U+C0}"; + *a = $orig; + is(Encode::encode('utf-8', *a), "*main::\xC3\x80", 'passing typeglob to UTF-8 encode'); +} diff --git a/cpan/Encode/t/enc_data.t b/cpan/Encode/t/enc_data.t index 99ea78d94c..2ead16ea95 100644 --- a/cpan/Encode/t/enc_data.t +++ b/cpan/Encode/t/enc_data.t @@ -1,4 +1,4 @@ -# $Id: enc_data.t,v 2.3 2016/08/10 18:08:45 dankogai Exp dankogai $ +# $Id: enc_data.t,v 2.5 2016/11/29 23:29:23 dankogai Exp dankogai $ BEGIN { require Config; import Config; @@ -11,11 +11,11 @@ BEGIN { exit 0; } if (ord("A") == 193) { - print "1..0 # encoding pragma does not support EBCDIC platforms\n"; + print "1..0 # Skip: encoding pragma does not support EBCDIC platforms\n"; exit(0); } - if ("$]" >= 5.025) { - print "1..0 # encoding pragma not supported in Perl 5.26\n"; + if ($] >= 5.025 and !$Config{usecperl}) { + print "1..0 # Skip: encoding pragma not supported in Perl 5.26\n"; exit(0); } if ($] <= 5.008 and !$Config{perl_patchlevel}){ diff --git a/cpan/Encode/t/enc_eucjp.t b/cpan/Encode/t/enc_eucjp.t index 952a8ae7bc..9b32459792 100644 --- a/cpan/Encode/t/enc_eucjp.t +++ b/cpan/Encode/t/enc_eucjp.t @@ -1,4 +1,4 @@ -# $Id: enc_eucjp.t,v 2.3 2016/08/10 18:08:45 dankogai Exp dankogai $ +# $Id: enc_eucjp.t,v 2.3 2016/08/10 18:08:45 dankogai Exp $ # This is the twin of enc_utf8.t . BEGIN { diff --git a/cpan/Encode/t/enc_module.t b/cpan/Encode/t/enc_module.t index 8796a9b343..7d7382b903 100644 --- a/cpan/Encode/t/enc_module.t +++ b/cpan/Encode/t/enc_module.t @@ -1,4 +1,4 @@ -# $Id: enc_module.t,v 2.3 2016/08/10 18:08:45 dankogai Exp dankogai $ +# $Id: enc_module.t,v 2.5 2016/11/29 23:29:23 dankogai Exp dankogai $ # This file is in euc-jp BEGIN { require Config; import Config; @@ -15,11 +15,11 @@ BEGIN { exit 0; } if (ord("A") == 193) { - print "1..0 # encoding pragma does not support EBCDIC platforms\n"; + print "1..0 # Skip: encoding pragma does not support EBCDIC platforms\n"; exit(0); } - if ("$]" >= 5.025) { - print "1..0 # encoding pragma not supported in Perl 5.26\n"; + if ($] >= 5.025 and !$Config{usecperl}) { + print "1..0 # Skip: encoding pragma not supported in Perl 5.26\n"; exit(0); } } diff --git a/cpan/Encode/t/enc_utf8.t b/cpan/Encode/t/enc_utf8.t index 7ffaac0f3f..b07c573960 100644 --- a/cpan/Encode/t/enc_utf8.t +++ b/cpan/Encode/t/enc_utf8.t @@ -1,4 +1,4 @@ -# $Id: enc_utf8.t,v 2.3 2016/08/10 18:08:45 dankogai Exp dankogai $ +# $Id: enc_utf8.t,v 2.3 2016/08/10 18:08:45 dankogai Exp $ # This is the twin of enc_eucjp.t . BEGIN { diff --git a/cpan/Encode/t/encoding-locale.t b/cpan/Encode/t/encoding-locale.t index 7a305a0dcf..87e7ecb45f 100644 --- a/cpan/Encode/t/encoding-locale.t +++ b/cpan/Encode/t/encoding-locale.t @@ -22,5 +22,5 @@ SKIP: { ok(defined $enc, 'encoding returned is supported') or diag("Encoding: ", explain($locale_encoding)); isa_ok($enc, 'Encode::Encoding'); - note($locale_encoding, ' => ', $enc->name); + eval { note($locale_encoding, ' => ', $enc->name); }; } diff --git a/cpan/Encode/t/encoding.t b/cpan/Encode/t/encoding.t index 18d1921428..33010e74b5 100644 --- a/cpan/Encode/t/encoding.t +++ b/cpan/Encode/t/encoding.t @@ -9,11 +9,11 @@ BEGIN { exit 0; } if (ord("A") == 193) { - print "1..0 # encoding pragma does not support EBCDIC platforms\n"; + print "1..0 # Skip: encoding pragma does not support EBCDIC platforms\n"; exit(0); } - if ("$]" >= 5.025) { - print "1..0 # encoding pragma not supported in Perl 5.26\n"; + if ($] >= 5.025 and !$Config{usecperl}) { + print "1..0 # Skip: encoding pragma not supported in Perl 5.26\n"; exit(0); } } diff --git a/cpan/Encode/t/fallback.t b/cpan/Encode/t/fallback.t index 8ef8ab38df..86605ef3b8 100644 --- a/cpan/Encode/t/fallback.t +++ b/cpan/Encode/t/fallback.t @@ -35,7 +35,7 @@ for my $i (0x80..0xff){ $uo .= chr($i); $residue .= chr($i); $af .= '?'; - $uf .= "\x{FFFD}"; + $uf .= "\x{FFFD}" if $i < 0xfd; $ap .= sprintf("\\x{%04x}", $i); $up .= sprintf("\\x%02X", $i); $ah .= sprintf("&#%d;", $i); diff --git a/cpan/Encode/t/jperl.t b/cpan/Encode/t/jperl.t index 475d8bc0db..a0e7a379f6 100644 --- a/cpan/Encode/t/jperl.t +++ b/cpan/Encode/t/jperl.t @@ -1,5 +1,5 @@ # -# $Id: jperl.t,v 2.3 2016/08/10 18:08:45 dankogai Exp dankogai $ +# $Id: jperl.t,v 2.5 2016/11/29 23:29:23 dankogai Exp dankogai $ # # This script is written in euc-jp @@ -17,8 +17,8 @@ BEGIN { print "1..0 # Skip: EBCDIC\n"; exit 0; } - if ("$]" >= 5.025) { - print "1..0 # encoding pragma not supported in Perl 5.26\n"; + if ($] >= 5.025 and !$Config{usecperl}) { + print "1..0 # Skip: encoding pragma not supported in Perl 5.26\n"; exit(0); } $| = 1; diff --git a/cpan/Encode/t/magic.t b/cpan/Encode/t/magic.t new file mode 100644 index 0000000000..8295152247 --- /dev/null +++ b/cpan/Encode/t/magic.t @@ -0,0 +1,144 @@ +BEGIN { + if ($ENV{'PERL_CORE'}) { + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # Skip: EBCDIC\n"; + exit 0; + } + $| = 1; +} + +use strict; +use warnings; + +use Encode qw(find_encoding encode decode encode_utf8 decode_utf8 is_utf8 _utf8_on _utf8_off FB_CROAK); + +use Test::More tests => 3*(2*(4*(4*4)+4)+4+3*3); + +my $ascii = find_encoding('ASCII'); +my $latin1 = find_encoding('Latin1'); +my $utf8 = find_encoding('UTF-8'); +my $utf16 = find_encoding('UTF-16LE'); + +my $undef = undef; +my $ascii_str = 'ascii_str'; +my $utf8_str = 'utf8_str'; +_utf8_on($utf8_str); + +{ + foreach my $str ($undef, $ascii_str, $utf8_str) { + foreach my $croak (0, 1) { + foreach my $enc ('ASCII', 'Latin1', 'UTF-8', 'UTF-16LE') { + my $mod = defined $str && $croak; + my $func = "encode('" . $enc . "', " . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ($croak ? ', FB_CROAK' : '') . ')'; + tie my $input, 'TieScalarCounter', $str; + my $output = encode($enc, $input, $croak ? FB_CROAK : 0); + is(tied($input)->{fetch}, 1, "$func processes get magic only once"); + is(tied($input)->{store}, $mod ? 1 : 0, "$func " . ($mod ? 'processes set magic only once' : 'does not process set magic')); + is($input, $mod ? '' : $str, "$func " . ($mod ? 'modifies' : 'does not modify') . ' $input string'); + is($output, ((defined $str and $enc eq 'UTF-16LE') ? encode("UTF-16LE", $str) : $str), "$func returns correct \$output string"); + } + foreach my $enc ('ASCII', 'Latin1', 'UTF-8', 'UTF-16LE') { + my $mod = defined $str && $croak; + my $func = "decode('" . $enc . "', " . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ($croak ? ', FB_CROAK' : '') . ')'; + my $input_str = ((defined $str and $enc eq 'UTF-16LE') ? encode("UTF-16LE", $str) : $str); + tie my $input, 'TieScalarCounter', $input_str; + my $output = decode($enc, $input, $croak ? FB_CROAK : 0); + is(tied($input)->{fetch}, 1, "$func processes get magic only once"); + is(tied($input)->{store}, $mod ? 1 : 0, "$func " . ($mod ? 'processes set magic only once' : 'does not process set magic')); + is($input, $mod ? '' : $input_str, "$func " . ($mod ? 'modifies' : 'does not modify') . ' $input string'); + is($output, $str, "$func returns correct \$output string"); + } + foreach my $obj ($ascii, $latin1, $utf8, $utf16) { + my $mod = defined $str && $croak; + my $func = '$' . $obj->name() . '->encode(' . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ($croak ? ', FB_CROAK' : '') . ')'; + tie my $input, 'TieScalarCounter', $str; + my $output = $obj->encode($input, $croak ? FB_CROAK : 0); + is(tied($input)->{fetch}, 1, "$func processes get magic only once"); + is(tied($input)->{store}, $mod ? 1 : 0, "$func " . ($mod ? 'processes set magic only once' : 'does not process set magic')); + is($input, $mod ? '' : $str, "$func " . ($mod ? 'modifies' : 'does not modify') . ' $input string'); + is($output, ((defined $str and $obj == $utf16) ? encode("UTF-16LE", $str) : $str), "$func returns correct \$output string"); + } + foreach my $obj ($ascii, $latin1, $utf8, $utf16) { + my $mod = defined $str && $croak; + my $func = '$' . $obj->name() . '->decode(' . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ($croak ? ', FB_CROAK' : '') . ')'; + my $input_str = ((defined $str and $obj == $utf16) ? encode("UTF-16LE", $str) : $str); + tie my $input, 'TieScalarCounter', $input_str; + my $output = $obj->decode($input, $croak ? FB_CROAK : 0); + is(tied($input)->{fetch}, 1, "$func processes get magic only once"); + is(tied($input)->{store}, $mod ? 1 : 0, "$func " . ($mod ? 'processes set magic only once' : 'does not process set magic')); + is($input, $mod ? '' : $input_str, "$func " . ($mod ? 'modifies' : 'does not modify') . ' $input string'); + is($output, $str, "$func returns correct \$output string"); + } + { + my $mod = defined $str && $croak; + my $func = 'decode_utf8(' . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ($croak ? ', FB_CROAK' : '') . ')'; + tie my $input, 'TieScalarCounter', $str; + my $output = decode_utf8($input, $croak ? FB_CROAK : 0); + is(tied($input)->{fetch}, 1, "$func processes get magic only once"); + is(tied($input)->{store}, $mod ? 1 : 0, "$func " . ($mod ? 'processes set magic only once' : 'does not process set magic')); + is($input, $mod ? '' : $str, "$func " . ($mod ? 'modifies' : 'does not modify') . ' $input string'); + is($output, $str, "$func returns correct \$output string"); + } + } + { + my $func = 'encode_utf8(' . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ')'; + tie my $input, 'TieScalarCounter', $str; + my $output = encode_utf8($input); + is(tied($input)->{fetch}, 1, "$func processes get magic only once"); + is(tied($input)->{store}, 0, "$func does not process set magic"); + is($input, $str, "$func does not modify \$input string"); + is($output, $str, "$func returns correct \$output string"); + } + { + my $func = '_utf8_on(' . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ')'; + tie my $input, 'TieScalarCounter', $str; + _utf8_on($input); + is(tied($input)->{fetch}, 1, "$func processes get magic only once"); + is(tied($input)->{store}, defined $str ? 1 : 0, "$func " . (defined $str ? 'processes set magic only once' : 'does not process set magic')); + defined $str ? ok(is_utf8($input), "$func sets UTF8 status flag") : ok(!is_utf8($input), "$func does not set UTF8 status flag"); + } + { + my $func = '_utf8_off(' . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ')'; + tie my $input, 'TieScalarCounter', $str; + _utf8_off($input); + is(tied($input)->{fetch}, 1, "$func processes get magic only once"); + is(tied($input)->{store}, defined $str ? 1 : 0, "$func " . (defined $str ? 'processes set magic only once' : 'does not process set magic')); + ok(!is_utf8($input), "$func unsets UTF8 status flag"); + } + { + my $func = 'is_utf8(' . (!defined $str ? 'undef' : is_utf8($str) ? '$utf8_str' : '$ascii_str') . ')'; + tie my $input, 'TieScalarCounter', $str; + my $utf8 = is_utf8($input); + is(tied($input)->{fetch}, 1, "$func processes get magic only once"); + is(tied($input)->{store}, 0, "$func does not process set magic"); + is($utf8, is_utf8($str), "$func returned correct state"); + } + } +} + +package TieScalarCounter; + +sub TIESCALAR { + my ($class, $value) = @_; + return bless { fetch => 0, store => 0, value => $value }, $class; +} + +sub FETCH { + my ($self) = @_; + $self->{fetch}++; + return $self->{value}; +} + +sub STORE { + my ($self, $value) = @_; + $self->{store}++; + $self->{value} = $value; +} diff --git a/cpan/Encode/t/mime-header.t b/cpan/Encode/t/mime-header.t index 4477a4eb87..a997dffb41 100644 --- a/cpan/Encode/t/mime-header.t +++ b/cpan/Encode/t/mime-header.t @@ -1,5 +1,5 @@ # -# $Id: mime-header.t,v 2.12 2016/04/11 07:17:02 dankogai Exp $ +# $Id: mime-header.t,v 2.14 2016/11/29 23:29:23 dankogai Exp dankogai $ # This script is written in utf8 # BEGIN { @@ -24,8 +24,22 @@ use strict; use utf8; use charnames ":full"; -use Test::More tests => 130; -use_ok("Encode::MIME::Header"); +use Test::More tests => 264; + +BEGIN { + use_ok("Encode::MIME::Header"); +} + +my @decode_long_tests; +if ($] < 5.009004) { # perl versions without Regular expressions Engine de-recursivised which cause stack overflow + push(@decode_long_tests, "a" x 1000000 => "a" x 1000000); + push(@decode_long_tests, "=?utf-8?Q?a?= " x 400 => "a" x 400 . " "); + push(@decode_long_tests, "=?utf-8?Q?a?= =?US-ASCII?Q?b?= " x 200 => "ab" x 200 . " "); +} else { + push(@decode_long_tests, "a" x 1000000 => "a" x 1000000); + push(@decode_long_tests, "=?utf-8?Q?a?= " x 10000 => "a" x 10000 . " "); + push(@decode_long_tests, "=?utf-8?Q?a?= =?US-ASCII?Q?b?= " x 10000 => "ab" x 10000 . " "); +} my @decode_tests = ( # RFC2047 p.5 @@ -54,6 +68,14 @@ my @decode_tests = ( "=?ISO-8859-1*da-DK?Q?Keld_J=F8rn_Simonsen?=" => "Keld Jørn Simonsen", "=?ISO-8859-1*fr-BE?Q?Andr=E9?= Pirard" => "André Pirard", "=?ISO-8859-1*en?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?= =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=" => "If you can read this you understand the example.", + # multiple (separated by CRLF) + "=?US-ASCII?Q?a?=\r\n=?US-ASCII?Q?b?=" => "a\r\nb", + "a\r\nb" => "a\r\nb", + "a\r\n\r\nb" => "a\r\n\r\nb", + "a\r\n\r\nb\r\n" => "a\r\n\r\nb\r\n", + # multiple multiline (separated by CRLF) + "=?US-ASCII?Q?a?=\r\n =?US-ASCII?Q?b?=\r\n=?US-ASCII?Q?c?=" => "ab\r\nc", + "a\r\n b\r\nc" => "a b\r\nc", # RT67569 "foo =?us-ascii?q?bar?=" => "foo bar", "foo\r\n =?us-ascii?q?bar?=" => "foo bar", @@ -63,16 +85,38 @@ my @decode_tests = ( "foo\r\n bar" => "foo bar", "=?us-ascii?q?foo?= =?us-ascii?q?bar?=" => "foobar", "=?us-ascii?q?foo?=\r\n =?us-ascii?q?bar?=" => "foobar", - "=?us-ascii?q?foo bar?=" => "=?us-ascii?q?foo bar?=", - "=?us-ascii?q?foo\r\n bar?=" => "=?us-ascii?q?foo bar?=", # RT40027 "a: b\r\n c" => "a: b c", # RT104422 "=?utf-8?Q?pre?= =?utf-8?B?IGZvbw==?=\r\n =?utf-8?Q?bar?=" => "pre foobar", + # RT114034 - replace invalid UTF-8 sequence with unicode replacement character + "=?utf-8?Q?=f9=80=80=80=80?=" => "�", + "=?utf-8?Q?=28=c3=29?=" => "(�)", + # decode only known MIME charsets, do not crash on invalid + "prefix =?unknown?Q?a=20b=20c?= middle =?US-ASCII?Q?d=20e=20f?= suffix" => "prefix =?unknown?Q?a=20b=20c?= middle d e f suffix", + "prefix =?US-ASCII?Q?a_b_c?= =?unknown?Q?d_e_f?= suffix" => "prefix a b c =?unknown?Q?d_e_f?= suffix", + "prefix =?US-ASCII?Q?a_b_c?= =?unknown?Q?d_e_f?= =?US-ASCII?Q?g_h_i?= suffix" => "prefix a b c =?unknown?Q?d_e_f?= g h i suffix", + # long strings + @decode_long_tests, + # separators around encoded words + "\r\n =?US-ASCII?Q?a?=" => " a", + "\r\n (=?US-ASCII?Q?a?=)" => " (a)", + "\r\n (=?US-ASCII?Q?a?=)\r\n " => " (a) ", + "(=?US-ASCII?Q?a?=)\r\n " => "(a) ", + " (=?US-ASCII?Q?a?=) " => " (a) ", + "(=?US-ASCII?Q?a?=) " => "(a) ", + " (=?US-ASCII?Q?a?=)" => " (a)", + "(=?US-ASCII?Q?a?=)(=?US-ASCII?Q?b?=)" => "(a)(b)", + "(=?US-ASCII?Q?a?=) (=?US-ASCII?Q?b?=)" => "(a) (b)", + "(=?US-ASCII?Q?a?=)\r\n (=?US-ASCII?Q?b?=)" => "(a) (b)", + "\r\n (=?US-ASCII?Q?a?=)\r\n (=?US-ASCII?Q?b?=)\r\n " => " (a) (b) ", + "\r\n(=?US-ASCII?Q?a?=)\r\n(=?US-ASCII?Q?b?=)" => "\r\n(a)\r\n(b)", ); my @decode_default_tests = ( @decode_tests, + "=?us-ascii?q?foo bar?=" => "foo bar", + "=?us-ascii?q?foo\r\n bar?=" => "foo bar", '=?us-ascii?q?foo=20=3cbar=40baz=2efoo=3e=20bar?=' => 'foo <bar@baz.foo> bar', '"=?us-ascii?q?foo=20=3cbar=40baz=2efoo=3e=20bar?="' => '"foo <bar@baz.foo> bar"', "=?us-ascii?q?foo?==?us-ascii?q?bar?=" => "foobar", @@ -82,12 +126,35 @@ my @decode_default_tests = ( "[=?UTF-8?B?ZsOzcnVt?=]=?UTF-8?B?IHNwcsOhdmE=?=" => "[fórum] správa", "test:=?UTF-8?B?IHNwcsOhdmE=?=" => "test: správa", "=?UTF-8?B?dMOpc3Q=?=:=?UTF-8?B?IHNwcsOhdmE=?=", "tést: správa", + # multiple base64 parts in one b word + "=?us-ascii?b?Zg==Zg==?=" => "ff", + # b word with invalid characters + "=?us-ascii?b?Zm!!9!v?=" => "foo", + # concat consecutive words (with same parameters) and join them into one utf-8 symbol + "=?UTF-8?Q?=C3?= =?UTF-8?Q?=A1?=" => "á", + # RT114034 - use strict UTF-8 decoder for invalid MIME charsets utf8, UTF8 and utf-8-strict + "=?utf8?Q?=C3=A1=f9=80=80=80=80?=" => "á�", + "=?UTF8?Q?=C3=A1=f9=80=80=80=80?=" => "á�", + "=?utf-8-strict?Q?=C3=A1=f9=80=80=80=80?=" => "á�", ); my @decode_strict_tests = ( @decode_tests, + "=?us-ascii?q?foo bar?=" => "=?us-ascii?q?foo bar?=", + "=?us-ascii?q?foo\r\n bar?=" => "=?us-ascii?q?foo bar?=", '=?us-ascii?q?foo=20=3cbar=40baz=2efoo=3e=20bar?=' => 'foo <bar@baz.foo> bar', '"=?us-ascii?q?foo=20=3cbar=40baz=2efoo=3e=20bar?="' => '"=?us-ascii?q?foo=20=3cbar=40baz=2efoo=3e=20bar?="', + # do not decode invalid q words + "=?us-ascii?q?foo=?=" => "=?us-ascii?q?foo=?=", + "=?us-ascii?q?foo=?= =?us-ascii?q?foo?=" => "=?us-ascii?q?foo=?= foo", + # do not decode invalid b words + "=?us-ascii?b?----?=" => "=?us-ascii?b?----?=", + "=?us-ascii?b?Zm8=-?= =?us-ascii?b?Zm9v?= and =?us-ascii?b?Zg==?=" => "=?us-ascii?b?Zm8=-?= foo and f", + "=?us-ascii?b?----?= =?us-ascii?b?Zm9v?= and =?us-ascii?b?Zg==?=" => "=?us-ascii?b?----?= foo and f", + # RT114034 - utf8, UTF8 and also utf-8-strict are invalid MIME charset, do not decode it + "=?utf8?Q?=C3=A1?=" => "=?utf8?Q?=C3=A1?=", + "=?UTF8?Q?=C3=A1?=" => "=?UTF8?Q?=C3=A1?=", + "=?utf-8-strict?Q?=C3=A1?=" => "=?utf-8-strict?Q?=C3=A1?=", ); my @encode_tests = ( @@ -106,41 +173,161 @@ my @encode_tests = ( # RT88717 "Hey foo\x{2024}bar:whee" => "=?UTF-8?B?SGV5IGZvb+KApGJhcjp3aGVl?=", "=?UTF-8?Q?Hey_foo=E2=80=A4bar=3Awhee?=", # valid q chars - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz !*+-/" => "=?UTF-8?B?MDEyMzQ1Njc4OUFCQ0RFRkdISUpLTE1OT1BRUlNUVVZXWFlaYWJjZGVmZ2hpams=?=\r\n =?UTF-8?B?bG1ub3BxcnN0dXZ3eHl6ICEqKy0v?=", "=?UTF-8?Q?0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_?=\r\n =?UTF-8?Q?!*+-/?=", + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz !*+-/" => "=?UTF-8?B?MDEyMzQ1Njc4OUFCQ0RFRkdISUpLTE1OT1BRUlNUVVZXWFlaYWJjZGVmZ2hp?=\r\n =?UTF-8?B?amtsbW5vcHFyc3R1dnd4eXogISorLS8=?=", "=?UTF-8?Q?0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_?=\r\n =?UTF-8?Q?!*+-/?=", # invalid q chars "." => "=?UTF-8?B?Lg==?=", "=?UTF-8?Q?=2E?=", "," => "=?UTF-8?B?LA==?=", "=?UTF-8?Q?=2C?=", + # long ascii sequence + "a" x 100 => "=?UTF-8?B?YWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFh?=\r\n =?UTF-8?B?YWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFh?=\r\n =?UTF-8?B?YWFhYWFhYWFhYQ==?=", "=?UTF-8?Q?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa?=\r\n =?UTF-8?Q?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa?=", + # long unicode sequence + "😀" x 100 => "=?UTF-8?B?8J+YgPCfmIDwn5iA8J+YgPCfmIDwn5iA8J+YgPCfmIDwn5iA8J+YgPCfmIA=?=\r\n " x 9 . "=?UTF-8?B?8J+YgA==?=", join("\r\n ", ("=?UTF-8?Q?=F0=9F=98=80=F0=9F=98=80=F0=9F=98=80=F0=9F=98=80=F0=9F=98=80?=") x 20), ); sub info { - my ($str) = @_; + my ($str, $str1, $str2) = @_; + substr $str1, 1000, -3, "..." if defined $str1 and length $str1 > 1000; + substr $str2, 1000, -3, "..." if defined $str2 and length $str2 > 1000; + $str .= ": $str1" if defined $str1; + $str .= " => $str2" if defined $str2; $str = Encode::encode_utf8($str); $str =~ s/\r/\\r/gs; $str =~ s/\n/\\n/gs; return $str; } +sub check_length { + my ($str) = @_; + my @lines = split /\r\n /, $str; + my @long = grep { length($_) > 75 } @lines; + return scalar @long == 0; +} + my @splice; @splice = @encode_tests; while (my ($d, $b, $q) = splice @splice, 0, 3) { - is Encode::encode('MIME-Header', $d) => $b, info("encode default: $d => $b"); - is Encode::encode('MIME-B', $d) => $b, info("encode base64: $d => $b"); - is Encode::encode('MIME-Q', $d) => $q, info("encode qp: $d => $q"); - is Encode::decode('MIME-B', $b) => $d, info("decode base64: $b => $d"); - is Encode::decode('MIME-Q', $q) => $d, info("decode qp: $b => $d"); + is Encode::encode("MIME-Header", $d) => $b, info("encode default", $d => $b); + is Encode::encode("MIME-B", $d) => $b, info("encode base64", $d => $b); + is Encode::encode("MIME-Q", $d) => $q, info("encode qp", $d => $q); + is Encode::decode("MIME-B", $b) => $d, info("decode base64", $b => $d); + is Encode::decode("MIME-Q", $q) => $d, info("decode qp", $b => $d); + ok check_length($b), info("correct encoded length base64", $b); + ok check_length($q), info("correct encoded length qp", $q); } @splice = @decode_default_tests; while (my ($e, $d) = splice @splice, 0, 2) { - is Encode::decode('MIME-Header', $e) => $d, info("decode default: $e => $d"); + is Encode::decode("MIME-Header", $e) => $d, info("decode default", $e => $d); } local $Encode::MIME::Header::STRICT_DECODE = 1; @splice = @decode_strict_tests; while (my ($e, $d) = splice @splice, 0, 2) { - is Encode::decode('MIME-Header', $e) => $d, info("decode strict: $e => $d"); + is Encode::decode("MIME-Header", $e) => $d, info("decode strict", $e => $d); +} + +my $valid_unicode = "á"; +my $invalid_unicode = "\x{1000000}"; +{ + my $input = $valid_unicode; + my $output = Encode::encode("MIME-Header", $input, Encode::FB_QUIET); + is $output => Encode::encode("MIME-Header", $valid_unicode), "encode valid with FB_QUIET flag: output string is valid"; + is $input => "", "encode valid with FB_QUIET flag: input string is modified and empty"; +} +{ + my $input = $valid_unicode . $invalid_unicode; + my $output = Encode::encode("MIME-Header", $input, Encode::FB_QUIET); + is $output => Encode::encode("MIME-Header", $valid_unicode), "encode with FB_QUIET flag: output string stops before first invalid character"; + is $input => $invalid_unicode, "encode with FB_QUIET flag: input string is modified and starts with first invalid character"; +} +{ + my $input = $valid_unicode . $invalid_unicode; + my $output = Encode::encode("MIME-Header", $input, Encode::FB_QUIET | Encode::LEAVE_SRC); + is $output => Encode::encode("MIME-Header", $valid_unicode), "encode with FB_QUIET and LEAVE_SRC flags: output string stops before first invalid character"; + is $input => $valid_unicode . $invalid_unicode, "encode with FB_QUIET and LEAVE_SRC flags: input string is not modified"; +} +{ + my $input = $valid_unicode . $invalid_unicode; + my $output = Encode::encode("MIME-Header", $input, Encode::FB_PERLQQ); + is $output => Encode::encode("MIME-Header", $valid_unicode . '\x{1000000}'), "encode with FB_PERLQQ flag: output string contains perl qq representation of invalid character"; + is $input => $valid_unicode . $invalid_unicode, "encode with FB_PERLQQ flag: input string is not modified"; +} +{ + my $input = $valid_unicode; + my $output = Encode::encode("MIME-Header", $input, sub { sprintf("!0x%X!", $_[0]) }); + is $output => Encode::encode("MIME-Header", $valid_unicode), "encode valid with coderef check: output string is valid"; + is $input => $valid_unicode, "encode valid with coderef check: input string is not modified"; +} +{ + my $input = $valid_unicode . $invalid_unicode; + my $output = Encode::encode("MIME-Header", $input, sub { sprintf("!0x%X!", $_[0]) }); + is $output => Encode::encode("MIME-Header", $valid_unicode . '!0x1000000!'), "encode with coderef check: output string contains output from coderef"; + is $input => $valid_unicode . $invalid_unicode, "encode with coderef check: input string is not modified"; +} + +my $valid_mime = "=?US-ASCII?Q?d=20e=20f?="; +my $invalid_mime = "=?unknown?Q?a=20b=20c?="; +my $invalid_mime_unicode = "=?utf-8?Q?=28=c3=29?="; +{ + my $input = $valid_mime; + my $output = Encode::decode("MIME-Header", $input, Encode::FB_QUIET); + is $output => Encode::decode("MIME-Header", $valid_mime), "decode valid with FB_QUIET flag: output string is valid"; + is $input => "", "decode valid with FB_QUIET flag: input string is modified and empty"; +} +{ + my $input = $valid_mime . " " . $invalid_mime; + my $output = Encode::decode("MIME-Header", $input, Encode::FB_QUIET); + is $output => Encode::decode("MIME-Header", $valid_mime), "decode with FB_QUIET flag: output string stops before first mime word with unknown charset"; + is $input => $invalid_mime, "decode with FB_QUIET flag: input string is modified and starts with first mime word with unknown charset"; +} +{ + my $input = $valid_mime . " " . $invalid_mime_unicode; + my $output = Encode::decode("MIME-Header", $input, Encode::FB_QUIET); + is $output => Encode::decode("MIME-Header", $valid_mime), "decode with FB_QUIET flag: output string stops before first mime word with invalid unicode character"; + is $input => $invalid_mime_unicode, "decode with FB_QUIET flag: input string is modified and starts with first mime word with invalid unicode character"; +} +{ + my $input = $valid_mime . " " . $invalid_mime; + my $output = Encode::decode("MIME-Header", $input, Encode::FB_QUIET | Encode::LEAVE_SRC); + is $output => Encode::decode("MIME-Header", $valid_mime), "decode with FB_QUIET and LEAVE_SRC flags: output string stops before first mime word with unknown charset"; + is $input => $valid_mime . " " . $invalid_mime, "decode with FB_QUIET flag: input string is not modified"; +} +{ + my $input = $valid_mime . " " . $invalid_mime_unicode; + my $output = Encode::decode("MIME-Header", $input, Encode::FB_QUIET | Encode::LEAVE_SRC); + is $output => Encode::decode("MIME-Header", $valid_mime), "decode with FB_QUIET and LEAVE_SRC flags: output string stops before first mime word with invalid unicode character"; + is $input => $valid_mime . " " . $invalid_mime_unicode, "decode with FB_QUIET flag: input string is not modified"; +} +{ + my $input = $valid_mime . " " . $invalid_mime; + my $output = Encode::decode("MIME-Header", $input, Encode::FB_PERLQQ); + is $output => Encode::decode("MIME-Header", $valid_mime) . " " . $invalid_mime, "decode with FB_PERLQQ flag: output string contains unmodified mime word with unknown charset"; + is $input => $valid_mime . " " . $invalid_mime, "decode with FB_QUIET flag: input string is not modified"; +} +{ + my $input = $valid_mime . " " . $invalid_mime_unicode; + my $output = Encode::decode("MIME-Header", $input, Encode::FB_PERLQQ); + is $output => Encode::decode("MIME-Header", $valid_mime) . '(\xC3)', "decode with FB_PERLQQ flag: output string contains perl qq representation of invalid unicode character"; + is $input => $valid_mime . " " . $invalid_mime_unicode, "decode with FB_QUIET flag: input string is not modified"; +} +{ + my $input = $valid_mime; + my $output = Encode::decode("MIME-Header", $input, sub { sprintf("!0x%X!", $_[0]) }); + is $output => Encode::decode("MIME-Header", $valid_mime), "decode valid with coderef check: output string is valid"; + is $input => $valid_mime, "decode valid with coderef check: input string is not modified"; +} +{ + my $input = $valid_mime . " " . $invalid_mime; + my $output = Encode::decode("MIME-Header", $input, sub { sprintf("!0x%X!", $_[0]) }); + is $output => Encode::decode("MIME-Header", $valid_mime) . " " . $invalid_mime, "decode with coderef check: output string contains unmodified mime word with unknown charset"; + is $input => $valid_mime . " " . $invalid_mime, "decode with coderef check: input string is not modified"; +} +{ + my $input = $valid_mime . " " . $invalid_mime_unicode; + my $output = Encode::decode("MIME-Header", $input, sub { sprintf("!0x%X!", $_[0]) }); + is $output => Encode::decode("MIME-Header", $valid_mime) . '(!0xC3!)', "decode with coderef check: output string contains output from coderef for invalid unicode character"; + is $input => $valid_mime . " " . $invalid_mime_unicode, "decode with coderef check: input string is not modified"; } __END__ diff --git a/cpan/Encode/t/mime-name.t b/cpan/Encode/t/mime-name.t index 02ff49053a..ced4e7c031 100644 --- a/cpan/Encode/t/mime-name.t +++ b/cpan/Encode/t/mime-name.t @@ -1,5 +1,5 @@ # -# $Id: mime-name.t,v 1.1 2007/05/12 06:42:19 dankogai Exp $ +# $Id: mime-name.t,v 1.2 2016/10/28 05:03:52 dankogai Exp $ # This script is written in utf8 # BEGIN { @@ -23,14 +23,40 @@ use strict; use warnings; use Encode; #use Test::More qw(no_plan); -use Test::More tests => 68; +use Test::More tests => 277; + +BEGIN { + use_ok("Encode::MIME::Name"); +} -use_ok("Encode::MIME::Name"); for my $canon ( sort keys %Encode::MIME::Name::MIME_NAME_OF ) { my $enc = find_encoding($canon); my $mime_name = $Encode::MIME::Name::MIME_NAME_OF{$canon}; is $enc->mime_name, $mime_name, - qq(\$enc->mime_name("$canon") eq $mime_name); + qq(find_encoding($canon)->mime_name eq $mime_name); + is $enc->name, $canon, + qq(find_encoding($canon)->name eq $canon); +} +for my $mime_name ( sort keys %Encode::MIME::Name::ENCODE_NAME_OF ) { + my $enc = find_mime_encoding($mime_name); + my $canon = $Encode::MIME::Name::ENCODE_NAME_OF{$mime_name}; + my $mime_name = $Encode::MIME::Name::MIME_NAME_OF{$canon}; + is $enc->mime_name, $mime_name, + qq(find_mime_encoding($mime_name)->mime_name eq $mime_name); + is $enc->name, $canon, + qq(find_mime_encoding($mime_name)->name eq $canon); } +ok find_encoding("utf8"); +ok find_encoding("UTF8"); +ok find_encoding("utf-8-strict"); +ok find_encoding("utf-8"); +ok find_encoding("UTF-8"); + +ok not find_mime_encoding("utf8"); +ok not find_mime_encoding("UTF8"); +ok not find_mime_encoding("utf-8-strict"); +ok find_mime_encoding("utf-8"); +ok find_mime_encoding("UTF-8"); + __END__; diff --git a/cpan/Encode/t/rt113164.t b/cpan/Encode/t/rt113164.t new file mode 100644 index 0000000000..f0a94ea066 --- /dev/null +++ b/cpan/Encode/t/rt113164.t @@ -0,0 +1,38 @@ +BEGIN { + if ($ENV{'PERL_CORE'}) { + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # Skip: EBCDIC\n"; + exit 0; + } + $| = 1; +} + +use strict; +use warnings; + +use Test::More tests => 2; + +use Encode; + +my $str = "You" . chr(8217) . "re doomed!"; + +my $data; + +my $cb = sub { + $data = [ ('?') x 12_500 ]; + return ";"; +}; + +my $octets = encode('iso-8859-1', $str, $cb); +is $octets, "You;re doomed!", "stack was not overwritten"; + +$octets = encode('iso-8859-1', $str, $cb); +is $octets, "You;re doomed!", "stack was not overwritten"; diff --git a/cpan/Encode/t/rt65541.t b/cpan/Encode/t/rt65541.t new file mode 100644 index 0000000000..4a75ce3c7b --- /dev/null +++ b/cpan/Encode/t/rt65541.t @@ -0,0 +1,29 @@ +BEGIN { + if ($ENV{'PERL_CORE'}) { + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # Skip: EBCDIC\n"; + exit 0; + } + $| = 1; +} + +use strict; +use warnings; + +use Encode; +use PerlIO::encoding; +$PerlIO::encoding::fallback &= ~Encode::WARN_ON_ERR; + +use Test::More tests => 3; + +ok open my $fh, ">:encoding(cp1250)", do{\(my $str)}; +ok print $fh ("a" x 1023) . "\x{0378}"; +ok close $fh; diff --git a/cpan/Encode/t/rt76824.t b/cpan/Encode/t/rt76824.t new file mode 100644 index 0000000000..5d057f607e --- /dev/null +++ b/cpan/Encode/t/rt76824.t @@ -0,0 +1,60 @@ +BEGIN { + if ($ENV{'PERL_CORE'}) { + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # Skip: EBCDIC\n"; + exit 0; + } + $| = 1; +} + +use strict; +use warnings; + +use Encode; +use PerlIO::encoding; +$PerlIO::encoding::fallback &= ~Encode::WARN_ON_ERR; + +use Test::More tests => 2; + +my $out; +my @arr = ( + "\x{feff}\x{39f}\x{3af} \x{3a3}\x{3c5}\x{3bd}\x{3ad}\x{3bd}\x{3bf}\x{3c7}\x{3bf}\x{3b9}\n", + "\x{39f}\x{3b9} \x{393}\x{3b5}\x{3bd}\x{3bd}\x{3b1}\x{3af}\x{3bf}\x{3b9} \x{3c4}\x{3b7}\x{3c2} \x{3a3}\x{3b1}\x{3bc}\x{3bf}\x{3b8}\x{3c1}\x{3ac}\x{3ba}\x{3b7}\x{3c2}\n", + "\x{39f}\x{3b9} \x{393}\x{3b5}\x{3c1}\x{3bc}\x{3b1}\x{3bd}\x{3bf}\x{3af} \x{3be}\x{3b1}\x{3bd}\x{3ac}\x{3c1}\x{3c7}\x{3bf}\x{3bd}\x{3c4}\x{3b1}\x{3b9}...\n", + "\x{39f}\x{3b9} \x{395}\x{3c1}\x{3b1}\x{3c3}\x{3c4}\x{3ad}\x{3c2} \x{3a4}\x{3bf}\x{3c5} \x{391}\x{3b9}\x{3b3}\x{3b1}\x{3af}\x{3bf}\x{3c5}\n", + "\x{39f}\x{3b9} \x{39a}\x{3c5}\x{3bd}\x{3b7}\x{3b3}\x{3bf}\x{3af}\n", + "\x{39f}\x{3b9} \x{3a0}\x{3b1}\x{3bd}\x{3ba}\x{3c2} \x{3a4}\x{3b1} \x{39a}\x{3ac}\x{3bd}\x{3bf}\x{3c5}\x{3bd} \x{38c}\x{3bb}\x{3b1}\n", + "\x{39f}\x{3b9} \x{3a6}\x{3b1}\x{3bd}\x{3c4}\x{3b1}\x{3c1}\x{3af}\x{3bd}\x{3b5}\x{3c2}\n", + "\x{39f}\x{3b9}\x{3ba}\x{3bf}\x{3b3}\x{3ad}\x{3bd}\x{3b5}\x{3b9}\x{3b1} \x{3a0}\x{3b1}\x{3bd}\x{3c4}\x{3c1}\x{3b5}\x{3c5}\x{3cc}\x{3bc}\x{3b1}\x{3c3}\x{3c4}\x{3b5}\n", + "\x{39f}\x{3bb}\x{3b1} \x{3b5}\x{3af}\x{3bd}\x{3b1}\x{3b9} \x{3b4}\x{3c1}\x{3cc}\x{3bc}\x{3bf}\x{3c2}\n", + "\x{39f}\x{3bc}\x{3b7}\x{3c1}\x{3bf}\x{3c2}\n", + "\x{39f}\x{3be}\x{3c5}\x{3b3}\x{3cc}\x{3bd}\x{3bf}\n", + "\x{39f}\x{3c1}\x{3b1}\x{3c4}\x{3cc}\x{3c4}\x{3b7}\x{3c2} \x{3bc}\x{3b7}\x{3b4}\x{3ad}\x{3bd}\n", + "\x{3c0}\n", + "\x{3c0}\x{3ac}\x{3bd}\x{3c9}, \x{3ba}\x{3ac}\x{3c4}\x{3c9} \x{3ba}\x{3b1}\x{3b9} \x{3c0}\x{3bb}\x{3b1}\x{3b3}\x{3af}\x{3c9}\x{3c2}\n", + "\x{3a4}\x{3bf} \x{39a}\x{3b1}\x{3ba}\x{3cc}\n", + "\x{3a4}\x{3bf} \x{39a}\x{3b1}\x{3ba}\x{3cc} - \x{3a3}\x{3c4}\x{3b7}\x{3bd} \x{395}\x{3c0}\x{3bf}\x{3c7}\x{3ae} \x{3c4}\x{3c9}\x{3bd} \x{397}\x{3c1}\x{3ce}\x{3c9}\x{3bd}\n", + "\x{3a4}\x{3bf} \x{3ba}\x{3bb}\x{3ac}\x{3bc}\x{3b1} \x{3b2}\x{3b3}\x{3ae}\x{3ba}\x{3b5} \x{3b1}\x{3c0}'\x{3c4}\x{3bf}\x{3bd} \x{3c0}\x{3b1}\x{3c1}\x{3ac}\x{3b4}\x{3b5}\x{3b9}\x{3c3}\x{3bf}\n", + "\x{3a4}\x{3bf} \x{3ba}\x{3bf}\x{3c1}\x{3af}\x{3c4}\x{3c3}\x{3b9} \x{3bc}\x{3b5} \x{3c4}\x{3b1} \x{3bc}\x{3b1}\x{3cd}\x{3c1}\x{3b1}\n", + "\x{3a4}\x{3bf} \x{3ba}\x{3bf}\x{3c1}\x{3af}\x{3c4}\x{3c3}\x{3b9} \x{3c4}\x{3bf}\x{3c5} \x{3bb}\x{3bf}\x{3cd}\x{3bd}\x{3b1} \x{3c0}\x{3b1}\x{3c1}\x{3ba}\n", + "\x{3a4}\x{3bf} \x{39e}\x{3cd}\x{3bb}\x{3bf} \x{3b2}\x{3b3}\x{3ae}\x{3ba}\x{3b5} \x{3b1}\x{3c0}\x{3cc} \x{3c4}\x{3bf}\x{3bd} \x{3c0}\x{3b1}\x{3c1}\x{3ac}\x{3b4}\x{3b5}\x{3b9}\x{3c3}\x{3bf}\n", + "\x{3a4}\x{3bf} \x{3c0}\x{3b9}\x{3bf} \x{3bb}\x{3b1}\x{3bc}\x{3c0}\x{3c1}\x{3cc} \x{3b1}\x{3c3}\x{3c4}\x{3ad}\x{3c1}\x{3b9}\n", + "\x{3a4}\x{3bf} \x{3a1}\x{3b5}\x{3bc}\x{3b1}\x{3bb}\x{3b9} \x{3a4}\x{3b7}\x{3c2} \x{391}\x{3b8}\x{3b7}\x{3bd}\x{3b1}\x{3c2}\n", + "\x{3a4}\x{3bf} \x{3a4}\x{3b1}\x{3bd}\x{3b3}\x{3ba}\x{3cc} \x{3c4}\x{3c9}\x{3bd} \x{3a7}\x{3c1}\x{3b9}\x{3c3}\x{3c4}\x{3bf}\x{3c5}\x{3b3}\x{3ad}\x{3bd}\x{3bd}\x{3c9}\x{3bd}\n", + "\x{3a4}\x{3bf} \x{3c4}\x{3b5}\x{3bb}\x{3b5}\x{3c5}\x{3c4}\x{3b1}\x{3af}\x{3bf} \x{3c8}\x{3ad}\x{3bc}\x{3bc}\x{3b1}\n", + "\x{3a4}\x{3bf} \x{3c6}\x{3b9}\x{3bb}\x{3af} \x{3c4}\x{3b7}\x{3c2}... \x{396}\x{3c9}\x{3ae}\x{3c2}\n", + "\x{3a4}\x{3bf} \x{3c7}\x{3ce}\x{3bc}\x{3b1} \x{3b2}\x{3ac}\x{3c6}\x{3c4}\x{3b7}\x{3ba}\x{3b5} \x{3ba}\x{3cc}\x{3ba}\x{3ba}\x{3b9}\x{3bd}\x{3bf}\n", + "\x{3a4}\x{3bf}\x{3c0}\x{3af}\x{3bf} \x{3c3}\x{3c4}\x{3b7}\x{3bd} \x{3bf}\x{3bc}\x{3af}\x{3c7}\x{3bb}\x{3b7}\n", + "\x{3a4}\x{3c1}\x{3b9}\x{3bb}\x{3bf}\x{3b3}\x{3af}\x{3b1} 1: \x{3a4}\x{3bf} \x{39b}\x{3b9}\x{3b2}\x{3ac}\x{3b4}\x{3b9} \x{3c0}\x{3bf}\x{3c5} \x{3b4}\x{3b1}\x{3ba}\x{3c1}\x{3cd}\x{3b6}\x{3b5}\x{3b9}\n" + ); +ok open my $wh, '>:crlf:encoding(ISO-8859-1)', \$out; +print $wh $_ for @arr; +ok close $wh; diff --git a/cpan/Encode/t/rt85489.t b/cpan/Encode/t/rt85489.t new file mode 100644 index 0000000000..3b28e35af6 --- /dev/null +++ b/cpan/Encode/t/rt85489.t @@ -0,0 +1,48 @@ +BEGIN { + if ($ENV{'PERL_CORE'}) { + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # Skip: EBCDIC\n"; + exit 0; + } + $| = 1; +} + +use strict; +use warnings; + +use Test::More tests => 8; + +use Encode; + +my $ascii = Encode::find_encoding("ascii"); +my $orig = "str"; + +my $str = $orig; +ok !Encode::is_utf8($str), "UTF8 flag is not set on input string before ascii encode"; +$ascii->encode($str); +ok !Encode::is_utf8($str), "UTF8 flag is not set on input string after ascii encode"; + +$str = $orig; +ok !Encode::is_utf8($str), "UTF8 flag is not set on input string before Encode::encode ascii"; +Encode::encode("ascii", $str); +ok !Encode::is_utf8($str), "UTF8 flag is not set on input string after Encode::encode ascii"; + +$str = $orig; +Encode::_utf8_on($str); +ok Encode::is_utf8($str), "UTF8 flag is set on input string before ascii decode"; +$ascii->decode($str); +ok Encode::is_utf8($str), "UTF8 flag is set on input string after ascii decode"; + +$str = $orig; +Encode::_utf8_on($str); +ok Encode::is_utf8($str), "UTF8 flag is set on input string before Encode::decode ascii"; +Encode::decode("ascii", $str); +ok Encode::is_utf8($str), "UTF8 flag is set on input string after Encode::decode ascii"; diff --git a/cpan/Encode/t/rt86327.t b/cpan/Encode/t/rt86327.t new file mode 100644 index 0000000000..91527f849c --- /dev/null +++ b/cpan/Encode/t/rt86327.t @@ -0,0 +1,33 @@ +BEGIN { + if ($ENV{'PERL_CORE'}) { + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # Skip: EBCDIC\n"; + exit 0; + } + $| = 1; +} + +use strict; +use warnings; + +use Encode; +use PerlIO::encoding; +$PerlIO::encoding::fallback &= ~Encode::WARN_ON_ERR; + +use Test::More tests => 3; + +my @t = qw/230 13 90 65 34 239 86 15 8 26 181 25 305 123 22 139 111 6 3 +100 37 1 20 1 166 1 300 19 1 42 153 81 106 114 67 1 32 34/; +my $str; +ok open OUT, '>:encoding(iso-8859-1)', \$str; +my $string = join "\x{fffd}", map { '.'x$_ } @t; +ok print OUT $string; +ok close OUT; diff --git a/cpan/Encode/t/taint.t b/cpan/Encode/t/taint.t index 2446dd76d3..6fa46bd957 100644 --- a/cpan/Encode/t/taint.t +++ b/cpan/Encode/t/taint.t @@ -1,13 +1,17 @@ #!/usr/bin/perl -T use strict; use Encode qw(encode decode); +local %Encode::ExtModule = %Encode::Config::ExtModule; use Scalar::Util qw(tainted); use Test::More; my $taint = substr($ENV{PATH},0,0); my $str = "dan\x{5f3e}" . $taint; # tainted string to encode my $bin = encode('UTF-8', $str); # tainted binary to decode +my $notaint = ""; +my $notaint_str = "dan\x{5f3e}" . $notaint; +my $notaint_bin = encode('UTF-8', $notaint_str); my @names = Encode->encodings(':all'); -plan tests => 2 * @names; +plan tests => 4 * @names + 2; for my $name (@names) { my ($d, $e, $s); eval { @@ -26,3 +30,25 @@ for my $name (@names) { ok tainted($d), "decode $name"; } } +for my $name (@names) { + my ($d, $e, $s); + eval { + $e = encode($name, $notaint_str); + }; + SKIP: { + skip $@, 1 if $@; + ok ! tainted($e), "encode $name"; + } + $notaint_bin = $e.$notaint if $e; + eval { + $d = decode($name, $notaint_bin); + }; + SKIP: { + skip $@, 1 if $@; + ok ! tainted($d), "decode $name"; + } +} +Encode::_utf8_on($bin); +ok(!Encode::is_utf8($bin), "Encode::_utf8_on does not work on tainted values"); +Encode::_utf8_off($str); +ok(Encode::is_utf8($str), "Encode::_utf8_off does not work on tainted values"); diff --git a/cpan/Encode/t/utf8ref.t b/cpan/Encode/t/utf8ref.t index 3253e08639..288f15b44f 100644 --- a/cpan/Encode/t/utf8ref.t +++ b/cpan/Encode/t/utf8ref.t @@ -1,12 +1,12 @@ # -# $Id: utf8ref.t,v 1.1 2010/09/18 18:39:51 dankogai Exp $ +# $Id: utf8ref.t,v 1.2 2016/10/28 05:03:52 dankogai Exp $ # use strict; use warnings; use Encode; use Test::More; -plan tests => 4; +plan tests => 12; #plan 'no_plan'; # my $a = find_encoding('ASCII'); @@ -14,7 +14,20 @@ my $u = find_encoding('UTF-8'); my $r = []; no warnings 'uninitialized'; is encode_utf8($r), ''.$r; -is $u->encode($r), ''; +is $u->encode($r), ''.$r; $r = {}; is decode_utf8($r), ''.$r; -is $u->decode($r), ''; +is $u->decode($r), ''.$r; +use warnings 'uninitialized'; + +is encode_utf8(undef), undef; +is decode_utf8(undef), undef; + +is encode_utf8(''), ''; +is decode_utf8(''), ''; + +is Encode::encode('utf8', undef), undef; +is Encode::decode('utf8', undef), undef; + +is Encode::encode('utf8', ''), ''; +is Encode::decode('utf8', ''), ''; diff --git a/cpan/Encode/t/utf8strict.t b/cpan/Encode/t/utf8strict.t index 3f362f4981..39293d3067 100644 --- a/cpan/Encode/t/utf8strict.t +++ b/cpan/Encode/t/utf8strict.t @@ -47,8 +47,8 @@ BEGIN { qq/dd 67 41 41/ => 0, # 2.3.2 qq/ee 42 73 73 71/ => 0, # 2.3.3 qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG - # "3 Malformed sequences" are checked by perl. - # "4 Overlong sequences" are checked by perl. + # EBCDIC TODO: "3 Malformed sequences" + # EBCDIC TODO: "4 Overlong sequences" ); } else { %SEQ = ( @@ -56,8 +56,49 @@ BEGIN { qq/ee 80 80/ => 0, # 2.3.2 qq/f4 8f bf bd/ => 0, # 2.3.3 qq/f4 90 80 80/ => 1, # 2.3.4 -- out of range so NG - # "3 Malformed sequences" are checked by perl. - # "4 Overlong sequences" are checked by perl. + qq/80/ => 1, # 3.1.1 + qq/bf/ => 1, # 3.1.2 + qq/80 bf/ => 1, # 3.1.3 + qq/80 bf 80/ => 1, # 3.1.4 + qq/80 bf 80 bf/ => 1, # 3.1.5 + qq/80 bf 80 bf 80/ => 1, # 3.1.6 + qq/80 bf 80 bf 80 bf/ => 1, # 3.1.7 + qq/80 bf 80 bf 80 bf 80/ => 1, # 3.1.8 + qq/80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf/ => 1, # 3.1.9 + qq/c0 20 c1 20 c2 20 c3 20 c4 20 c5 20 c6 20 c7 20 c8 20 c9 20 ca 20 cb 20 cc 20 cd 20 ce 20 cf 20 d0 20 d1 20 d2 20 d3 20 d4 20 d5 20 d6 20 d7 20 d8 20 d9 20 da 20 db 20 dc 20 dd 20 de 20 df 20/ => 1, # 3.2.1 + qq/e0 20 e1 20 e2 20 e3 20 e4 20 e5 20 e6 20 e7 20 e8 20 e9 20 ea 20 eb 20 ec 20 ed 20 ee 20 ef 20/ => 1, # 3.2.2 + qq/f0 20 f1 20 f2 20 f3 20 f4 20 f5 20 f6 20 f7 20/ => 1, # 3.2.3 + qq/f8 20 f9 20 fa 20 fb 20/ => 1, # 3.2.4 + qq/fc 20 fd 20/ => 1, # 3.2.5 + qq/c0/ => 1, # 3.3.1 + qq/e0 80/ => 1, # 3.3.2 + qq/f0 80 80/ => 1, # 3.3.3 + qq/f8 80 80 80/ => 1, # 3.3.4 + qq/fc 80 80 80 80/ => 1, # 3.3.5 + qq/df/ => 1, # 3.3.6 + qq/ef bf/ => 1, # 3.3.7 + qq/f7 bf bf/ => 1, # 3.3.8 + qq/fb bf bf bf/ => 1, # 3.3.9 + qq/fd bf bf bf bf/ => 1, # 3.3.10 + qq/c0 e0 80 f0 80 80 f8 80 80 80 fc 80 80 80 80 df ef bf f7 bf bf fb bf bf bf fd bf bf bf bf/ => 1, # 3.4.1 + qq/fe/ => 1, # 3.5.1 + qq/ff/ => 1, # 3.5.2 + qq/fe fe ff ff/ => 1, # 3.5.3 + qq/c0 af/ => 1, # 4.1.1 + qq/e0 80 af/ => 1, # 4.1.2 + qq/f0 80 80 af/ => 1, # 4.1.3 + qq/f8 80 80 80 af/ => 1, # 4.1.4 + qq/fc 80 80 80 80 af/ => 1, # 4.1.5 + qq/c1 bf/ => 1, # 4.2.1 + qq/e0 9f bf/ => 1, # 4.2.2 + qq/f0 8f bf bf/ => 1, # 4.2.3 + qq/f8 87 bf bf bf/ => 1, # 4.2.4 + qq/fc 83 bf bf bf bf/ => 1, # 4.2.5 + qq/c0 80/ => 1, # 4.3.1 + qq/e0 80 80/ => 1, # 4.3.2 + qq/f0 80 80 80/ => 1, # 4.3.3 + qq/f8 80 80 80 80/ => 1, # 4.3.4 + qq/fc 80 80 80 80 80/ => 1, # 4.3.5 ); } $NTESTS += scalar keys %SEQ; @@ -82,7 +123,7 @@ for my $s (sort keys %SEQ){ eval { $d->decode($o,1) }; $DEBUG and $@ and warn $@; my $t = $@ ? 1 : 0; - is($t, $SEQ{$s}, $s); + is($t, $SEQ{$s}, "sequence: $s"); } __END__ |