diff options
author | Nick Ing-Simmons <nik@tiuk.ti.com> | 2002-04-26 17:36:16 +0000 |
---|---|---|
committer | Nick Ing-Simmons <nik@tiuk.ti.com> | 2002-04-26 17:36:16 +0000 |
commit | 5f228b1d3feafe3247efca23709f3c7bd5daf91b (patch) | |
tree | f917a045995abe71f5d8c726bebf6768680e3d73 /ext | |
parent | 2583bd17aea1ca96fac50929c91872157a7782b3 (diff) | |
parent | cb5780feb6b3d31503eb651fb2d3d543cc89f6c6 (diff) | |
download | perl-5f228b1d3feafe3247efca23709f3c7bd5daf91b.tar.gz |
Integrate mainline
p4raw-id: //depot/perlio@16194
Diffstat (limited to 'ext')
44 files changed, 1817 insertions, 176 deletions
diff --git a/ext/B/B.xs b/ext/B/B.xs index 6392b9b2cc..111116a21b 100644 --- a/ext/B/B.xs +++ b/ext/B/B.xs @@ -668,7 +668,7 @@ OP_ppaddr(o) CODE: sv_setpvn(sv, "PL_ppaddr[OP_", 13); sv_catpv(sv, PL_op_name[o->op_type]); - for (i=13; i<SvCUR(sv); ++i) + for (i=13; (STRLEN)i < SvCUR(sv); ++i) SvPVX(sv)[i] = toUPPER(SvPVX(sv)[i]); sv_catpv(sv, "]"); ST(0) = sv; diff --git a/ext/ByteLoader/ByteLoader.xs b/ext/ByteLoader/ByteLoader.xs index d559bfe57f..4588b02ef0 100644 --- a/ext/ByteLoader/ByteLoader.xs +++ b/ext/ByteLoader/ByteLoader.xs @@ -11,7 +11,7 @@ int bl_getc(struct byteloader_fdata *data) { dTHX; - if (SvCUR(data->datasv) <= data->next_out) { + if (SvCUR(data->datasv) <= (STRLEN)data->next_out) { int result; /* Run out of buffered data, so attempt to read some more */ *(SvPV_nolen (data->datasv)) = '\0'; diff --git a/ext/Cwd/t/cwd.t b/ext/Cwd/t/cwd.t index bcbcd17b3d..92079c0b10 100644 --- a/ext/Cwd/t/cwd.t +++ b/ext/Cwd/t/cwd.t @@ -28,14 +28,18 @@ eval { fastcwd }; # Must find an external pwd (or equivalent) command. +my $pwd = $^O eq 'MSWin32' ? "cmd" : "pwd"; my $pwd_cmd = - ($^O eq "MSWin32" || $^O eq "NetWare") ? + ($^O eq "NetWare") ? "cd" : - (grep { -x && -f } map { "$_/pwd$Config{exe_ext}" } + (grep { -x && -f } map { "$_/$pwd$Config{exe_ext}" } split m/$Config{path_sep}/, $ENV{PATH})[0]; $pwd_cmd = 'SHOW DEFAULT' if $IsVMS; - +if ($^O eq 'MSWin32') { + $pwd_cmd =~ s,/,\\,g; + $pwd_cmd = "$pwd_cmd /c cd"; +} print "# native pwd = '$pwd_cmd'\n"; SKIP: { diff --git a/ext/Data/Dumper/Dumper.xs b/ext/Data/Dumper/Dumper.xs index 19037a839e..383707a4e3 100644 --- a/ext/Data/Dumper/Dumper.xs +++ b/ext/Data/Dumper/Dumper.xs @@ -147,10 +147,10 @@ esc_q_utf8(pTHX_ SV* sv, register char *src, register STRLEN slen) if (k == '"' || k == '\\' || k == '$' || k == '@') { *r++ = '\\'; - *r++ = k; + *r++ = (char)k; } else if (k < 0x80) - *r++ = k; + *r++ = (char)k; else { r += sprintf(r, "\\x{%"UVxf"}", k); } diff --git a/ext/Devel/DProf/DProf.xs b/ext/Devel/DProf/DProf.xs index 3380d786a5..2219bd2189 100644 --- a/ext/Devel/DProf/DProf.xs +++ b/ext/Devel/DProf/DProf.xs @@ -632,7 +632,7 @@ BOOT: * while we do this. */ { - I32 warn_tmp = PL_dowarn; + bool warn_tmp = PL_dowarn; PL_dowarn = 0; newXS("DB::sub", XS_DB_sub, file); newXS("DB::goto", XS_DB_goto, file); diff --git a/ext/Digest/MD5/MD5.xs b/ext/Digest/MD5/MD5.xs index 5828df3350..0852e526db 100644 --- a/ext/Digest/MD5/MD5.xs +++ b/ext/Digest/MD5/MD5.xs @@ -80,10 +80,10 @@ extern "C" { #ifndef BYTESWAP static void u2s(U32 u, U8* s) { - *s++ = u & 0xFF; - *s++ = (u >> 8) & 0xFF; - *s++ = (u >> 16) & 0xFF; - *s = (u >> 24) & 0xFF; + *s++ = (U8)(u & 0xFF); + *s++ = (U8)((u >> 8) & 0xFF); + *s++ = (U8)((u >> 16) & 0xFF); + *s = (U8)((u >> 24) & 0xFF); } #define s2u(s,u) ((u) = (U32)(*s) | \ diff --git a/ext/Digest/MD5/t/files.t b/ext/Digest/MD5/t/files.t index 67289925b1..1a1f032eae 100644 --- a/ext/Digest/MD5/t/files.t +++ b/ext/Digest/MD5/t/files.t @@ -16,12 +16,12 @@ my $EXPECT; if (ord('A') == 193) { # EBCDIC $EXPECT = <<EOT; ee6a09094632cd610199278bbb0f910e ext/Digest/MD5/MD5.pm -491dfb1027eb154cff18beb609d6068a ext/Digest/MD5/MD5.xs +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX ext/Digest/MD5/MD5.xs EOT } else { # ASCII $EXPECT = <<EOT; 665ddc08b12d6b1bf85ac6dc5aae68b3 ext/Digest/MD5/MD5.pm -95444a9c6ad17e443e4606c6c7fd9e28 ext/Digest/MD5/MD5.xs +5f21e907b2e7dbffe6aba2c762ea93d0 ext/Digest/MD5/MD5.xs EOT } diff --git a/ext/Encode/AUTHORS b/ext/Encode/AUTHORS index 2ba72f844f..86100126b5 100644 --- a/ext/Encode/AUTHORS +++ b/ext/Encode/AUTHORS @@ -27,6 +27,7 @@ Nicholas Clark <nick@ccl4.org> Nick Ing-Simmons <nick@ing-simmons.net> Paul Marquess <paul_marquess@yahoo.co.uk> Philip Newton <pne@cpan.org> +Robin Barker <rmb1@cise.npl.co.uk> SADAHIRO Tomoyuki <SADAHIRO@cpan.org> Spider Boardman <spider@web.zk3.dec.com> Tatsuhiko Miyagawa <miyagawa@edge.co.jp> diff --git a/ext/Encode/CN/Makefile.PL b/ext/Encode/CN/Makefile.PL index 46b262dacd..775a8f5b38 100644 --- a/ext/Encode/CN/Makefile.PL +++ b/ext/Encode/CN/Makefile.PL @@ -1,6 +1,7 @@ use 5.7.2; use strict; use ExtUtils::MakeMaker; +use strict; my %tables = (euc_cn_t => ['euc-cn.ucm', 'cp936.ucm', @@ -11,6 +12,20 @@ my %tables = (euc_cn_t => ['euc-cn.ucm', ir_165_t => ['ir-165.ucm'], ); +unless ($ENV{AGGREGATE_TABLES}){ + my @ucm; + for my $k (keys %tables){ + push @ucm, @{$tables{$k}}; + } + %tables = (); + my $seq = 0; + for my $ucm (sort @ucm){ + # 8.3 compliance ! + my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); + $tables{$t} = [ $ucm ]; + } +} + my $name = 'CN'; WriteMakefile( diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 77a5f04120..ad4fabb76a 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,9 +1,53 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 1.58 2002/04/22 23:54:22 dankogai Exp $ +# $Id: Changes,v 1.61 2002/04/26 03:02:04 dankogai Exp $ # -$Revision: 1.58 $ $Date: 2002/04/22 23:54:22 $ +$Revision: 1.61 $ $Date: 2002/04/26 03:02:04 $ +! t/mime-header.t + Now does decent tests besides use_ok() +! lib/Encode/Guess.pm t/guess.t + UI streamlined, document added +! Unicode/Unicode.xs + various signed/unsigned mismatch nits (#16173) + http://public.activestate.com/cgi-bin/perlbrowse?patch=16173 +! Encode.pm + POD: utf8-flag-related caveats added. A few sections completely + rewritten. +! Encode.xs +! AUTHORS + Thou shalt not assume %d works, either! + Robin Baker added to AUTHORS for this + Message-Id: <200204251132.MAA28237@tempest.npl.co.uk> +! t/CJKT.t + "Change 16144 by gsar@onru on 2002/04/24 18:59:05" + +1.60 2002/04/24 20:06:52 +! Encode.xs + "Thou shalt not assume %x works." -- jhi + Message-Id: <20020424210618.E24347@alpha.hut.fi> +! CN/Makefile.PL JP/Makefile.PL KR/Makefile.PL TW/Makefile.PL To make + low-memory build machines happy, now *.c is created for each *.ucm + (no table aggregation). You can still override this by setting + $ENV{AGGREGATE_TABLES}. + Message-Id: <00B1B3E4-579F-11D6-A441-00039301D480@dan.co.jp> ++ lib/Encode/Guess.pm ++ lib/Encode/JP/JIS7.pm + Encoding-autodetect (mainly for Japanese encoding) added. In a + course of development, JIS7.pm was improved. ++ lib/Encode/HTML/Header.pm ++ lib/Encode/Config.pm + MIME B/Q Header Encoding Added! +! Encode.pm Encode.xs t/fallback.t + new fallbacks; XMLCREF and HTMLCREF upon Bart's request. + Message-Id: <20020424130709.GA14211@tanglefoot> + +1.59 $ 2002/04/22 23:54:22 +! Encode.pm Encode.xs + needs_lines() and perlio_ok() are added to Internal encodings such + as utf8 so XML::SAX is happy. FB_* stub xsubs are now prototyped. + +1.58 2002/04/22 23:54:22 ! TW/TW.pm s/MacChineseSimp/MacChineseTrad/ # ... oops. ! bin/ucm2text @@ -467,7 +511,7 @@ $Revision: 1.58 $ $Date: 2002/04/22 23:54:22 $ Typo fixes and improvements by jhi Message-Id: <200204010201.FAA03564@alpha.hut.fi>, et al. -1.11 $Date: 2002/04/22 23:54:22 $ +1.11 $Date: 2002/04/26 03:02:04 $ + t/encoding.t + t/jperl.t ! MANIFEST diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index b03d93d707..e6c54f0a9f 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,12 +1,12 @@ package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 1.58 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.61 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; use XSLoader (); XSLoader::load 'Encode'; require Exporter; -our @ISA = qw(Exporter); +use base qw/Exporter/; # Public, encouraged API is exported by default @@ -15,8 +15,10 @@ our @EXPORT = qw( encodings find_encoding ); -our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC PERLQQ); -our @FB_CONSTS = qw(FB_DEFAULT FB_QUIET FB_WARN FB_PERLQQ FB_CROAK); +our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC + PERLQQ HTMLCREF XMLCREF); +our @FB_CONSTS = qw(FB_DEFAULT FB_CROAK FB_QUIET FB_WARN + FB_PERLQQ FB_HTMLCREF FB_XMLCREF); our @EXPORT_OK = ( @@ -194,6 +196,11 @@ sub predefine_encodings{ package Encode::UTF_EBCDIC; *name = sub{ shift->{'Name'} }; *new_sequence = sub{ return $_[0] }; + *needs_lines = sub{ 0 }; + *perlio_ok = sub { + eval{ require PerlIO::encoding }; + return $@ ? 0 : 1; + }; *decode = sub{ my ($obj,$str,$chk) = @_; my $res = ''; @@ -221,6 +228,11 @@ sub predefine_encodings{ package Encode::Internal; *name = sub{ shift->{'Name'} }; *new_sequence = sub{ return $_[0] }; + *needs_lines = sub{ 0 }; + *perlio_ok = sub { + eval{ require PerlIO::encoding }; + return $@ ? 0 : 1; + }; *decode = sub{ my ($obj,$str,$chk) = @_; utf8::upgrade($str); @@ -237,6 +249,11 @@ sub predefine_encodings{ package Encode::utf8; *name = sub{ shift->{'Name'} }; *new_sequence = sub{ return $_[0] }; + *needs_lines = sub{ 0 }; + *perlio_ok = sub { + eval{ require PerlIO::encoding }; + return $@ ? 0 : 1; + }; *decode = sub{ my ($obj,$octets,$chk) = @_; my $str = Encode::decode_utf8($octets); @@ -314,7 +331,7 @@ byte has 256 possible values, it easily fits in Perl's much larger =head2 TERMINOLOGY -=over 4 +=over 2 =item * @@ -339,7 +356,7 @@ and such details may change in future releases. =head1 PERL ENCODING API -=over 4 +=over 2 =item $octets = encode(ENCODING, $string[, CHECK]) @@ -351,7 +368,13 @@ For CHECK, see L</"Handling Malformed Data">. For example, to convert (internally UTF-8 encoded) Unicode string to iso-8859-1 (also known as Latin1), - $octets = encode("iso-8859-1", $unicode); + $octets = encode("iso-8859-1", $utf8); + +B<CAVEAT>: When you C<$octets = encode("utf8", $utf8)>, then $octets +B<ne> $utf8. Though they both contain the same data, the utf8 flag +for $octets is B<always> off. When you encode anything, utf8 flag of +the result is always off, even when it contains completely valid utf8 +string. See L</"The UTF-8 flag"> below. =item $string = decode(ENCODING, $octets[, CHECK]) @@ -365,16 +388,22 @@ For example, to convert ISO-8859-1 data to UTF-8: $utf8 = decode("iso-8859-1", $latin1); -=item [$length =] from_to($string, FROM_ENCODING, TO_ENCODING [,CHECK]) +B<CAVEAT>: When you C<$utf8 = encode("utf8", $octets)>, then $utf8 +B<may not be equal to> $utf8. Though they both contain the same data, +the utf8 flag for $utf8 is on unless $octets entirely conststs of +ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag"> +below. -Converts B<in-place> data between two encodings. -For example, to convert ISO-8859-1 data to UTF-8: +=item [$length =] from_to($string, FROM_ENC, TO_ENC [, CHECK]) + +Converts B<in-place> data between two encodings. For example, to +convert ISO-8859-1 data to UTF-8: - from_to($data, "iso-8859-1", "utf-8"); + from_to($data, "iso-8859-1", "utf8"); and to convert it back: - from_to($data, "utf-8", "iso-8859-1"); + from_to($data, "utf8", "iso-8859-1"); Note that because the conversion happens in place, the data to be converted cannot be a string constant; it must be a scalar variable. @@ -382,32 +411,34 @@ converted cannot be a string constant; it must be a scalar variable. from_to() returns the length of the converted string on success, undef otherwise. -=back +B<CAVEAT>: The following operations look the same but not quite so; + + from_to($data, "iso-8859-1", "utf8"); #1 + $data = decode("iso-8859-1", $data); #2 -=head2 UTF-8 / utf8 +Both #1 and #2 makes $data consists of completely valid UTF-8 string +but only #2 turns utf8 flag on. #1 is equivalent to -The Unicode Consortium defines the UTF-8 transformation format as a -way of encoding the entire Unicode repertoire as sequences of octets. -This encoding is expected to become very widespread. Perl can use this -form internally to represent strings, so conversions to and from this -form are particularly efficient (as octets in memory do not have to -change, just the meta-data that tells Perl how to treat them). + $data = encode("utf8", decode("iso-8859-1", $data)); -=over 4 +See L</"The UTF-8 flag"> below. =item $octets = encode_utf8($string); -The characters that comprise $string are encoded in Perl's superset of -UTF-8 and the resulting octets are returned as a sequence of bytes. All -possible characters have a UTF-8 representation so this function cannot -fail. +Equivalent to C<$octets = encode("utf8", $string);> The characters +that comprise $string are encoded in Perl's superset of UTF-8 and the +resulting octets are returned as a sequence of bytes. All possible +characters have a UTF-8 representation so this function cannot fail. + =item $string = decode_utf8($octets [, CHECK]); -The sequence of octets represented by $octets is decoded from UTF-8 -into a sequence of logical characters. Not all sequences of octets -form valid UTF-8 encodings, so it is possible for this call to fail. -For CHECK, see L</"Handling Malformed Data">. +equivalent to C<$string = decode("utf8", $octets [, CHECK])>. +decode_utf8($octets [, CHECK]); The sequence of octets represented by +$octets is decoded from UTF-8 into a sequence of logical +characters. Not all sequences of octets form valid UTF-8 encodings, so +it is possible for this call to fail. For CHECK, see +L</"Handling Malformed Data">. =back @@ -493,7 +524,7 @@ For gory details, see L<Encode::PerlIO>. =head1 Handling Malformed Data -=over 4 +=over 2 The I<CHECK> argument is used as follows. When you omit it, the behaviour is the same as if you had passed a value of 0 for @@ -507,7 +538,7 @@ E<lt>subcharE<gt> will be used. For Unicode, "\x{FFFD}" is used. If the data is supposed to be UTF-8, an optional lexical warning (category utf8) is given. -=item I<CHECK> = Encode::DIE_ON_ERROR (== 1) +=item I<CHECK> = Encode::FB_CROAK ( == 1) If I<CHECK> is 1, methods will die immediately with an error message. Therefore, when I<CHECK> is set to 1, you should trap the @@ -539,6 +570,10 @@ you are debugging the mode above. =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ) +=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF) + +=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF) + For encodings that are implemented by Encode::XS, CHECK == Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode. @@ -548,6 +583,10 @@ decoded to utf8. And when you encode, '\x{I<xxxx>}' will be inserted, where I<xxxx> is the Unicode ID of the character that cannot be found in the character repertoire of the encoding. +HTML/XML character reference modes are about the same, in place of +\x{I<xxxx>}, HTML uses &#I<1234>; where I<1234> is a decimal digit and +XML uses &#xI<abcd>; where I<abcd> is the hexadecimal digit. + =item The bitmask These modes are actually set via a bitmask. Here is how the FB_XX @@ -561,6 +600,8 @@ constants via C<use Encode qw(:fallback_all)>. RETURN_ON_ERR 0x0004 X X LEAVE_SRC 0x0008 PERLQQ 0x0100 X + HTMLCREF 0x0200 + XMLCREF 0x0400 =head2 Unimplemented fallback schemes @@ -581,12 +622,84 @@ arguments are taken as aliases for I<$object>, as for C<define_alias>. See L<Encode::Encoding> for more details. -=head1 Messing with Perl's Internals +=head1 The UTF-8 flag + +Before the introduction of utf8 support in perl, The C<eq> operator +just compares internal data of the scalars. Now C<eq> means internal +data equality AND I<the utf8 flag>. To explain why we made it so, I +will quote page 402 of C<Programming Perl, 3rd ed.> + +=over 2 + +=item Goal #1: + +Old byte-oriented programs should not spontaneously break on the old +byte-oriented data they used to work on. + +=item Goal #2: + +Old byte-oriented programs should magically start working on the new +character-oriented data when appropriate. + +=item Goal #3: + +Programs should run just as fast in the new character-oriented mode +as in the old byte-oriented mode. + +=item Goal #4: + +Perl should remain one language, rather than forking into a +byte-oriented Perl and a character-oriented Perl. + +=back + +Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 +was born and many features documented in the book remained +unimplemented. Perl 5.8 hopefully correct this and the introduction +of UTF-8 flag is one of them. You can think this perl notion of +byte-oriented mode (utf8 flag off) and character-oriented mode (utf8 +flag on). + +Here is how Encode takes care of the utf8 flag. + +=over 2 + +=item * + +When you encode, the resulting utf8 flag is always off. + +=item + +When you decode, the resuting utf8 flag is on unless you can +unambiguously represent data. Here is the definition of +dis-ambiguity. + + After C<$utf8 = decode('foo', $octet);>, + + When $octet is... The utf8 flag in $utf8 is + --------------------------------------------- + In ASCII only (or EBCDIC only) OFF + In ISO-8859-1 ON + In any other Encoding ON + --------------------------------------------- + +As you see, there is one exception, In ASCII. That way you can assue +Goal #1. And with Encode Goal #2 is assumed but you still have to be +careful in such cases mentioned in B<CAVEAT> paragraphs. + +This utf8 flag is not visible in perl scripts, exactly for the same +reason you cannot (or you I<don't have to>) see if a scalar contains a +string, integer, or floating point number. But you can still peek +and poke these if you will. See the section below. + +=back + +=head2 Messing with Perl's Internals The following API uses parts of Perl's internals in the current implementation. As such, they are efficient but may change. -=over 4 +=over 2 =item is_utf8(STRING [, CHECK]) @@ -626,8 +739,8 @@ the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt> =head1 MAINTAINER This project was originated by Nick Ing-Simmons and later maintained -by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full list -of people involved. For any questions, use -E<lt>perl-unicode@perl.orgE<gt> so others can share. +by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full +list of people involved. For any questions, use +E<lt>perl-unicode@perl.orgE<gt> so we can all share share. =cut diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index be69c33352..1311d8dacb 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,5 +1,5 @@ /* - $Id: Encode.xs,v 1.34 2002/04/22 20:27:30 dankogai Exp $ + $Id: Encode.xs,v 1.39 2002/04/26 03:02:04 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT @@ -141,10 +141,22 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, goto ENCODE_SET_SRC; }else if (check & ENCODE_PERLQQ){ SV* perlqq = - sv_2mortal(newSVpvf("\\x{%04x}", ch)); + sv_2mortal(newSVpvf("\\x{%04"UVxf"}", ch)); sdone += slen + clen; ddone += dlen + SvCUR(perlqq); sv_catsv(dst, perlqq); + }else if (check & ENCODE_HTMLCREF){ + SV* htmlcref = + sv_2mortal(newSVpvf("&#%" UVuf ";", ch)); + sdone += slen + clen; + ddone += dlen + SvCUR(htmlcref); + sv_catsv(dst, htmlcref); + }else if (check & ENCODE_XMLCREF){ + SV* xmlcref = + sv_2mortal(newSVpvf("&#x%" UVxf ";", ch)); + sdone += slen + clen; + ddone += dlen + SvCUR(xmlcref); + sv_catsv(dst, xmlcref); } else { /* fallback char */ sdone += slen + clen; @@ -168,7 +180,8 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src, enc->name[0], (U8) s[slen], code); } goto ENCODE_SET_SRC; - }else if (check & ENCODE_PERLQQ){ + }else if (check & + (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ SV* perlqq = sv_2mortal(newSVpvf("\\x%02X", s[slen])); sdone += slen + 1; @@ -441,9 +454,6 @@ CODE: OUTPUT: RETVAL -PROTOTYPES: DISABLE - - int DIE_ON_ERR() CODE: @@ -480,6 +490,20 @@ OUTPUT: RETVAL int +HTMLCREF() +CODE: + RETVAL = ENCODE_HTMLCREF; +OUTPUT: + RETVAL + +int +XMLCREF() +CODE: + RETVAL = ENCODE_XMLCREF; +OUTPUT: + RETVAL + +int FB_DEFAULT() CODE: RETVAL = ENCODE_FB_DEFAULT; @@ -514,6 +538,20 @@ CODE: OUTPUT: RETVAL +int +FB_HTMLCREF() +CODE: + RETVAL = ENCODE_FB_HTMLCREF; +OUTPUT: + RETVAL + +int +FB_XMLCREF() +CODE: + RETVAL = ENCODE_FB_XMLCREF; +OUTPUT: + RETVAL + BOOT: { #include "def_t.h" diff --git a/ext/Encode/Encode/encode.h b/ext/Encode/Encode/encode.h index 04df7f9b38..b860578f22 100644 --- a/ext/Encode/Encode/encode.h +++ b/ext/Encode/Encode/encode.h @@ -94,11 +94,15 @@ extern void Encode_DefineEncoding(encode_t *enc); #define ENCODE_RETURN_ON_ERR 0x0004 /* immediately returns on NOREP */ #define ENCODE_LEAVE_SRC 0x0008 /* $src updated unless set */ #define ENCODE_PERLQQ 0x0100 /* perlqq fallback string */ +#define ENCODE_HTMLCREF 0x0200 /* HTML character ref. fb mode */ +#define ENCODE_XMLCREF 0x0400 /* XML character ref. fb mode */ #define ENCODE_FB_DEFAULT 0x0000 #define ENCODE_FB_CROAK 0x0001 #define ENCODE_FB_QUIET ENCODE_RETURN_ON_ERR #define ENCODE_FB_WARN (ENCODE_RETURN_ON_ERR|ENCODE_WARN_ON_ERR) #define ENCODE_FB_PERLQQ ENCODE_PERLQQ +#define ENCODE_FB_HTMLCREF ENCODE_HTMLCREF +#define ENCODE_FB_XMLCREF ENCODE_XMLCREF #endif /* ENCODE_H */ diff --git a/ext/Encode/JP/Makefile.PL b/ext/Encode/JP/Makefile.PL index ce47d2fc97..a1df35d169 100644 --- a/ext/Encode/JP/Makefile.PL +++ b/ext/Encode/JP/Makefile.PL @@ -1,6 +1,7 @@ use 5.7.2; use strict; use ExtUtils::MakeMaker; +use strict; my %tables = ( euc_jp_t => ['euc-jp.ucm'], @@ -12,6 +13,20 @@ my %tables = ( ], ); +unless ($ENV{AGGREGATE_TABLES}){ + my @ucm; + for my $k (keys %tables){ + push @ucm, @{$tables{$k}}; + } + %tables = (); + my $seq = 0; + for my $ucm (sort @ucm){ + # 8.3 compliance ! + my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); + $tables{$t} = [ $ucm ]; + } +} + my $name = 'JP'; WriteMakefile( diff --git a/ext/Encode/KR/Makefile.PL b/ext/Encode/KR/Makefile.PL index df0eeb68b2..4ba99ab82d 100644 --- a/ext/Encode/KR/Makefile.PL +++ b/ext/Encode/KR/Makefile.PL @@ -1,6 +1,7 @@ use 5.7.2; use strict; use ExtUtils::MakeMaker; +use strict; my %tables = (euc_kr_t => ['euc-kr.ucm', 'macKorean.ucm', @@ -10,6 +11,20 @@ my %tables = (euc_kr_t => ['euc-kr.ucm', johab_t => ['johab.ucm'], ); +unless ($ENV{AGGREGATE_TABLES}){ + my @ucm; + for my $k (keys %tables){ + push @ucm, @{$tables{$k}}; + } + %tables = (); + my $seq = 0; + for my $ucm (sort @ucm){ + # 8.3 compliance ! + my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); + $tables{$t} = [ $ucm ]; + } +} + my $name = 'KR'; WriteMakefile( diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST index 2a35d9f6ec..cc6a1414c9 100644 --- a/ext/Encode/MANIFEST +++ b/ext/Encode/MANIFEST @@ -42,12 +42,13 @@ lib/Encode/CN/HZ.pm Encode extension lib/Encode/Config.pm Encode configuration module lib/Encode/Encoder.pm OO Encoder lib/Encode/Encoding.pm Encode extension +lib/Encode/Guess.pm Encode Extension lib/Encode/JP/H2Z.pm Encode extension lib/Encode/JP/JIS7.pm Encode extension lib/Encode/KR/2022_KR.pm Encode extension +lib/Encode/MIME/Header.pm Encode extension lib/Encode/PerlIO.pod Documents for Encode & PerlIO lib/Encode/Supported.pod Documents for supported encodings -t/unibench.pl benchmark script t/Aliases.t test script t/CJKT.t test script t/Encode.t test script @@ -64,6 +65,7 @@ t/fallback.t test script t/gb2312.enc test data t/gb2312.utf test data t/grow.t test script +t/guess.t test script t/jisx0201.enc test data t/jisx0201.utf test data t/jisx0208.enc test data @@ -73,7 +75,9 @@ t/jisx0212.utf test data t/jperl.t test script t/ksc5601.enc test data t/ksc5601.utf test data +t/mime-header.t test script t/perlio.t test script +t/unibench.pl benchmark script ucm/8859-1.ucm Unicode Character Map ucm/8859-10.ucm Unicode Character Map ucm/8859-11.ucm Unicode Character Map diff --git a/ext/Encode/TW/Makefile.PL b/ext/Encode/TW/Makefile.PL index 4fdae9e3f5..8f12a81aee 100644 --- a/ext/Encode/TW/Makefile.PL +++ b/ext/Encode/TW/Makefile.PL @@ -1,6 +1,7 @@ use 5.7.2; use strict; use ExtUtils::MakeMaker; +use strict; my %tables = (big5_t => ['big5-eten.ucm', 'big5-hkscs.ucm', @@ -8,6 +9,20 @@ my %tables = (big5_t => ['big5-eten.ucm', 'cp950.ucm'], ); +unless ($ENV{AGGREGATE_TABLES}){ + my @ucm; + for my $k (keys %tables){ + push @ucm, @{$tables{$k}}; + } + %tables = (); + my $seq = 0; + for my $ucm (sort @ucm){ + # 8.3 compliance ! + my $t = sprintf ("%s_%02d_t", substr($ucm, 0, 2), $seq++); + $tables{$t} = [ $ucm ]; + } +} + my $name = 'TW'; WriteMakefile( diff --git a/ext/Encode/Unicode/Unicode.xs b/ext/Encode/Unicode/Unicode.xs index 4689b498e1..e3ad82c7f0 100644 --- a/ext/Encode/Unicode/Unicode.xs +++ b/ext/Encode/Unicode/Unicode.xs @@ -1,5 +1,5 @@ /* - $Id: Unicode.xs,v 1.3 2002/04/20 23:43:47 dankogai Exp $ + $Id: Unicode.xs,v 1.4 2002/04/26 03:02:04 dankogai Exp $ */ #define PERL_NO_GET_CONTEXT @@ -61,7 +61,7 @@ enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) d += SvCUR(result); SvCUR_set(result,SvCUR(result)+size); while (size--) { - *d++ = value & 0xFF; + *d++ = (U8)(value & 0xFF); value >>= 8; } break; @@ -70,7 +70,7 @@ enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) SvCUR_set(result,SvCUR(result)+size); d += SvCUR(result); while (size--) { - *--d = value & 0xFF; + *--d = (U8)(value & 0xFF); value >>= 8; } break; diff --git a/ext/Encode/lib/Encode/Config.pm b/ext/Encode/lib/Encode/Config.pm index dcbc524b7b..a834967a11 100644 --- a/ext/Encode/lib/Encode/Config.pm +++ b/ext/Encode/lib/Encode/Config.pm @@ -2,7 +2,7 @@ # Demand-load module list # package Encode::Config; -our $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use strict; @@ -139,6 +139,11 @@ unless (ord("A") == 193){ #'big5plus' => 'Encode::HanExtra', #'euc-tw' => 'Encode::HanExtra', #'gb18030' => 'Encode::HanExtra', + + 'MIME-Header' => 'Encode::MIME::Header', + 'MIME-B' => 'Encode::MIME::Header', + 'MIME-Q' => 'Encode::MIME::Header', + ); } diff --git a/ext/Encode/lib/Encode/Guess.pm b/ext/Encode/lib/Encode/Guess.pm new file mode 100644 index 0000000000..d2aac44565 --- /dev/null +++ b/ext/Encode/lib/Encode/Guess.pm @@ -0,0 +1,297 @@ +package Encode::Guess; +use strict; +use Carp; + +use Encode qw(:fallbacks find_encoding); +our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; + +my $Canon = 'Guess'; +our $DEBUG = 0; +our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8); +$Encode::Encoding{$Canon} = + bless { + Name => $Canon, + Suspects => { %DEF_SUSPECTS }, + } => __PACKAGE__; + +sub name { shift->{'Name'} } +sub new_sequence { $_[0] } +sub needs_lines { 1 } +sub perlio_ok { 0 } +sub DESTROY{} + +our @EXPORT = qw(guess_encoding); + +sub import { # Exporter not used so we do it on our own + my $callpkg = caller; + for my $item (@EXPORT){ + no strict 'refs'; + *{"$callpkg\::$item"} = \&{"$item"}; + } + set_suspects(@_); +} + +sub set_suspects{ + my $class = shift; + my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; + $self->{Suspects} = { %DEF_SUSPECTS }; + $self->add_suspects(@_); +} + +sub add_suspects{ + my $class = shift; + my $self = ref($class) ? $class : $Encode::Encoding{$Canon}; + for my $c (@_){ + my $e = find_encoding($c) or die "Unknown encoding: $c"; + $self->{Suspects}{$e->name} = $e; + $DEBUG and warn "Added: ", $e->name; + } +} + +sub decode($$;$){ + my ($obj, $octet, $chk) = @_; + my $guessed = guess($obj, $octet); + ref($guessed) or croak $guessed; + my $utf8 = $guessed->decode($octet, $chk); + $_[1] = $octet if $chk; + return $utf8; +} + +sub encode{ + croak "Tsk, tsk, tsk. You can't be too lazy here!"; +} + +sub guess_encoding{ + guess($Encode::Encoding{$Canon}, @_); +} + +sub guess { + my $class = shift; + my $obj = ref($class) ? $class : $Encode::Encoding{$Canon}; + my $octet = shift; + # cheat 0: utf8 flag; + Encode::is_utf8($octet) and return find_encoding('utf8'); + # cheat 1: BOM + use Encode::Unicode; + my $BOM = unpack('n', $octet); + return find_encoding('UTF-16') + if ($BOM == 0xFeFF or $BOM == 0xFFFe); + $BOM = unpack('N', $octet); + return find_encoding('UTF-32') + if ($BOM == 0xFeFF or $BOM == 0xFFFe0000); + + my %try = %{$obj->{Suspects}}; + for my $c (@_){ + my $e = find_encoding($c) or die "Unknown encoding: $c"; + $try{$e->name} = $e; + $DEBUG and warn "Added: ", $e->name; + } + my $nline = 1; + for my $line (split /\r|\n|\r\n/, $octet){ + # cheat 2 -- \e in the string + if ($line =~ /\e/o){ + my @keys = keys %try; + delete @try{qw/utf8 ascii/}; + for my $k (@keys){ + ref($try{$k}) eq 'Encode::XS' and delete $try{$k}; + } + } + my %ok = %try; + # warn join(",", keys %try); + for my $k (keys %try){ + my $scratch = $line; + $try{$k}->decode($scratch, FB_QUIET); + if ($scratch eq ''){ + $DEBUG and warn sprintf("%4d:%-24s ok\n", $nline, $k); + }else{ + use bytes (); + $DEBUG and + warn sprintf("%4d:%-24s not ok; %d bytes left\n", + $nline, $k, bytes::length($scratch)); + delete $ok{$k}; + + } + } + %ok or return "No appropriate encodings found!"; + if (scalar(keys(%ok)) == 1){ + my ($retval) = values(%ok); + return $retval; + } + %try = %ok; $nline++; + } + $try{ascii} or + return "Encodings too ambiguous: ", join(" or ", keys %try); + return $try{ascii}; +} + + + +1; +__END__ + +=head1 NAME + +Encode::Guess -- Guesses encoding from data + +=head1 SYNOPSIS + + # if you are sure $data won't contain anything bogus + + use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; + my $utf8 = decode("Guess", $data); + my $data = encode("Guess", $utf8); # this doesn't work! + + # more elaborate way + use Encode::Guess, + my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/); + ref($enc) or die "Can't guess: $enc"; # trap error this way + $utf8 = $enc->decode($data); + # or + $utf8 = decode($enc->name, $data) + +=head1 ABSTRACT + +Encode::Guess enables you to guess in what encoding a given data is +encoded, or at least tries to. + +=head1 DESCRIPTION + +By default, it checks only ascii, utf8 and UTF-16/32 with BOM. + + use Encode::Guess; # ascii/utf8/BOMed UTF + +To use it more practically, you have to give the names of encodings to +check (I<suspects> as follows). The name of suspects can either be +canonical names or aliases. + + # tries all major Japanese Encodings as well + use Encode::Guess qw/euc-jp shiftjis 7bit-jis/; + +=over 4 + +=item Encode::Guess->set_suspects + +You can also change the internal suspects list via C<set_suspects> +method. + + use Encode::Guess; + Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/); + +=item Encode::Guess->add_suspects + +Or you can use C<add_suspects> method. The difference is that +C<set_suspects> flushes the current suspects list while +C<add_suspects> adds. + + use Encode::Guess; + Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/); + # now the suspects are euc-jp,shiftjis,7bit-jis, AND + # euc-kr,euc-cn, and big5-eten + Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/); + +=item Encode::decode("Guess" ...) + +When you are content with suspects list, you can now + + my $utf8 = Encode::decode("Guess", $data); + +=item Encode::Guess->guess($data) + +But it will croak if Encode::Guess fails to eliminate all other +suspects but the right one or no suspect was good. So you should +instead try this; + + my $decoder = Encode::Guess->guess($data); + +On success, $decoder is an object that is documented in +L<Encode::Encoding>. So you can now do this; + + my $utf8 = $decoder->decode($data); + +On failure, $decoder now contains an error message so the whole thing +would be as follows; + + my $decoder = Encode::Guess->guess($data); + die $decoder unless ref($decoder); + my $utf8 = $decoder->decode($data); + +=item guess_encoding($data, [, I<list of suspects>]) + +You can also try C<guess_encoding> function which is exported by +default. It takes $data to check and it also takes the list of +suspects by option. The optional suspect list is I<not reflected> to +the internal suspects list. + + my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/); + die $decoder unless ref($decoder); + my $utf8 = $decoder->decode($data); + # check only ascii and utf8 + my $decoder = guess_encoding($data); + +=back + +=head1 CAVEATS + +=over 4 + +=item * + +Because of the algorithm used, ISO-8859 series and other single-byte +encodings do not work well unless either one of ISO-8859 is the only +one suspect (besides ascii and utf8). + + use Encode::Guess; + # perhaps ok + my $decoder = guess_encoding($data, 'latin1'); + # definitely NOT ok + my $decoder = guess_encoding($data, qw/latin1 greek/); + +The reason is that Encode::Guess guesses encoding by trial and error. +It first splits $data into lines and tries to decode the line for each +suspect. It keeps it going until all but one encoding was eliminated +out of suspects list. ISO-8859 series is just too successful for most +cases (because it fills almost all code points in \x00-\xff). + +=item * + +Do not mix national standard encodings and the corresponding vendor +encodings. + + # a very bad idea + my $decoder + = guess_encoding($data, qw/shiftjis MacJapanese cp932/); + +The reason is that vendor encoding is usually a superset of national +standard so it becomes too ambiguous for most cases. + +=item * + +On the other hand, mixing various national standard encodings +automagically works unless $data is too short to allow for guessing. + + # This is ok if $data is long enough + my $decoder = + guess_encoding($data, qw/euc-cn + euc-jp shiftjis 7bit-jis + euc-kr + big5-eten/); + +=item * + +DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this! + + my $decoder = guess_encoding($data, + Encode->encodings(":all")); + +=back + +It is, after all, just a guess. You should alway be explicit when it +comes to encodings. But there are some, especially Japanese, +environment that guess-coding is a must. Use this module with care. + +=head1 SEE ALSO + +L<Encode>, L<Encode::Encoding> + +=cut + diff --git a/ext/Encode/lib/Encode/JP/JIS7.pm b/ext/Encode/lib/Encode/JP/JIS7.pm index c0a0d0622a..09ec94f9d6 100644 --- a/ext/Encode/lib/Encode/JP/JIS7.pm +++ b/ext/Encode/lib/Encode/JP/JIS7.pm @@ -1,7 +1,7 @@ package Encode::JP::JIS7; use strict; -our $VERSION = do { my @r = (q$Revision: 1.6 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.7 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; use Encode qw(:fallbacks); @@ -42,9 +42,13 @@ our $DEBUG = 0; sub decode($$;$) { - my ($obj,$str,$chk) = @_; - my $residue = jis_euc(\$str); - # This is for PerlIO + my ($obj, $str, $chk) = @_; + my $residue = ''; + if ($chk){ + $str =~ s/([^\x00-\x7f].*)$//so; + $1 and $residue = $1; + } + $residue .= jis_euc(\$str); $_[1] = $residue if $chk; return Encode::decode('euc-jp', $str, FB_PERLQQ); } diff --git a/ext/Encode/lib/Encode/MIME/Header.pm b/ext/Encode/lib/Encode/MIME/Header.pm new file mode 100644 index 0000000000..ce7b872876 --- /dev/null +++ b/ext/Encode/lib/Encode/MIME/Header.pm @@ -0,0 +1,212 @@ +package Encode::MIME::Header; +use strict; +# use warnings; +our $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; + +use Encode qw(find_encoding encode_utf8); +use MIME::Base64; +use Carp; + +my %seed = + ( + decode_b => '1', # decodes 'B' encoding ? + decode_q => '1', # decodes 'Q' encoding ? + encode => 'B', # encode with 'B' or 'Q' ? + bpl => 75, # bytes per line + ); + +$Encode::Encoding{'MIME-Header'} = + bless { + %seed, + Name => 'MIME-Header', + } => __PACKAGE__; + +$Encode::Encoding{'MIME-B'} = + bless { + %seed, + decode_q => 0, + Name => 'MIME-B', + } => __PACKAGE__; + +$Encode::Encoding{'MIME-Q'} = + bless { + %seed, + decode_q => 1, + encode => 'Q', + Name => 'MIME-Q', + } => __PACKAGE__; + +sub name { shift->{'Name'} } +sub new_sequence { $_[0] } +sub needs_lines { 1 } +sub perlio_ok{ 0 }; + +sub decode($$;$){ + use utf8; + my ($obj, $str, $chk) = @_; + # zap spaces between encoded words + $str =~ s/\?=\s+=\?/\?==\?/gos; + # multi-line header to single line + $str =~ s/(:?\r|\n|\r\n)[ \t]//gos; + $str =~ + s{ + =\? # begin encoded word + ([0-9A-Za-z\-]+) # charset (encoding) + \?([QqBb])\? # delimiter + (.*?) # Base64-encodede contents + \?= # end encoded word + }{ + if (uc($2) eq 'B'){ + $obj->{decode_b} or croak qq(MIME "B" unsupported); + decode_b($1, $3); + }elsif(uc($2) eq 'Q'){ + $obj->{decode_q} or croak qq(MIME "Q" unsupported); + decode_q($1, $3); + }else{ + croak qq(MIME "$2" encoding is nonexistent!); + } + }egox; + $_[1] = '' if $chk; + return $str; +} + +sub decode_b{ + my $enc = shift; + my $d = find_encoding($enc) or croak(Unknown encoding "$enc"); + my $db64 = decode_base64(shift); + return $d->decode($db64, Encode::FB_PERLQQ); +} + +sub decode_q{ + my ($enc, $q) = @_; + my $d = find_encoding($enc) or croak(Unknown encoding "$enc"); + $q =~ s/_/ /go; + $q =~ s/=([0-9A-Fa-f]{2})/pack("C", hex($1))/ego; + return $d->decode($q, Encode::FB_PERLQQ); +} + +my $especials = + join('|' => + map {quotemeta(chr($_))} + unpack("C*", qq{()<>@,;:\"\'/[]?.=})); + +my $re_especials = qr/$especials/o; + +sub encode($$;$){ + my ($obj, $str, $chk) = @_; + my @line = (); + for my $line (split /\r|\n|\r\n/o, $str){ + my (@word, @subline); + for my $word (split /($re_especials)/o, $line){ + if ($word =~ /[^\x00-\x7f]/o){ + push @word, $obj->_encode($word); + }else{ + push @word, $word; + } + } + my $subline = ''; + for my $word (@word){ + use bytes (); + if (bytes::length($subline) + bytes::length($word) > $obj->{bpl}){ + push @subline, $subline; + $subline = ''; + } + $subline .= $word; + } + $subline and push @subline, $subline; + push @line, join("\n " => @subline); + } + $_[1] = '' if $chk; + return join("\n", @line); +} + +use constant HEAD => '=?UTF-8?'; +use constant TAIL => '?='; +use constant SINGLE => { B => \&_encode_b, Q => \&_encode_q, }; + +sub _encode{ + my ($o, $str) = @_; + my $enc = $o->{encode}; + my $llen = ($o->{bpl} - length(HEAD) - 2 - length(TAIL)); + $llen *= $enc eq 'B' ? 3/4 : 1/3; + my @result = (); + my $chunk = ''; + while(my $chr = substr($str, 0, 1, '')){ + use bytes (); + if (bytes::length($chunk) + bytes::length($chr) > $llen){ + push @result, SINGLE->{$enc}($chunk); + $chunk = ''; + } + $chunk .= $chr; + } + $chunk and push @result, SINGLE->{$enc}($chunk); + return @result; +} + +sub _encode_b{ + HEAD . 'B?' . encode_base64(encode_utf8(shift), '') . TAIL; +} + +sub _encode_q{ + my $chunk = shift; + $chunk =~ s{ + ([^0-9A-Za-z]) + }{ + join("" => map {sprintf "=%02X", $_} unpack("C*", $1)) + }egox; + return HEAD . 'Q?' . $chunk . TAIL; +} + +1; +__END__ + +=head1 NAME + +Encode::MIME::Header -- MIME 'B' and 'Q' header encoding + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $utf8 = decode('MIME-Header', $header); + $header = encode('MIME-Header', $utf8); + +=head1 ABSTRACT + +This module implements RFC 2047 Mime Header Encoding. There are 3 +variant encoding names; C<MIME-Header>, C<MIME-B> and C<MIME-Q>. The +difference is described below + + decode() encode() + ---------------------------------------------- + MIME-Header Both B and Q =?UTF-8?B?....?= + MIME-B B only; Q croaks =?UTF-8?B?....?= + MIME-Q Q only; B croaks =?UTF-8?Q?....?= + +=head1 DESCRIPTION + +When you decode(=?I<encoding>?I<X>?I<ENCODED WORD>?=), I<ENCODED WORD> +is extracted and decoded for I<X> encoding (B for Base64, Q for +Quoted-Printable). Then the decoded chunk is fed to +decode(I<encoding>). So long as I<encoding> is supported by Encode, +any source encoding is fine. + +When you encode, it just encodes UTF-8 string with I<X> encoding then +quoted with =?UTF-8?I<X>?....?= . The parts that RFC 2047 forbids to +encode are left as is and long lines are folded within 76 bytes per +line. + +=head1 BUGS + +It would be nice to support encoding to non-UTF8, such as =?ISO-2022-JP? +and =?ISO-8859-1?= but that makes the implementation too complicated. +These days major mail agents all support =?UTF-8? so I think it is +just good enough. + +=head1 SEE ALSO + +L<Encode> + +RFC 2047, L<http://www.faqs.org/rfcs/rfc2047.html> and many other +locations. + +=cut diff --git a/ext/Encode/t/CJKT.t b/ext/Encode/t/CJKT.t index 4540034e55..31c0aa1916 100644 --- a/ext/Encode/t/CJKT.t +++ b/ext/Encode/t/CJKT.t @@ -55,7 +55,8 @@ for my $charset (sort keys %Charset){ open $src, "<$src_enc" or die "$src_enc : $!"; - binmode($src); + # binmode($src); # not needed! + $txt = join('',<$src>); close($src); diff --git a/ext/Encode/t/at-cn.t b/ext/Encode/t/at-cn.t index 893c29fa6d..6249feee38 100644 --- a/ext/Encode/t/at-cn.t +++ b/ext/Encode/t/at-cn.t @@ -19,9 +19,11 @@ use strict; use Test::More tests => 29; use Encode; +no utf8; # we have raw Chinese encodings here + use_ok('Encode::CN'); -# Since JP.t already test basic file IO, we will just focus on +# Since JP.t already tests basic file IO, we will just focus on # internal encode / decode test here. Unfortunately, to test # against all the UniHan characters will take a huge disk space, # not to mention the time it will take, and the fact that Perl diff --git a/ext/Encode/t/at-tw.t b/ext/Encode/t/at-tw.t index 830eb8686a..11abbf3807 100644 --- a/ext/Encode/t/at-tw.t +++ b/ext/Encode/t/at-tw.t @@ -21,9 +21,11 @@ use strict; use Test::More tests => 17; use Encode; +no utf8; # we have raw Chinese encodings here + use_ok('Encode::TW'); -# Since JP.t already test basic file IO, we will just focus on +# Since JP.t already tests basic file IO, we will just focus on # internal encode / decode test here. Unfortunately, to test # against all the UniHan characters will take a huge disk space, # not to mention the time it will take, and the fact that Perl diff --git a/ext/Encode/t/fallback.t b/ext/Encode/t/fallback.t index cf867beb01..3b6625851c 100644 --- a/ext/Encode/t/fallback.t +++ b/ext/Encode/t/fallback.t @@ -13,17 +13,18 @@ BEGIN { use strict; #use Test::More qw(no_plan); -use Test::More tests => 15; +use Test::More tests => 19; use Encode q(:all); my $original = ''; my $nofallback = ''; -my ($fallenback, $quiet, $perlqq); +my ($fallenback, $quiet, $perlqq, $htmlcref, $xmlcref); for my $i (0x20..0x7e){ $original .= chr($i); } -$fallenback = $quiet = $perlqq = $nofallback = $original; +$fallenback = $quiet = +$perlqq = $htmlcref = $xmlcref = $nofallback = $original; my $residue = ''; for my $i (0x80..0xff){ @@ -31,6 +32,8 @@ for my $i (0x80..0xff){ $residue .= chr($i); $fallenback .= '?'; $perlqq .= sprintf("\\x{%04x}", $i); + $htmlcref .= sprintf("&#%d;", $i); + $xmlcref .= sprintf("&#x%x;", $i); } utf8::upgrade($original); my $meth = find_encoding('ascii'); @@ -75,3 +78,13 @@ $src = $original; $dst = $meth->encode($src, FB_PERLQQ); is($dst, $perlqq, "FB_PERLQQ"); is($src, '', "FB_PERLQQ residue"); + +$src = $original; +$dst = $meth->encode($src, FB_HTMLCREF); +is($dst, $htmlcref, "FB_HTMLCREF"); +is($src, '', "FB_HTMLCREF residue"); + +$src = $original; +$dst = $meth->encode($src, FB_XMLCREF); +is($dst, $xmlcref, "FB_XMLCREF"); +is($src, '', "FB_XMLCREF residue"); diff --git a/ext/Encode/t/guess.t b/ext/Encode/t/guess.t new file mode 100644 index 0000000000..ace13ddec7 --- /dev/null +++ b/ext/Encode/t/guess.t @@ -0,0 +1,83 @@ +BEGIN { + if ($ENV{'PERL_CORE'}){ + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + $| = 1; +} + +use strict; +use File::Basename; +use File::Spec; +use Encode qw(decode encode find_encoding _utf8_off); + +#use Test::More qw(no_plan); +use Test::More tests => 17; +use_ok("Encode::Guess"); +{ + no warnings; + $Encode::Guess::DEBUG = shift || 0; +} + +my $ascii = join('' => map {chr($_)}(0x21..0x7e)); +my $latin1 = join('' => map {chr($_)}(0xa1..0xfe)); +my $utf8on = join('' => map {chr($_)}(0x3000..0x30fe)); +my $utf8off = $utf8on; _utf8_off($utf8off); +my $utf16 = encode('UTF-16', $utf8on); +my $utf32 = encode('UTF-32', $utf8on); + +is(guess_encoding($ascii)->name, 'ascii', 'ascii'); +like(guess_encoding($latin1), qr/No appropriate encoding/io, 'no ascii'); +is(guess_encoding($latin1, 'latin1')->name, 'iso-8859-1', 'iso-8859-1'); +is(guess_encoding($utf8on)->name, 'utf8', 'utf8 w/ flag'); +is(guess_encoding($utf8off)->name, 'utf8', 'utf8 w/o flag'); +is(guess_encoding($utf16)->name, 'UTF-16', 'UTF-16'); +is(guess_encoding($utf32)->name, 'UTF-32', 'UTF-32'); + +my $jisx0201 = File::Spec->catfile(dirname(__FILE__), 'jisx0201.utf'); +my $jisx0208 = File::Spec->catfile(dirname(__FILE__), 'jisx0208.utf'); +my $jisx0212 = File::Spec->catfile(dirname(__FILE__), 'jisx0212.utf'); + +open my $fh, $jisx0208 or die "$jisx0208: $!"; +$utf8off = join('' => <$fh>); +close $fh; +$utf8on = decode('utf8', $utf8off); + +my @jp = qw(7bit-jis shiftjis euc-jp); + +Encode::Guess->set_suspects(@jp); + +for my $jp (@jp){ + my $test = encode($jp, $utf8on); + is(guess_encoding($test)->name, $jp, "JP:$jp"); +} + +is (decode('Guess', encode('euc-jp', $utf8on)), $utf8on, "decode('Guess')"); +eval{ encode('Guess', $utf8on) }; +like($@, qr/lazy/io, "no encode()"); + +my %CJKT = + ( + 'euc-cn' => File::Spec->catfile(dirname(__FILE__), 'gb2312.utf'), + 'euc-jp' => File::Spec->catfile(dirname(__FILE__), 'jisx0208.utf'), + 'euc-kr' => File::Spec->catfile(dirname(__FILE__), 'ksc5601.utf'), + 'big5-eten' => File::Spec->catfile(dirname(__FILE__), 'big5-eten.utf'), +); + +Encode::Guess->set_suspects(keys %CJKT); + +for my $name (keys %CJKT){ + open my $fh, $CJKT{$name} or die "$CJKT{$name}: $!"; + $utf8off = join('' => <$fh>); + close $fh; + + my $test = encode($name, decode('utf8', $utf8off)); + is(guess_encoding($test)->name, $name, "CJKT:$name"); +} + +__END__; diff --git a/ext/Encode/t/jperl.t b/ext/Encode/t/jperl.t index faaf743f89..82f7a84dd6 100644 --- a/ext/Encode/t/jperl.t +++ b/ext/Encode/t/jperl.t @@ -1,5 +1,5 @@ # -# $Id: jperl.t,v 1.23 2002/04/22 09:48:07 dankogai Exp $ +# $Id: jperl.t,v 1.24 2002/04/26 03:02:04 dankogai Exp $ # # This script is written in euc-jp @@ -20,6 +20,8 @@ BEGIN { $| = 1; } +no utf8; # we have raw Japanese encodings here + use strict; use Test::More tests => 18; my $Debug = shift; diff --git a/ext/Encode/t/mime-header.t b/ext/Encode/t/mime-header.t new file mode 100644 index 0000000000..826efbfddd --- /dev/null +++ b/ext/Encode/t/mime-header.t @@ -0,0 +1,77 @@ +# +# $Id: mime-header.t,v 1.3 2002/04/26 03:07:59 dankogai Exp $ +# This script is written in utf8 +# +BEGIN { + if ($ENV{'PERL_CORE'}){ + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + $| = 1; +} + +use strict; +#use Test::More qw(no_plan); +use Test::More tests => 6; +use_ok("Encode::MIME::Header"); + +my $eheader =<<'EOS'; +From: =?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu> +To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk> +CC: =?ISO-8859-1?Q?Andr=E9?= Pirard <PIRARD@vm1.ulg.ac.be> +Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?= + =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?= +EOS + +my $dheader=<<"EOS"; +From: Keith Moore <moore\@cs.utk.edu> +To: Keld J\xF8rn Simonsen <keld\@dkuug.dk> +CC: Andr\xE9 Pirard <PIRARD\@vm1.ulg.ac.be> +Subject: If you can read this you understand the example. +EOS + +is(Encode::decode('MIME-Header', $eheader), $dheader, "decode (RFC2047)"); + +use utf8; + +$dheader=<<'EOS'; +From: 小飼 弾 <dankogai@dan.co.jp> +To: dankogai@dan.co.jp (小飼=Kogai, 弾=Dan) +Subject: 漢字、カタカナ、ひらがなを含む、非常に長いタイトル行が一体全体どのようにしてEncodeされるのか? +EOS + +my $bheader =<<'EOS'; +From:=?UTF-8?B?IOWwj+mjvCDlvL4g?=<dankogai@dan.co.jp> +To: dankogai@dan.co.jp (=?UTF-8?B?5bCP6aO8?==Kogai,=?UTF-8?B?IOW8vg==?==Dan + ) +Subject: + =?UTF-8?B?IOa8ouWtl+OAgeOCq+OCv+OCq+ODiuOAgeOBsuOCieOBjOOBquOCkuWQq+OCgA==?= + =?UTF-8?B?44CB6Z2e5bi444Gr6ZW344GE44K/44Kk44OI44Or6KGM44GM5LiA5L2T5YWo?= + =?UTF-8?B?5L2T44Gp44Gu44KI44GG44Gr44GX44GmRW5jb2Rl44GV44KM44KL44Gu44GL?= + =?UTF-8?B?77yf?= +EOS + +my $qheader=<<'EOS'; +From:=?UTF-8?Q?=20=E5=B0=8F=E9=A3=BC=20=E5=BC=BE=20?=<dankogai@dan.co.jp> +To: dankogai@dan.co.jp (=?UTF-8?Q?=E5=B0=8F=E9=A3=BC?==Kogai, + =?UTF-8?Q?=20=E5=BC=BE?==Dan) +Subject: + =?UTF-8?Q?=20=E6=BC=A2=E5=AD=97=E3=80=81=E3=82=AB=E3=82=BF=E3=82=AB?= + =?UTF-8?Q?=E3=83=8A=E3=80=81=E3=81=B2=E3=82=89=E3=81=8C=E3=81=AA=E3=82=92?= + =?UTF-8?Q?=E5=90=AB=E3=82=80=E3=80=81=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7?= + =?UTF-8?Q?=E3=81=84=E3=82=BF=E3=82=A4=E3=83=88=E3=83=AB=E8=A1=8C=E3=81=8C?= + =?UTF-8?Q?=E4=B8=80=E4=BD=93=E5=85=A8=E4=BD=93=E3=81=A9=E3=81=AE=E3=82=88?= + =?UTF-8?Q?=E3=81=86=E3=81=AB=E3=81=97=E3=81=A6Encode=E3=81=95?= + =?UTF-8?Q?=E3=82=8C=E3=82=8B=E3=81=AE=E3=81=8B=EF=BC=9F?= +EOS + +is(Encode::decode('MIME-Header', $bheader), $dheader, "decode B"); +is(Encode::decode('MIME-Header', $qheader), $dheader, "decode Q"); +is(Encode::encode('MIME-B', $dheader)."\n", $bheader, "encode B"); +is(Encode::encode('MIME-Q', $dheader)."\n", $qheader, "encode Q"); +__END__; diff --git a/ext/File/Glob/bsd_glob.c b/ext/File/Glob/bsd_glob.c index 14af31cba2..7ec58aa489 100644 --- a/ext/File/Glob/bsd_glob.c +++ b/ext/File/Glob/bsd_glob.c @@ -520,7 +520,7 @@ globtilde(const Char *pattern, Char *patbuf, size_t patbuf_len, glob_t *pglob) /* Copy up to the end of the string or / */ eb = &patbuf[patbuf_len - 1]; for (p = pattern + 1, h = (char *) patbuf; - h < (char*)eb && *p && *p != BG_SLASH; *h++ = *p++) + h < (char*)eb && *p && *p != BG_SLASH; *h++ = (char)*p++) ; *h = BG_EOS; @@ -1164,7 +1164,7 @@ static int g_Ctoc(register const Char *str, char *buf, STRLEN len) { while (len--) { - if ((*buf++ = *str++) == BG_EOS) + if ((*buf++ = (char)*str++) == BG_EOS) return (0); } return (1); diff --git a/ext/IO/IO.xs b/ext/IO/IO.xs index 7edbf2c3e3..26b332b6a4 100644 --- a/ext/IO/IO.xs +++ b/ext/IO/IO.xs @@ -242,7 +242,7 @@ PPCODE: for(i=1, j=0 ; j < nfd ; j++) { fds[j].fd = SvIV(ST(i)); i++; - fds[j].events = SvIV(ST(i)); + fds[j].events = (short)SvIV(ST(i)); i++; fds[j].revents = 0; } diff --git a/ext/Opcode/Opcode.xs b/ext/Opcode/Opcode.xs index 6ad7107966..66710edeb7 100644 --- a/ext/Opcode/Opcode.xs +++ b/ext/Opcode/Opcode.xs @@ -151,7 +151,7 @@ verify_opset(pTHX_ SV *opset, int fatal) if (!SvOK(opset)) err = "undefined"; else if (!SvPOK(opset)) err = "wrong type"; - else if (SvCUR(opset) != opset_len) err = "wrong size"; + else if (SvCUR(opset) != (STRLEN)opset_len) err = "wrong size"; if (err && fatal) { croak("Invalid opset: %s", err); } @@ -178,7 +178,7 @@ set_opset_bits(pTHX_ char *bitmap, SV *bitspec, int on, char *opname) else bitmap[offset] &= ~(1 << bit); } - else if (SvPOK(bitspec) && SvCUR(bitspec) == opset_len) { + else if (SvPOK(bitspec) && SvCUR(bitspec) == (STRLEN)opset_len) { STRLEN len; char *specbits = SvPV(bitspec, len); @@ -464,7 +464,7 @@ PPCODE: croak("panic: opcode %d (%s) out of range",myopcode,opname); XPUSHs(sv_2mortal(newSVpv(op_desc[myopcode], 0))); } - else if (SvPOK(bitspec) && SvCUR(bitspec) == opset_len) { + else if (SvPOK(bitspec) && SvCUR(bitspec) == (STRLEN)opset_len) { int b, j; STRLEN n_a; char *bitmap = SvPV(bitspec,n_a); diff --git a/ext/POSIX/POSIX.xs b/ext/POSIX/POSIX.xs index 2d1abf3060..c92c389788 100644 --- a/ext/POSIX/POSIX.xs +++ b/ext/POSIX/POSIX.xs @@ -457,7 +457,8 @@ __END__ if (memEQ(name, "WSTOPSIG", 8)) { /* ^ */ #ifdef WSTOPSIG - *arg_result = WSTOPSIG(WMUNGE(*arg_result)); + int i = *arg_result; + *arg_result = WSTOPSIG(WMUNGE(i)); return PERL_constant_ISIV; #else return PERL_constant_NOTDEF; @@ -468,7 +469,8 @@ __END__ if (memEQ(name, "WTERMSIG", 8)) { /* ^ */ #ifdef WTERMSIG - *arg_result = WTERMSIG(WMUNGE(*arg_result)); + int i = *arg_result; + *arg_result = WTERMSIG(WMUNGE(i)); return PERL_constant_ISIV; #else return PERL_constant_NOTDEF; @@ -491,7 +493,8 @@ __END__ case 9: if (memEQ(name, "WIFEXITED", 9)) { #ifdef WIFEXITED - *arg_result = WIFEXITED(WMUNGE(*arg_result)); + int i = *arg_result; + *arg_result = WIFEXITED(WMUNGE(i)); return PERL_constant_ISIV; #else return PERL_constant_NOTDEF; @@ -501,7 +504,8 @@ __END__ case 10: if (memEQ(name, "WIFSTOPPED", 10)) { #ifdef WIFSTOPPED - *arg_result = WIFSTOPPED(WMUNGE(*arg_result)); + int i = *arg_result; + *arg_result = WIFSTOPPED(WMUNGE(i)); return PERL_constant_ISIV; #else return PERL_constant_NOTDEF; @@ -517,7 +521,8 @@ __END__ if (memEQ(name, "WEXITSTATUS", 11)) { /* ^ */ #ifdef WEXITSTATUS - *arg_result = WEXITSTATUS(WMUNGE(*arg_result)); + int i = *arg_result; + *arg_result = WEXITSTATUS(WMUNGE(i)); return PERL_constant_ISIV; #else return PERL_constant_NOTDEF; @@ -528,7 +533,8 @@ __END__ if (memEQ(name, "WIFSIGNALED", 11)) { /* ^ */ #ifdef WIFSIGNALED - *arg_result = WIFSIGNALED(WMUNGE(*arg_result)); + int i = *arg_result; + *arg_result = WIFSIGNALED(WMUNGE(i)); return PERL_constant_ISIV; #else return PERL_constant_NOTDEF; diff --git a/ext/PerlIO/encoding/encoding.xs b/ext/PerlIO/encoding/encoding.xs index a9b74354d0..bff16e73f6 100644 --- a/ext/PerlIO/encoding/encoding.xs +++ b/ext/PerlIO/encoding/encoding.xs @@ -145,7 +145,7 @@ PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg) if (SvIV(result = get_sv("PerlIO::encoding::check", 1)) == 0){ PUSHMARK(sp); PUTBACK; - if (call_pv("Encode::FB_QUIET", G_SCALAR|G_NOARGS) != 1) { + if (call_pv("Encode::FB_QUIET", G_SCALAR) != 1) { /* should never happen */ Perl_die(aTHX_ "Encode::FB_QUIET did not return a value"); return -1; @@ -317,7 +317,7 @@ PerlIOEncode_fill(pTHX_ PerlIO * f) if (SvLEN(e->dataSV) && SvPVX(e->dataSV)) { Safefree(SvPVX(e->dataSV)); } - if (use > e->base.bufsiz) { + if (use > (SSize_t)e->base.bufsiz) { if (e->flags & NEEDS_LINES) { /* Have to grow buffer */ e->base.bufsiz = use; @@ -427,7 +427,7 @@ PerlIOEncode_flush(pTHX_ PerlIO * f) PUTBACK; s = SvPV(str, len); count = PerlIO_write(PerlIONext(f),s,len); - if (count != len) { + if ((STRLEN)count != len) { code = -1; } FREETMPS; @@ -447,7 +447,7 @@ PerlIOEncode_flush(pTHX_ PerlIO * f) if (e->dataSV && SvCUR(e->dataSV)) { s = SvPV(e->dataSV, len); count = PerlIO_unread(PerlIONext(f),s,len); - if (count != len) { + if ((STRLEN)count != len) { code = -1; } } @@ -478,7 +478,7 @@ PerlIOEncode_flush(pTHX_ PerlIO * f) PUTBACK; s = SvPV(str, len); count = PerlIO_unread(PerlIONext(f),s,len); - if (count != len) { + if ((STRLEN)count != len) { code = -1; } FREETMPS; diff --git a/ext/Storable/Storable.pm b/ext/Storable/Storable.pm index 2f352f30b7..1ac12e189d 100644 --- a/ext/Storable/Storable.pm +++ b/ext/Storable/Storable.pm @@ -79,18 +79,7 @@ $VERSION = '1.015'; eval "use Log::Agent"; -unless (defined @Log::Agent::EXPORT) { - eval q{ - sub logcroak { - require Carp; - Carp::croak(@_); - } - sub logcarp { - require Carp; - Carp::carp(@_); - } - }; -} +require Carp; # # They might miss :flock in Fcntl @@ -107,22 +96,33 @@ BEGIN { } } -sub logcroak; -sub logcarp; - # Can't Autoload cleanly as this clashes 8.3 with &retrieve sub retrieve_fd { &fd_retrieve } # Backward compatibility +# By default restricted hashes are downgraded on earlier perls. + +$Storable::downgrade_restricted = 1; bootstrap Storable; 1; __END__ +# +# Use of Log::Agent is optional. If it hasn't imported these subs then +# Autoloader will kindly supply our fallback implementation. +# + +sub logcroak { + Carp::croak(@_); +} + +sub logcarp { + Carp::carp(@_); +} # # Determine whether locking is possible, but only when needed. # -sub CAN_FLOCK { - my $CAN_FLOCK if 0; +sub CAN_FLOCK; my $CAN_FLOCK; sub CAN_FLOCK { return $CAN_FLOCK if defined $CAN_FLOCK; require Config; import Config; return $CAN_FLOCK = diff --git a/ext/Storable/Storable.xs b/ext/Storable/Storable.xs index 6098d70763..d3cb8072d5 100644 --- a/ext/Storable/Storable.xs +++ b/ext/Storable/Storable.xs @@ -58,7 +58,7 @@ #include <patchlevel.h> /* Perl's one, needed since 5.6 */ #include <XSUB.h> -#if 0 +#if 1 #define DEBUGME /* Debug mode, turns assertions on as well */ #define DASSERT /* Assertion mode */ #endif @@ -272,6 +272,39 @@ typedef unsigned long stag_t; /* Used by pre-0.6 binary format */ #define MY_VERSION "Storable(" XS_VERSION ")" + +/* + * Conditional UTF8 support. + * + */ +#ifdef SvUTF8_on +#define STORE_UTF8STR(pv, len) STORE_PV_LEN(pv, len, SX_UTF8STR, SX_LUTF8STR) +#define HAS_UTF8_SCALARS +#ifdef HeKUTF8 +#define HAS_UTF8_HASHES +#define HAS_UTF8_ALL +#else +/* 5.6 perl has utf8 scalars but not hashes */ +#endif +#else +#define SvUTF8(sv) 0 +#define STORE_UTF8STR(pv, len) CROAK(("panic: storing UTF8 in non-UTF8 perl")) +#endif +#ifndef HAS_UTF8_ALL +#define UTF8_CROAK() CROAK(("Cannot retrieve UTF8 data in non-UTF8 perl")) +#endif + +#ifdef HvPLACEHOLDERS +#define HAS_RESTRICTED_HASHES +#else +#define HVhek_PLACEHOLD 0x200 +#define RESTRICTED_HASH_CROAK() CROAK(("Cannot retrieve restricted hash")) +#endif + +#ifdef HvHASKFLAGS +#define HAS_HASH_KEY_FLAGS +#endif + /* * Fields s_tainted and s_dirty are prefixed with s_ because Perl's include * files remap tainted and dirty when threading is enabled. That's bad for @@ -293,6 +326,12 @@ typedef struct stcxt { int s_tainted; /* true if input source is tainted, at retrieve time */ int forgive_me; /* whether to be forgiving... */ int canonical; /* whether to store hashes sorted by key */ +#ifndef HAS_RESTRICTED_HASHES + int derestrict; /* whether to downgrade restrcted hashes */ +#endif +#ifndef HAS_UTF8_ALL + int use_bytes; /* whether to bytes-ify utf8 */ +#endif int s_dirty; /* context is dirty due to CROAK() -- can be cleaned */ int membuf_ro; /* true means membuf is read-only and msaved is rw */ struct extendable keybuf; /* for hash key retrieval */ @@ -658,15 +697,23 @@ static stcxt_t *Context_ptr = &Context; static char old_magicstr[] = "perl-store"; /* Magic number before 0.6 */ static char magicstr[] = "pst0"; /* Used as a magic number */ + #define STORABLE_BIN_MAJOR 2 /* Binary major "version" */ +#define STORABLE_BIN_MINOR 5 /* Binary minor "version" */ + +/* If we aren't 5.7.3 or later, we won't be writing out files that use the + * new flagged hash introdued in 2.5, so put 2.4 in the binary header to + * maximise ease of interoperation with older Storables. + * Could we write 2.3s if we're on 5.005_03? NWC + */ #if (PATCHLEVEL <= 6) -#define STORABLE_BIN_MINOR 4 /* Binary minor "version" */ +#define STORABLE_BIN_WRITE_MINOR 4 #else /* * As of perl 5.7.3, utf8 hash key is introduced. * So this must change -- dankogai */ -#define STORABLE_BIN_MINOR 5 /* Binary minor "version" */ +#define STORABLE_BIN_WRITE_MINOR 5 #endif /* (PATCHLEVEL <= 6) */ /* @@ -731,19 +778,6 @@ static char magicstr[] = "pst0"; /* Used as a magic number */ #define STORE_SCALAR(pv, len) STORE_PV_LEN(pv, len, SX_SCALAR, SX_LSCALAR) /* - * Conditional UTF8 support. - * On non-UTF8 perls, UTF8 strings are returned as normal strings. - * - */ -#ifdef SvUTF8_on -#define STORE_UTF8STR(pv, len) STORE_PV_LEN(pv, len, SX_UTF8STR, SX_LUTF8STR) -#else -#define SvUTF8(sv) 0 -#define STORE_UTF8STR(pv, len) CROAK(("panic: storing UTF8 in non-UTF8 perl")) -#define SvUTF8_on(sv) CROAK(("Cannot retrieve UTF8 data in non-UTF8 perl")) -#endif - -/* * Store undef in arrays and hashes without recursing through store(). */ #define STORE_UNDEF() do { \ @@ -1202,6 +1236,12 @@ static void init_retrieve_context(stcxt_t *cxt, int optype, int is_tainted) cxt->optype = optype; cxt->s_tainted = is_tainted; cxt->entry = 1; /* No recursion yet */ +#ifndef HAS_RESTRICTED_HASHES + cxt->derestrict = -1; /* Fetched from perl if needed */ +#endif +#ifndef HAS_UTF8_ALL + cxt->use_bytes = -1; /* Fetched from perl if needed */ +#endif } /* @@ -1902,12 +1942,21 @@ sortcmp(const void *a, const void *b) */ static int store_hash(stcxt_t *cxt, HV *hv) { - I32 len = HvTOTALKEYS(hv); + I32 len = +#ifdef HAS_RESTRICTED_HASHES + HvTOTALKEYS(hv); +#else + HvKEYS(hv); +#endif I32 i; int ret = 0; I32 riter; HE *eiter; - int flagged_hash = ((SvREADONLY(hv) || HvHASKFLAGS(hv)) ? 1 : 0); + int flagged_hash = ((SvREADONLY(hv) +#ifdef HAS_HASH_KEY_FLAGS + || HvHASKFLAGS(hv) +#endif + ) ? 1 : 0); unsigned char hash_flags = (SvREADONLY(hv) ? SHV_RESTRICTED : 0); if (flagged_hash) { @@ -1969,7 +2018,11 @@ static int store_hash(stcxt_t *cxt, HV *hv) TRACEME(("using canonical order")); for (i = 0; i < len; i++) { +#ifdef HAS_RESTRICTED_HASHES HE *he = hv_iternext_flags(hv, HV_ITERNEXT_WANTPLACEHOLDERS); +#else + HE *he = hv_iternext(hv); +#endif SV *key = hv_iterkeysv(he); av_store(av, AvFILLp(av)+1, key); /* av_push(), really */ } @@ -2015,6 +2068,12 @@ static int store_hash(stcxt_t *cxt, HV *hv) keyval = SvPV(key, keylen_tmp); keylen = keylen_tmp; +#ifdef HAS_UTF8_HASHES + /* If you build without optimisation on pre 5.6 + then nothing spots that SvUTF8(key) is always 0, + so the block isn't optimised away, at which point + the linker dislikes the reference to + bytes_from_utf8. */ if (SvUTF8(key)) { const char *keysave = keyval; bool is_utf8 = TRUE; @@ -2039,6 +2098,7 @@ static int store_hash(stcxt_t *cxt, HV *hv) flags |= SHV_K_UTF8; } } +#endif if (flagged_hash) { PUTMARK(flags); @@ -2072,7 +2132,11 @@ static int store_hash(stcxt_t *cxt, HV *hv) char *key; I32 len; unsigned char flags; +#ifdef HV_ITERNEXT_WANTPLACEHOLDERS HE *he = hv_iternext_flags(hv, HV_ITERNEXT_WANTPLACEHOLDERS); +#else + HE *he = hv_iternext(hv); +#endif SV *val = (he ? hv_iterval(hv, he) : 0); SV *key_sv = NULL; HEK *hek; @@ -2111,10 +2175,12 @@ static int store_hash(stcxt_t *cxt, HV *hv) flags |= SHV_K_ISSV; } else { /* Regular string key. */ +#ifdef HAS_HASH_KEY_FLAGS if (HEK_UTF8(hek)) flags |= SHV_K_UTF8; if (HEK_WASUTF8(hek)) flags |= SHV_K_WASUTF8; +#endif key = HEK_KEY(hek); } /* @@ -2629,7 +2695,7 @@ static int store_hook( PUTMARK(clen); } if (len2) - WRITE(pv, len2); /* Final \0 is omitted */ + WRITE(pv, (SSize_t)len2); /* Final \0 is omitted */ /* [<len3> <object-IDs>] */ if (flags & SHF_HAS_LIST) { @@ -2993,7 +3059,7 @@ static int magic_write(stcxt_t *cxt) : -1)); if (cxt->fio) - WRITE(magicstr, strlen(magicstr)); /* Don't write final \0 */ + WRITE(magicstr, (SSize_t)strlen(magicstr)); /* Don't write final \0 */ /* * Starting with 0.6, the "use_network_order" byte flag is also used to @@ -3011,7 +3077,7 @@ static int magic_write(stcxt_t *cxt) * introduced, for instance, but when backward compatibility is preserved. */ - PUTMARK((unsigned char) STORABLE_BIN_MINOR); + PUTMARK((unsigned char) STORABLE_BIN_WRITE_MINOR); if (use_network_order) return 0; /* Don't bother with byte ordering */ @@ -3019,7 +3085,7 @@ static int magic_write(stcxt_t *cxt) sprintf(buf, "%lx", (unsigned long) BYTEORDER); c = (unsigned char) strlen(buf); PUTMARK(c); - WRITE(buf, (unsigned int) c); /* Don't write final \0 */ + WRITE(buf, (SSize_t)c); /* Don't write final \0 */ PUTMARK((unsigned char) sizeof(int)); PUTMARK((unsigned char) sizeof(long)); PUTMARK((unsigned char) sizeof(char *)); @@ -4098,15 +4164,25 @@ static SV *retrieve_scalar(stcxt_t *cxt, char *cname) */ static SV *retrieve_utf8str(stcxt_t *cxt, char *cname) { - SV *sv; + SV *sv; - TRACEME(("retrieve_utf8str")); + TRACEME(("retrieve_utf8str")); - sv = retrieve_scalar(cxt, cname); - if (sv) - SvUTF8_on(sv); + sv = retrieve_scalar(cxt, cname); + if (sv) { +#ifdef HAS_UTF8_SCALARS + SvUTF8_on(sv); +#else + if (cxt->use_bytes < 0) + cxt->use_bytes + = (SvTRUE(perl_get_sv("Storable::drop_utf8", TRUE)) + ? 1 : 0); + if (cxt->use_bytes == 0) + UTF8_CROAK(); +#endif + } - return sv; + return sv; } /* @@ -4117,15 +4193,24 @@ static SV *retrieve_utf8str(stcxt_t *cxt, char *cname) */ static SV *retrieve_lutf8str(stcxt_t *cxt, char *cname) { - SV *sv; - - TRACEME(("retrieve_lutf8str")); + SV *sv; - sv = retrieve_lscalar(cxt, cname); - if (sv) - SvUTF8_on(sv); + TRACEME(("retrieve_lutf8str")); - return sv; + sv = retrieve_lscalar(cxt, cname); + if (sv) { +#ifdef HAS_UTF8_SCALARS + SvUTF8_on(sv); +#else + if (cxt->use_bytes < 0) + cxt->use_bytes + = (SvTRUE(perl_get_sv("Storable::drop_utf8", TRUE)) + ? 1 : 0); + if (cxt->use_bytes == 0) + UTF8_CROAK(); +#endif + } + return sv; } /* @@ -4394,7 +4479,7 @@ static SV *retrieve_hash(stcxt_t *cxt, char *cname) */ RLEN(size); /* Get key size */ - KBUFCHK(size); /* Grow hash key read pool if needed */ + KBUFCHK((STRLEN)size); /* Grow hash key read pool if needed */ if (size) READ(kbuf, size); kbuf[size] = '\0'; /* Mark string end, just in case */ @@ -4434,11 +4519,22 @@ static SV *retrieve_flag_hash(stcxt_t *cxt, char *cname) int hash_flags; GETMARK(hash_flags); - TRACEME(("retrieve_flag_hash (#%d)", cxt->tagnum)); + TRACEME(("retrieve_flag_hash (#%d)", cxt->tagnum)); /* * Read length, allocate table. */ +#ifndef HAS_RESTRICTED_HASHES + if (hash_flags & SHV_RESTRICTED) { + if (cxt->derestrict < 0) + cxt->derestrict + = (SvTRUE(perl_get_sv("Storable::downgrade_restricted", TRUE)) + ? 1 : 0); + if (cxt->derestrict == 0) + RESTRICTED_HASH_CROAK(); + } +#endif + RLEN(len); TRACEME(("size = %d, flags = %d", len, hash_flags)); hv = newHV(); @@ -4464,8 +4560,10 @@ static SV *retrieve_flag_hash(stcxt_t *cxt, char *cname) return (SV *) 0; GETMARK(flags); +#ifdef HAS_RESTRICTED_HASHES if ((hash_flags & SHV_RESTRICTED) && (flags & SHV_K_LOCKED)) SvREADONLY_on(sv); +#endif if (flags & SHV_K_ISSV) { /* XXX you can't set a placeholder with an SV key. @@ -4493,13 +4591,25 @@ static SV *retrieve_flag_hash(stcxt_t *cxt, char *cname) sv = &PL_sv_undef; store_flags |= HVhek_PLACEHOLD; } - if (flags & SHV_K_UTF8) + if (flags & SHV_K_UTF8) { +#ifdef HAS_UTF8_HASHES store_flags |= HVhek_UTF8; +#else + if (cxt->use_bytes < 0) + cxt->use_bytes + = (SvTRUE(perl_get_sv("Storable::drop_utf8", TRUE)) + ? 1 : 0); + if (cxt->use_bytes == 0) + UTF8_CROAK(); +#endif + } +#ifdef HAS_UTF8_HASHES if (flags & SHV_K_WASUTF8) store_flags |= HVhek_WASUTF8; +#endif RLEN(size); /* Get key size */ - KBUFCHK(size); /* Grow hash key read pool if needed */ + KBUFCHK((STRLEN)size); /* Grow hash key read pool if needed */ if (size) READ(kbuf, size); kbuf[size] = '\0'; /* Mark string end, just in case */ @@ -4510,12 +4620,20 @@ static SV *retrieve_flag_hash(stcxt_t *cxt, char *cname) * Enter key/value pair into hash table. */ +#ifdef HAS_RESTRICTED_HASHES if (hv_store_flags(hv, kbuf, size, sv, 0, flags) == 0) return (SV *) 0; +#else + if (!(store_flags & HVhek_PLACEHOLD)) + if (hv_store(hv, kbuf, size, sv, 0) == 0) + return (SV *) 0; +#endif } } +#ifdef HAS_RESTRICTED_HASHES if (hash_flags & SHV_RESTRICTED) SvREADONLY_on(hv); +#endif TRACEME(("ok (retrieve_hash at 0x%"UVxf")", PTR2UV(hv))); @@ -4655,7 +4773,7 @@ static SV *old_retrieve_hash(stcxt_t *cxt, char *cname) if (c != SX_KEY) (void) retrieve_other((stcxt_t *) 0, 0); /* Will croak out */ RLEN(size); /* Get key size */ - KBUFCHK(size); /* Grow hash key read pool if needed */ + KBUFCHK((STRLEN)size); /* Grow hash key read pool if needed */ if (size) READ(kbuf, size); kbuf[size] = '\0'; /* Mark string end, just in case */ @@ -4708,7 +4826,7 @@ static SV *magic_check(stcxt_t *cxt) STRLEN len = sizeof(magicstr) - 1; STRLEN old_len; - READ(buf, len); /* Not null-terminated */ + READ(buf, (SSize_t)len); /* Not null-terminated */ buf[len] = '\0'; /* Is now */ if (0 == strcmp(buf, magicstr)) @@ -4720,7 +4838,7 @@ static SV *magic_check(stcxt_t *cxt) */ old_len = sizeof(old_magicstr) - 1; - READ(&buf[len], old_len - len); + READ(&buf[len], (SSize_t)(old_len - len)); buf[old_len] = '\0'; /* Is now null-terminated */ if (strcmp(buf, old_magicstr)) @@ -4765,10 +4883,14 @@ magic_ok: version_major > STORABLE_BIN_MAJOR || (version_major == STORABLE_BIN_MAJOR && version_minor > STORABLE_BIN_MINOR) - ) + ) { + TRACEME(("but I am version is %d.%d", STORABLE_BIN_MAJOR, + STORABLE_BIN_MINOR)); + CROAK(("Storable binary image v%d.%d more recent than I am (v%d.%d)", version_major, version_minor, STORABLE_BIN_MAJOR, STORABLE_BIN_MINOR)); + } /* * If they stored using network order, there's no byte ordering @@ -4783,6 +4905,8 @@ magic_ok: READ(buf, c); /* Not null-terminated */ buf[c] = '\0'; /* Is now */ + TRACEME(("byte order '%s'", buf)); + if (strcmp(buf, byteorder)) CROAK(("Byte order is not compatible")); @@ -4941,7 +5065,7 @@ first_time: /* Will disappear when support for old format is dropped */ default: return (SV *) 0; /* Failed */ } - KBUFCHK(len); /* Grow buffer as necessary */ + KBUFCHK((STRLEN)len); /* Grow buffer as necessary */ if (len) READ(kbuf, len); kbuf[len] = '\0'; /* Mark string end */ diff --git a/ext/Storable/t/croak.t b/ext/Storable/t/croak.t new file mode 100644 index 0000000000..ad07f3ad03 --- /dev/null +++ b/ext/Storable/t/croak.t @@ -0,0 +1,41 @@ +#!./perl -w + +# Please keep this test this simple. (ie just one test.) +# There's some sort of not-croaking properly problem in Storable when built +# with 5.005_03. This test shows it up, whereas malice.t does not. +# In particular, don't use Test; as this covers up the problem. + +sub BEGIN { + if ($ENV{PERL_CORE}){ + chdir('t') if -d 't'; + @INC = '.'; + push @INC, '../lib'; + } + require Config; import Config; + if ($ENV{PERL_CORE} and $Config{'extensions'} !~ /\bStorable\b/) { + print "1..0 # Skip: Storable was not built\n"; + exit 0; + } + # require 'lib/st-dump.pl'; +} + +use strict; + +BEGIN { + die "Oi! No! Don't change this test so that Carp is used before Storable" + if defined &Carp::carp; +} +use Storable qw(freeze thaw); + +print "1..2\n"; + +for my $test (1,2) { + eval {thaw "\xFF\xFF"}; + if ($@ =~ /Storable binary image v127.255 more recent than I am \(v2\.\d+\)/) + { + print "ok $test\n"; + } else { + chomp $@; + print "not ok $test # Expected a meaningful croak. Got '$@'\n"; + } +} diff --git a/ext/Storable/t/downgrade.t b/ext/Storable/t/downgrade.t new file mode 100644 index 0000000000..af5de4a62c --- /dev/null +++ b/ext/Storable/t/downgrade.t @@ -0,0 +1,378 @@ +#!./perl -w + +# +# Copyright 2002, Larry Wall. +# +# You may redistribute only under the same terms as Perl 5, as specified +# in the README file that comes with the distribution. +# + +# I ought to keep this test easily backwards compatible to 5.004, so no +# qr//; + +# This test checks downgrade behaviour on pre-5.8 perls when new 5.8 features +# are encountered. + +sub BEGIN { + if ($ENV{PERL_CORE}){ + chdir('t') if -d 't'; + @INC = '.'; + push @INC, '../lib'; + } + require Config; import Config; + if ($ENV{PERL_CORE} and $Config{'extensions'} !~ /\bStorable\b/) { + print "1..0 # Skip: Storable was not built\n"; + exit 0; + } + # require 'lib/st-dump.pl'; +} + +BEGIN { + if (ord 'A' != 65) { + die <<'EBCDIC'; +This test doesn't have EBCDIC data yet. Please run t/make_downgrade.pl using +perl 5.8 (or later) and append its output to the end of the test. +Please also mail the output to perlbug@perl.org so that the CPAN copy of +Storable can be updated. +EBCDIC + } +} +use Test::More; +use Storable 'thaw'; + +use strict; +use vars qw(@RESTRICT_TESTS %R_HASH %U_HASH $UTF8_CROAK $RESTRICTED_CROAK); + +@RESTRICT_TESTS = ('Locked hash', 'Locked hash placeholder', + 'Locked keys', 'Locked keys placeholder', + ); +%R_HASH = (perl => 'rules'); + +if ($] >= 5.007003) { + my $utf8 = "Schlo\xdf" . chr 256; + chop $utf8; + + %U_HASH = (map {$_, $_} 'castle', "ch\xe5teau", $utf8, chr 0x57CE); + plan tests => 169; +} elsif ($] >= 5.006) { + plan tests => 59; +} else { + plan tests => 67; +} + +$UTF8_CROAK = qr/^Cannot retrieve UTF8 data in non-UTF8 perl/; +$RESTRICTED_CROAK = qr/^Cannot retrieve restricted hash/; + +my %tests; +{ + local $/ = "\n\nend\n"; + while (<DATA>) { + next unless /\S/s; + unless (/begin ([0-7]{3}) ([^\n]*)\n(.*)$/s) { + s/\n.*//s; + warn "Dodgy data in section starting '$_'"; + next; + } + next unless oct $1 == ord 'A'; # Skip ASCII on EBCDIC, and vice versa + my $data = unpack 'u', $3; + $tests{$2} = $data; + } +} + +# use Data::Dumper; $Data::Dumper::Useqq = 1; print Dumper \%tests; +sub thaw_hash { + my ($name, $expected) = @_; + my $hash = eval {thaw $tests{$name}}; + is ($@, '', "Thawed $name without error?"); + isa_ok ($hash, 'HASH'); + ok (defined $hash && eq_hash($hash, $expected), + "And it is the hash we expected?"); + $hash; +} + +sub thaw_scalar { + my ($name, $expected) = @_; + my $scalar = eval {thaw $tests{$name}}; + is ($@, '', "Thawed $name without error?"); + isa_ok ($scalar, 'SCALAR', "Thawed $name?"); + is ($$scalar, $expected, "And it is the data we expected?"); + $scalar; +} + +sub thaw_fail { + my ($name, $expected) = @_; + my $thing = eval {thaw $tests{$name}}; + is ($thing, undef, "Thawed $name failed as expected?"); + like ($@, $expected, "Error as predicted?"); +} + +sub test_locked_hash { + my $hash = shift; + my @keys = keys %$hash; + my ($key, $value) = each %$hash; + eval {$hash->{$key} = reverse $value}; + like( $@, qr/^Modification of a read-only value attempted/, + 'trying to change a locked key' ); + is ($hash->{$key}, $value, "hash should not change?"); + eval {$hash->{use} = 'perl'}; + like( $@, qr/^Attempt to access disallowed key 'use' in a restricted hash/, + 'trying to add another key' ); + ok (eq_array([keys %$hash], \@keys), "Still the same keys?"); +} + +sub test_restricted_hash { + my $hash = shift; + my @keys = keys %$hash; + my ($key, $value) = each %$hash; + eval {$hash->{$key} = reverse $value}; + is( $@, '', + 'trying to change a restricted key' ); + is ($hash->{$key}, reverse ($value), "hash should change"); + eval {$hash->{use} = 'perl'}; + like( $@, qr/^Attempt to access disallowed key 'use' in a restricted hash/, + 'trying to add another key' ); + ok (eq_array([keys %$hash], \@keys), "Still the same keys?"); +} + +sub test_placeholder { + my $hash = shift; + eval {$hash->{rules} = 42}; + is ($@, '', 'No errors'); + is ($hash->{rules}, 42, "New value added"); +} + +sub test_newkey { + my $hash = shift; + eval {$hash->{nms} = "http://nms-cgi.sourceforge.net/"}; + is ($@, '', 'No errors'); + is ($hash->{nms}, "http://nms-cgi.sourceforge.net/", "New value added"); +} + +# $Storable::DEBUGME = 1; +thaw_hash ('Hash with utf8 flag but no utf8 keys', \%R_HASH); + +if (eval "use Hash::Util; 1") { + print "# We have Hash::Util, so test that the restricted hashes in <DATA> are valid\n"; + for $Storable::downgrade_restricted (0, 1, undef, "cheese") { + my $hash = thaw_hash ('Locked hash', \%R_HASH); + test_locked_hash ($hash); + $hash = thaw_hash ('Locked hash placeholder', \%R_HASH); + test_locked_hash ($hash); + test_placeholder ($hash); + + $hash = thaw_hash ('Locked keys', \%R_HASH); + test_restricted_hash ($hash); + $hash = thaw_hash ('Locked keys placeholder', \%R_HASH); + test_restricted_hash ($hash); + test_placeholder ($hash); + } +} else { + print "# We don't have Hash::Util, so test that the restricted hashes downgrade\n"; + my $hash = thaw_hash ('Locked hash', \%R_HASH); + test_newkey ($hash); + $hash = thaw_hash ('Locked hash placeholder', \%R_HASH); + test_newkey ($hash); + $hash = thaw_hash ('Locked keys', \%R_HASH); + test_newkey ($hash); + $hash = thaw_hash ('Locked keys placeholder', \%R_HASH); + test_newkey ($hash); + local $Storable::downgrade_restricted = 0; + thaw_fail ('Locked hash', $RESTRICTED_CROAK); + thaw_fail ('Locked hash placeholder', $RESTRICTED_CROAK); + thaw_fail ('Locked keys', $RESTRICTED_CROAK); + thaw_fail ('Locked keys placeholder', $RESTRICTED_CROAK); +} + +if ($] >= 5.006) { + print "# We have utf8 scalars, so test that the utf8 scalars in <DATA> are valid\n"; + print "# These seem to fail on 5.6 - you should seriously consider upgrading to 5.6.1\n" if $] == 5.006; + thaw_scalar ('Short 8 bit utf8 data', "\xDF"); + thaw_scalar ('Long 8 bit utf8 data', "\xDF" x 256); + thaw_scalar ('Short 24 bit utf8 data', chr 0xC0FFEE); + thaw_scalar ('Long 24 bit utf8 data', chr (0xC0FFEE) x 256); +} else { + print "# We don't have utf8 scalars, so test that the utf8 scalars downgrade\n"; + thaw_fail ('Short 8 bit utf8 data', $UTF8_CROAK); + thaw_fail ('Long 8 bit utf8 data', $UTF8_CROAK); + thaw_fail ('Short 24 bit utf8 data', $UTF8_CROAK); + thaw_fail ('Long 24 bit utf8 data', $UTF8_CROAK); + local $Storable::drop_utf8 = 1; + my $bytes = thaw $tests{'Short 8 bit utf8 data as bytes'}; + thaw_scalar ('Short 8 bit utf8 data', $$bytes); + thaw_scalar ('Long 8 bit utf8 data', $$bytes x 256); + $bytes = thaw $tests{'Short 24 bit utf8 data as bytes'}; + thaw_scalar ('Short 24 bit utf8 data', $$bytes); + thaw_scalar ('Long 24 bit utf8 data', $$bytes x 256); +} + +if ($] >= 5.007003) { + print "# We have utf8 hashes, so test that the utf8 hashes in <DATA> are valid\n"; + my $hash = thaw_hash ('Hash with utf8 keys', \%U_HASH); + for (keys %$hash) { + my $l = 0 + /^\w+$/; + my $r = 0 + $hash->{$_} =~ /^\w+$/; + cmp_ok ($l, '==', $r, sprintf "key length %d", length $_); + cmp_ok ($l, '==', $_ eq "ch\xe5teau" ? 0 : 1); + } + if (eval "use Hash::Util; 1") { + print "# We have Hash::Util, so test that the restricted utf8 hash is valid\n"; + my $hash = thaw_hash ('Locked hash with utf8 keys', \%U_HASH); + for (keys %$hash) { + my $l = 0 + /^\w+$/; + my $r = 0 + $hash->{$_} =~ /^\w+$/; + cmp_ok ($l, '==', $r, sprintf "key length %d", length $_); + cmp_ok ($l, '==', $_ eq "ch\xe5teau" ? 0 : 1); + } + test_locked_hash ($hash); + } else { + print "# We don't have Hash::Util, so test that the utf8 hash downgrades\n"; + fail ("You can't get here [perl version $]]. This is a bug in the test. +# Please send the output of perl -V to perlbug\@perl.org"); + } +} else { + print "# We don't have utf8 hashes, so test that the utf8 hashes downgrade\n"; + thaw_fail ('Hash with utf8 keys', $UTF8_CROAK); + thaw_fail ('Locked hash with utf8 keys', $UTF8_CROAK); + local $Storable::drop_utf8 = 1; + my $what = $] < 5.006 ? 'pre 5.6' : '5.6'; + my $expect = thaw $tests{"Hash with utf8 keys for $what"}; + thaw_hash ('Hash with utf8 keys', $expect); + #foreach (keys %$expect) { print "'$_':\t'$expect->{$_}'\n"; } + #foreach (keys %$got) { print "'$_':\t'$got->{$_}'\n"; } + if (eval "use Hash::Util; 1") { + print "# We have Hash::Util, so test that the restricted hashes in <DATA> are valid\n"; + fail ("You can't get here [perl version $]]. This is a bug in the test. +# Please send the output of perl -V to perlbug\@perl.org"); + } else { + print "# We don't have Hash::Util, so test that the restricted hashes downgrade\n"; + my $hash = thaw_hash ('Locked hash with utf8 keys', $expect); + test_newkey ($hash); + local $Storable::downgrade_restricted = 0; + thaw_fail ('Locked hash with utf8 keys', $RESTRICTED_CROAK); + # Which croak comes first is a bit of an implementation issue :-) + local $Storable::drop_utf8 = 0; + thaw_fail ('Locked hash with utf8 keys', $RESTRICTED_CROAK); + } +} +__END__ +# A whole run of 2.x nfreeze data, uuencoded. The "mode bits" are the octal +# value of 'A', the "file name" is the test name. Use make_downgrade.pl to +# generate these. +begin 101 Locked hash +8!049`0````$*!7)U;&5S!`````1P97)L + +end + +begin 101 Locked hash placeholder +C!049`0````(*!7)U;&5S!`````1P97)L#A0````%<G5L97,` + +end + +begin 101 Locked keys +8!049`0````$*!7)U;&5S``````1P97)L + +end + +begin 101 Locked keys placeholder +C!049`0````(*!7)U;&5S``````1P97)L#A0````%<G5L97,` + +end + +begin 101 Short 8 bit utf8 data +&!047`L.? + +end + +begin 101 Short 8 bit utf8 data as bytes +&!04*`L.? + +end + +begin 101 Long 8 bit utf8 data +M!048```"`,.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.? +MPY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_# +MG\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.? +MPY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_# +MG\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.? +MPY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_# +MG\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.? +MPY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_# +MG\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.? +MPY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_# +MG\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.? +8PY_#G\.?PY_#G\.?PY_#G\.?PY_#G\.? + +end + +begin 101 Short 24 bit utf8 data +)!047!?BPC[^N + +end + +begin 101 Short 24 bit utf8 data as bytes +)!04*!?BPC[^N + +end + +begin 101 Long 24 bit utf8 data +M!048```%`/BPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +MOZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N^+"/ +;OZ[XL(^_KOBPC[^N^+"/OZ[XL(^_KOBPC[^N + +end + +begin 101 Hash with utf8 flag but no utf8 keys +8!049``````$*!7)U;&5S``````1P97)L + +end + +begin 101 Hash with utf8 keys +M!049``````0*!F-A<W1L90`````&8V%S=&QE"@=C:.5T96%U``````=C:.5T +D96%U%P/EGXX!`````^6?CA<'4V-H;&_#GP(````&4V-H;&_? + +end + +begin 101 Locked hash with utf8 keys +M!049`0````0*!F-A<W1L900````&8V%S=&QE"@=C:.5T96%U!`````=C:.5T +D96%U%P/EGXX%`````^6?CA<'4V-H;&_#GP8````&4V-H;&_? + +end + +begin 101 Hash with utf8 keys for pre 5.6 +M!049``````0*!F-A<W1L90`````&8V%S=&QE"@=C:.5T96%U``````=C:.5T +D96%U"@/EGXX``````^6?C@H'4V-H;&_#GP(````&4V-H;&_? + +end + +begin 101 Hash with utf8 keys for 5.6 +M!049``````0*!F-A<W1L90`````&8V%S=&QE"@=C:.5T96%U``````=C:.5T +D96%U%P/EGXX``````^6?CA<'4V-H;&_#GP(````&4V-H;&_? + +end + diff --git a/ext/Storable/t/make_downgrade.pl b/ext/Storable/t/make_downgrade.pl new file mode 100644 index 0000000000..d806ebbaa1 --- /dev/null +++ b/ext/Storable/t/make_downgrade.pl @@ -0,0 +1,103 @@ +#!/usr/local/bin/perl -w +use strict; + +use 5.007003; +use Hash::Util qw(lock_hash unlock_hash lock_keys); +use Storable qw(nfreeze); + +# If this looks like a hack, it's probably because it is :-) +sub uuencode_it { + my ($data, $name) = @_; + my $frozen = nfreeze $data; + + my $uu = pack 'u', $frozen; + + printf "begin %3o $name\n", ord 'A'; + print $uu; + print "\nend\n\n"; +} + + +my %hash = (perl=>"rules"); + +lock_hash %hash; + +uuencode_it (\%hash, "Locked hash"); + +unlock_hash %hash; + +lock_keys %hash, 'perl', 'rules'; +lock_hash %hash; + +uuencode_it (\%hash, "Locked hash placeholder"); + +unlock_hash %hash; + +lock_keys %hash, 'perl'; + +uuencode_it (\%hash, "Locked keys"); + +unlock_hash %hash; + +lock_keys %hash, 'perl', 'rules'; + +uuencode_it (\%hash, "Locked keys placeholder"); + +unlock_hash %hash; + +my $utf8 = "\x{DF}\x{100}"; +chop $utf8; + +uuencode_it (\$utf8, "Short 8 bit utf8 data"); + +utf8::encode ($utf8); + +uuencode_it (\$utf8, "Short 8 bit utf8 data as bytes"); + +$utf8 x= 256; + +uuencode_it (\$utf8, "Long 8 bit utf8 data"); + +$utf8 = "\x{C0FFEE}"; + +uuencode_it (\$utf8, "Short 24 bit utf8 data"); + +utf8::encode ($utf8); + +uuencode_it (\$utf8, "Short 24 bit utf8 data as bytes"); + +$utf8 x= 256; + +uuencode_it (\$utf8, "Long 24 bit utf8 data"); + +# Hash which has the utf8 bit set, but no longer has any utf8 keys +my %uhash = ("\x{100}", "gone", "perl", "rules"); +delete $uhash{"\x{100}"}; + +# use Devel::Peek; Dump \%uhash; +uuencode_it (\%uhash, "Hash with utf8 flag but no utf8 keys"); + +$utf8 = "Schlo\xdf" . chr 256; +chop $utf8; +%uhash = (map {$_, $_} 'castle', "ch\xe5teau", $utf8, "\x{57CE}"); + +uuencode_it (\%uhash, "Hash with utf8 keys"); + +lock_hash %uhash; + +uuencode_it (\%uhash, "Locked hash with utf8 keys"); + +my (%pre56, %pre58); + +while (my ($key, $val) = each %uhash) { + # hash keys are always stored downgraded to bytes if possible, with a flag + # to say "promote back to utf8" + # Whereas scalars are stored as is. + utf8::encode ($key) if ord $key > 256; + $pre58{$key} = $val; + utf8::encode ($val) unless $val eq "ch\xe5teau"; + $pre56{$key} = $val; + +} +uuencode_it (\%pre56, "Hash with utf8 keys for pre 5.6"); +uuencode_it (\%pre58, "Hash with utf8 keys for 5.6"); diff --git a/ext/Storable/t/malice.t b/ext/Storable/t/malice.t index 54c0ea4c1c..9f1d8ff201 100644 --- a/ext/Storable/t/malice.t +++ b/ext/Storable/t/malice.t @@ -30,14 +30,14 @@ sub BEGIN { } use strict; -use vars qw($file_magic_str $other_magic $network_magic $major $minor); - -# header size depends on the size of the byteorder string +use vars qw($file_magic_str $other_magic $network_magic $major $minor + $minor_write); $file_magic_str = 'pst0'; $other_magic = 7 + length($Config{byteorder}); $network_magic = 2; $major = 2; $minor = 5; +$minor_write = $] > 5.007 ? 5 : 4; use Test; BEGIN { plan tests => 334 + length($Config{byteorder}) * 4} @@ -63,7 +63,7 @@ sub test_header { my ($header, $isfile, $isnetorder) = @_; ok (!!$header->{file}, !!$isfile, "is file"); ok ($header->{major}, $major, "major number"); - ok ($header->{minor}, $minor, "minor number"); + ok ($header->{minor}, $minor_write, "minor number"); ok (!!$header->{netorder}, !!$isnetorder, "is network order"); if ($isnetorder) { # Skip these @@ -148,24 +148,34 @@ sub test_things { } $copy = $contents; - my $minor1 = $header->{minor} + 1; - substr ($copy, $file_magic + 1, 1) = chr $minor1; + # Needs to be more than 1, as we're already coding a spread of 1 minor version + # number on writes (2.5, 2.4). May increase to 2 if we figure we can do 2.3 + # on 5.005_03 (No utf8). + # 4 allows for a small safety margin + # (Joke: + # Question: What is the value of pi? + # Mathematician answers "It's pi, isn't it" + # Physicist answers "3.1, within experimental error" + # Engineer answers "Well, allowing for a small safety margin, 18" + # ) + my $minor4 = $header->{minor} + 4; + substr ($copy, $file_magic + 1, 1) = chr $minor4; test_corrupt ($copy, $sub, - "/^Storable binary image v$header->{major}\.$minor1 more recent than I am \\(v$header->{major}\.$header->{minor}\\)/", + "/^Storable binary image v$header->{major}\.$minor4 more recent than I am \\(v$header->{major}\.$minor\\)/", "higher minor"); $copy = $contents; my $major1 = $header->{major} + 1; substr ($copy, $file_magic, 1) = chr 2*$major1; test_corrupt ($copy, $sub, - "/^Storable binary image v$major1\.$header->{minor} more recent than I am \\(v$header->{major}\.$header->{minor}\\)/", + "/^Storable binary image v$major1\.$header->{minor} more recent than I am \\(v$header->{major}\.$minor\\)/", "higher major"); # Continue messing with the previous copy - $minor1 = $header->{minor} - 1; + my $minor1 = $header->{minor} - 1; substr ($copy, $file_magic + 1, 1) = chr $minor1; test_corrupt ($copy, $sub, - "/^Storable binary image v$major1\.$minor1 more recent than I am \\(v$header->{major}\.$header->{minor}\\)/", + "/^Storable binary image v$major1\.$minor1 more recent than I am \\(v$header->{major}\.$minor\\)/", "higher major, lower minor"); my $where; diff --git a/ext/Storable/t/restrict.t b/ext/Storable/t/restrict.t index 0eb299ff52..841baab3c8 100644 --- a/ext/Storable/t/restrict.t +++ b/ext/Storable/t/restrict.t @@ -1,4 +1,4 @@ -#!./perl +#!./perl -w # # Copyright 2002, Larry Wall. @@ -8,13 +8,24 @@ # sub BEGIN { - chdir('t') if -d 't'; - @INC = '.'; - push @INC, '../lib'; - require Config; import Config; - if ($Config{'extensions'} !~ /\bStorable\b/) { - print "1..0 # Skip: Storable was not built\n"; - exit 0; + if ($ENV{PERL_CORE}){ + chdir('t') if -d 't'; + @INC = '.'; + push @INC, '../lib'; + require Config; + if ($Config::Config{'extensions'} !~ /\bStorable\b/) { + print "1..0 # Skip: Storable was not built\n"; + exit 0; + } + } else { + unless (eval "require Hash::Util") { + if ($@ =~ /Can\'t locate Hash\/Util\.pm in \@INC/) { + print "1..0 # Skip: No Hash::Util\n"; + exit 0; + } else { + die; + } + } } require 'lib/st-dump.pl'; } @@ -67,7 +78,7 @@ sub testit { unless (ok ++$test, !$@, "Can assign to reserved key 'extra'?") { my $diag = $@; $diag =~ s/\n.*\z//s; - print "# \$@: $diag\n"; + print "# \$\@: $diag\n"; } eval { $copy->{nono} = 7 } ; diff --git a/ext/Storable/t/utf8hash.t b/ext/Storable/t/utf8hash.t index 5e93914799..25d5307399 100644 --- a/ext/Storable/t/utf8hash.t +++ b/ext/Storable/t/utf8hash.t @@ -38,6 +38,8 @@ use bytes (); use Encode qw(is_utf8); my %utf8hash; +$Storable::canonical = $Storable::canonical; # Shut up a used only once warning. + for $Storable::canonical (0, 1) { # first we generate a nasty hash which keys include both utf8 diff --git a/ext/Time/HiRes/HiRes.xs b/ext/Time/HiRes/HiRes.xs index 529223160e..9d3586dee8 100644 --- a/ext/Time/HiRes/HiRes.xs +++ b/ext/Time/HiRes/HiRes.xs @@ -618,7 +618,7 @@ sleep(...) if (items > 0) { NV seconds = SvNV(ST(0)); if (seconds >= 0.0) { - UV useconds = 1E6 * (seconds - (UV)seconds); + UV useconds = (UV)(1E6 * (seconds - (UV)seconds)); if (seconds >= 1.0) sleep((UV)seconds); usleep(useconds); diff --git a/ext/Unicode/Normalize/Normalize.xs b/ext/Unicode/Normalize/Normalize.xs index 3cb221fd75..93cb471a5d 100644 --- a/ext/Unicode/Normalize/Normalize.xs +++ b/ext/Unicode/Normalize/Normalize.xs @@ -553,10 +553,10 @@ getComposite(uv, uv2) UV uv2 PROTOTYPE: $$ PREINIT: - UV comp; + UV composite; CODE: - comp = composite_uv(uv, uv2); - RETVAL = comp ? newSVuv(comp) : &PL_sv_undef; + composite = composite_uv(uv, uv2); + RETVAL = composite ? newSVuv(composite) : &PL_sv_undef; OUTPUT: RETVAL |