diff options
author | Nicholas Clark <nick@ccl4.org> | 2009-10-01 15:42:40 +0100 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2009-10-01 15:42:40 +0100 |
commit | 762503fe89d6bc3dbcf0ca6003a3c53f728553a6 (patch) | |
tree | bc39a76096a9e6f0fd2731fac9ce2d5395b73ca0 /cpan/encoding-warnings | |
parent | a8fcbca871bd80c3bc14d13fa544740e0753130c (diff) | |
download | perl-762503fe89d6bc3dbcf0ca6003a3c53f728553a6.tar.gz |
Move encoding::warnings from ext/ to cpan/
Diffstat (limited to 'cpan/encoding-warnings')
-rw-r--r-- | cpan/encoding-warnings/Changes | 27 | ||||
-rw-r--r-- | cpan/encoding-warnings/lib/encoding/warnings.pm | 239 | ||||
-rw-r--r-- | cpan/encoding-warnings/t/1-warning.t | 36 | ||||
-rw-r--r-- | cpan/encoding-warnings/t/2-fatal.t | 34 | ||||
-rw-r--r-- | cpan/encoding-warnings/t/3-normal.t | 23 | ||||
-rw-r--r-- | cpan/encoding-warnings/t/4-lexical.t | 49 |
6 files changed, 408 insertions, 0 deletions
diff --git a/cpan/encoding-warnings/Changes b/cpan/encoding-warnings/Changes new file mode 100644 index 0000000000..4ada1a7c3e --- /dev/null +++ b/cpan/encoding-warnings/Changes @@ -0,0 +1,27 @@ +[Changes for 0.11 - 2007-06-05] + +* This module's effect is now lexical for Perl 5.9.5 and later. +* "no encoding::warnings" is made more inefficient by doing away + with an empty encoding handler. + Contributed by: Rafaƫl Garcia-Suarez + +[Changes for 0.04 - 2004-03-16] + +* This be 0.04, from the YAPC::Taipei::2004 release party. +* Various POD grammar updates. +* Mentions that we will be making encoding.pm to be lexical during 5.9. + +[Changes for 0.03 - 2004-03-15] + +* Fixes various typo and punctuations. +* Unified terminology to use "byte-string" and "unicode-string". + +[Changes for 0.02 - 2004-03-14] + +* Added lots of documentations, as well as explained the subtlety of + "use encoding" better. Prompted by Ton Hospel. +* Do not bother decoding a string twice if it is us-ascii. + +[Changes for 0.01 - 2004-03-14] + +* Initial release on CPAN. diff --git a/cpan/encoding-warnings/lib/encoding/warnings.pm b/cpan/encoding-warnings/lib/encoding/warnings.pm new file mode 100644 index 0000000000..5e6aec0c8f --- /dev/null +++ b/cpan/encoding-warnings/lib/encoding/warnings.pm @@ -0,0 +1,239 @@ +package encoding::warnings; +$encoding::warnings::VERSION = '0.11'; + +use strict; +use 5.007; + +=head1 NAME + +encoding::warnings - Warn on implicit encoding conversions + +=head1 VERSION + +This document describes version 0.11 of encoding::warnings, released +June 5, 2007. + +=head1 SYNOPSIS + + use encoding::warnings; # or 'FATAL' to raise fatal exceptions + + utf8::encode($a = chr(20000)); # a byte-string (raw bytes) + $b = chr(20000); # a unicode-string (wide characters) + + # "Bytes implicitly upgraded into wide characters as iso-8859-1" + $c = $a . $b; + +=head1 DESCRIPTION + +=head2 Overview of the problem + +By default, there is a fundamental asymmetry in Perl's unicode model: +implicit upgrading from byte-strings to unicode-strings assumes that +they were encoded in I<ISO 8859-1 (Latin-1)>, but unicode-strings are +downgraded with UTF-8 encoding. This happens because the first 256 +codepoints in Unicode happens to agree with Latin-1. + +However, this silent upgrading can easily cause problems, if you happen +to mix unicode strings with non-Latin1 data -- i.e. byte-strings encoded +in UTF-8 or other encodings. The error will not manifest until the +combined string is written to output, at which time it would be impossible +to see where did the silent upgrading occur. + +=head2 Detecting the problem + +This module simplifies the process of diagnosing such problems. Just put +this line on top of your main program: + + use encoding::warnings; + +Afterwards, implicit upgrading of high-bit bytes will raise a warning. +Ex.: C<Bytes implicitly upgraded into wide characters as iso-8859-1 at +- line 7>. + +However, strings composed purely of ASCII code points (C<0x00>..C<0x7F>) +will I<not> trigger this warning. + +You can also make the warnings fatal by importing this module as: + + use encoding::warnings 'FATAL'; + +=head2 Solving the problem + +Most of the time, this warning occurs when a byte-string is concatenated +with a unicode-string. There are a number of ways to solve it: + +=over 4 + +=item * Upgrade both sides to unicode-strings + +If your program does not need compatibility for Perl 5.6 and earlier, +the recommended approach is to apply appropriate IO disciplines, so all +data in your program become unicode-strings. See L<encoding>, L<open> and +L<perlfunc/binmode> for how. + +=item * Downgrade both sides to byte-strings + +The other way works too, especially if you are sure that all your data +are under the same encoding, or if compatibility with older versions +of Perl is desired. + +You may downgrade strings with C<Encode::encode> and C<utf8::encode>. +See L<Encode> and L<utf8> for details. + +=item * Specify the encoding for implicit byte-string upgrading + +If you are confident that all byte-strings will be in a specific +encoding like UTF-8, I<and> need not support older versions of Perl, +use the C<encoding> pragma: + + use encoding 'utf8'; + +Similarly, this will silence warnings from this module, and preserve the +default behaviour: + + use encoding 'iso-8859-1'; + +However, note that C<use encoding> actually had three distinct effects: + +=over 4 + +=item * PerlIO layers for B<STDIN> and B<STDOUT> + +This is similar to what L<open> pragma does. + +=item * Literal conversions + +This turns I<all> literal string in your program into unicode-strings +(equivalent to a C<use utf8>), by decoding them using the specified +encoding. + +=item * Implicit upgrading for byte-strings + +This will silence warnings from this module, as shown above. + +=back + +Because literal conversions also work on empty strings, it may surprise +some people: + + use encoding 'big5'; + + my $byte_string = pack("C*", 0xA4, 0x40); + print length $a; # 2 here. + $a .= ""; # concatenating with a unicode string... + print length $a; # 1 here! + +In other words, do not C<use encoding> unless you are certain that the +program will not deal with any raw, 8-bit binary data at all. + +However, the C<Filter =E<gt> 1> flavor of C<use encoding> will I<not> +affect implicit upgrading for byte-strings, and is thus incapable of +silencing warnings from this module. See L<encoding> for more details. + +=back + +=head1 CAVEATS + +For Perl 5.9.4 or later, this module's effect is lexical. + +For Perl versions prior to 5.9.4, this module affects the whole script, +instead of inside its lexical block. + +=cut + +# Constants. +sub ASCII () { 0 } +sub LATIN1 () { 1 } +sub FATAL () { 2 } + +# Install a ${^ENCODING} handler if no other one are already in place. +sub import { + my $class = shift; + my $fatal = shift || ''; + + local $@; + return if ${^ENCODING} and ref(${^ENCODING}) ne $class; + return unless eval { require Encode; 1 }; + + my $ascii = Encode::find_encoding('us-ascii') or return; + my $latin1 = Encode::find_encoding('iso-8859-1') or return; + + # Have to undef explicitly here + undef ${^ENCODING}; + + # Install a warning handler for decode() + my $decoder = bless( + [ + $ascii, + $latin1, + (($fatal eq 'FATAL') ? 'Carp::croak' : 'Carp::carp'), + ], $class, + ); + + ${^ENCODING} = $decoder; + $^H{$class} = 1; +} + +sub unimport { + my $class = shift; + $^H{$class} = undef; + undef ${^ENCODING}; +} + +# Don't worry about source code literals. +sub cat_decode { + my $self = shift; + return $self->[LATIN1]->cat_decode(@_); +} + +# Warn if the data is not purely US-ASCII. +sub decode { + my $self = shift; + + DO_WARN: { + if ($] >= 5.009004) { + my $hints = (caller(0))[10]; + $hints->{ref($self)} or last DO_WARN; + } + + local $@; + my $rv = eval { $self->[ASCII]->decode($_[0], Encode::FB_CROAK()) }; + return $rv unless $@; + + require Carp; + no strict 'refs'; + $self->[FATAL]->( + "Bytes implicitly upgraded into wide characters as iso-8859-1" + ); + + } + + return $self->[LATIN1]->decode(@_); +} + +sub name { 'iso-8859-1' } + +1; + +__END__ + +=head1 SEE ALSO + +L<perlunicode>, L<perluniintro> + +L<open>, L<utf8>, L<encoding>, L<Encode> + +=head1 AUTHORS + +Audrey Tang + +=head1 COPYRIGHT + +Copyright 2004, 2005, 2006, 2007 by Audrey Tang E<lt>cpan@audreyt.orgE<gt>. + +This program is free software; you can redistribute it and/or modify it +under the same terms as Perl itself. + +See L<http://www.perl.com/perl/misc/Artistic.html> + +=cut diff --git a/cpan/encoding-warnings/t/1-warning.t b/cpan/encoding-warnings/t/1-warning.t new file mode 100644 index 0000000000..c7525aef1f --- /dev/null +++ b/cpan/encoding-warnings/t/1-warning.t @@ -0,0 +1,36 @@ +#!/usr/bin/perl +# $File: /member/local/autrijus/encoding-warnings//t/1-warning.t $ $Author: autrijus $ +# $Revision: #5 $ $Change: 6145 $ $DateTime: 2004-07-16T03:49:06.717424Z $ + +BEGIN { + unless (eval { require Encode } ) { + print "1..0 # Skip: no Encode\n"; + exit 0; + } +} + +use Test; +BEGIN { plan tests => 2 } + +use strict; +use encoding::warnings; +ok(encoding::warnings->VERSION); + +if ($] < 5.008) { + ok(1); + exit; +} + +my ($a, $b, $c, $ok); + +$SIG{__WARN__} = sub { + if ($_[0] =~ /upgraded/) { ok(1); exit } +}; + +utf8::encode($a = chr(20000)); +$b = chr(20000); +$c = $a . $b; + +ok($ok); + +__END__ diff --git a/cpan/encoding-warnings/t/2-fatal.t b/cpan/encoding-warnings/t/2-fatal.t new file mode 100644 index 0000000000..4fc16a1df6 --- /dev/null +++ b/cpan/encoding-warnings/t/2-fatal.t @@ -0,0 +1,34 @@ +#!/usr/bin/perl +# $File: /member/local/autrijus/encoding-warnings/t/2-fatal.t $ $Author: autrijus $ +# $Revision: #4 $ $Change: 1626 $ $DateTime: 2004-03-14T16:53:19.351256Z $ + +BEGIN { + unless (eval { require Encode } ) { + print "1..0 # Skip: no Encode\n"; + exit 0; + } +} + +use Test; +BEGIN { plan tests => 2 } + +use strict; +use encoding::warnings 'FATAL'; +ok(encoding::warnings->VERSION); + +if ($] < 5.008) { + ok(1); + exit; +} + +my ($a, $b, $c, $ok); + +$SIG{__DIE__} = sub { + if ($_[0] =~ /upgraded/) { ok(1); exit } +}; + +utf8::encode($a = chr(20000)); +$b = chr(20000); +$c = $a . $b; + +ok($ok); diff --git a/cpan/encoding-warnings/t/3-normal.t b/cpan/encoding-warnings/t/3-normal.t new file mode 100644 index 0000000000..f0e6446a56 --- /dev/null +++ b/cpan/encoding-warnings/t/3-normal.t @@ -0,0 +1,23 @@ +use Test; +BEGIN { plan tests => 2 } + +use strict; +use encoding::warnings 'FATAL'; +ok(encoding::warnings->VERSION); + +if ($] < 5.008) { + ok(1); + exit; +} + +my ($a, $b, $c, $ok); +$ok = 1; + +$SIG{__DIE__} = sub { $ok = 0 }; +$SIG{__WARN__} = sub { $ok = 0 }; + +$a = chr(20000); +$b = chr(20000); +$c = $a . $b; + +ok($ok); diff --git a/cpan/encoding-warnings/t/4-lexical.t b/cpan/encoding-warnings/t/4-lexical.t new file mode 100644 index 0000000000..e80c50411a --- /dev/null +++ b/cpan/encoding-warnings/t/4-lexical.t @@ -0,0 +1,49 @@ +use strict; +use Test; +BEGIN { + use Config; + if ($Config::Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + + plan tests => 3; +} + +{ + use encoding::warnings; + ok(encoding::warnings->VERSION); + + if ($] < 5.009004) { + ok('skipped'); + ok('skipped'); + exit; + } + + my ($a, $b, $c, $warned); + + local $SIG{__WARN__} = sub { + if ($_[0] =~ /upgraded/) { $warned = 1 } + }; + + utf8::encode($a = chr(20000)); + $b = chr(20000); + $c = $a . $b; + ok($warned); +} + +{ + my ($a, $b, $c, $warned); + + local $SIG{__WARN__} = sub { + if ($_[0] =~ /upgraded/) { $warned = 1 } + }; + + utf8::encode($a = chr(20000)); + $b = chr(20000); + $c = $a . $b; + ok(!$warned); +} + + +__END__ |