From 1485817ebf8dde6b1eba1bd4d8bf7c6e2e9fb14b Mon Sep 17 00:00:00 2001 From: Dan Kogai Date: Sun, 18 May 2003 09:45:35 +0900 Subject: [Encode] UTF-7 Support Message-Id: <99C4504E-887E-11D7-840A-000393AE4244@dan.co.jp> p4raw-id: //depot/perl@19548 --- ext/Encode/Changes | 7 ++ ext/Encode/MANIFEST | 1 + ext/Encode/Unicode/Unicode.pm | 10 ++- ext/Encode/lib/Encode/Alias.pm | 2 + ext/Encode/lib/Encode/Config.pm | 1 + ext/Encode/lib/Encode/Supported.pod | 4 ++ ext/Encode/lib/Encode/Unicode/UTF7.pm | 117 ++++++++++++++++++++++++++++++++++ ext/Encode/t/Unicode.t | 20 +++++- 8 files changed, 158 insertions(+), 4 deletions(-) create mode 100644 ext/Encode/lib/Encode/Unicode/UTF7.pm (limited to 'ext/Encode') diff --git a/ext/Encode/Changes b/ext/Encode/Changes index 1e68f4374b..4edb594eeb 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -3,6 +3,13 @@ # $Id: Changes,v 1.94 2003/05/10 18:13:59 dankogai Exp $ # $Revision: 1.94 $ $Date: 2003/05/10 18:13:59 $ ++ lib/Encode/Unicode/UTF7.pm +! lib/Encode/Config.pm lib/Encode/Alias.pm Unicode/Unicode.pm t/Unicode.t + lib/Encode/Supported.pod + UTF-7 support is now added. With this Encode now has all transcoding + methods in Unicode::String. + +1.94 2003/05/10 18:13:59 ! lib/Encode/MIME/Header.pm A more sophisticated solution for double-encoding by dankogai ! lib/Encode/MIME/Header.pm AUTHORS diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST index 86aaea7fca..0d08c79166 100644 --- a/ext/Encode/MANIFEST +++ b/ext/Encode/MANIFEST @@ -51,6 +51,7 @@ lib/Encode/KR/2022_KR.pm Encode extension lib/Encode/MIME/Header.pm Encode extension lib/Encode/PerlIO.pod Documents for Encode & PerlIO lib/Encode/Supported.pod Documents for supported encodings +lib/Encode/Unicode/UTF7.pm Encode extension t/Aliases.t test script t/CJKT.t test script t/Encode.t test script diff --git a/ext/Encode/Unicode/Unicode.pm b/ext/Encode/Unicode/Unicode.pm index bcd698a9df..721c9f7851 100644 --- a/ext/Encode/Unicode/Unicode.pm +++ b/ext/Encode/Unicode/Unicode.pm @@ -287,9 +287,13 @@ for UTF-8, which is a native format in perl). =item L says: I A character encoding form plus byte -serialization. There are seven character encoding schemes in Unicode: +serialization. There are Seven character encoding schemes in Unicode: UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and -UTF-32LE (UCS-4LE). +UTF-32LE (UCS-4LE), and UTF-7. + +Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, It is not part of +Unicode's Character Encoding Scheme. It is separately implemented in +Encode::Unicode::UTF7. For details see L. =item Quick Reference @@ -434,7 +438,7 @@ every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I. =head1 SEE ALSO -L, L, +L, L, L, L, RFC 2781 L, diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm index 7dbc47badb..b29bfd9871 100644 --- a/ext/Encode/lib/Encode/Alias.pm +++ b/ext/Encode/lib/Encode/Alias.pm @@ -1,5 +1,6 @@ package Encode::Alias; use strict; +no warnings 'redefine'; use Encode; our $VERSION = do { my @r = (q$Revision: 1.35 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; @@ -128,6 +129,7 @@ sub init_aliases define_alias( qr/^(.*)$/ => '"\L$1"' ); # UTF/UCS stuff + define_alias( qr/^UTF-?7$/i => '"UTF-7"'); define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' ); define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"', qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")', diff --git a/ext/Encode/lib/Encode/Config.pm b/ext/Encode/lib/Encode/Config.pm index a834967a11..0fe77d6b95 100644 --- a/ext/Encode/lib/Encode/Config.pm +++ b/ext/Encode/lib/Encode/Config.pm @@ -98,6 +98,7 @@ our %ExtModule = 'UTF-32' => 'Encode::Unicode', 'UTF-32BE' => 'Encode::Unicode', 'UTF-32LE' => 'Encode::Unicode', + 'UTF-7' => 'Encode::Unicode::UTF7', ); unless (ord("A") == 193){ diff --git a/ext/Encode/lib/Encode/Supported.pod b/ext/Encode/lib/Encode/Supported.pod index e2645c4a66..d09fc0a513 100644 --- a/ext/Encode/lib/Encode/Supported.pod +++ b/ext/Encode/lib/Encode/Supported.pod @@ -87,11 +87,15 @@ Encode::Unicode, which will be autoloaded on demand. UTF-32 [UC] UTF-32BE UCS-4 [UC] UTF-32LE [UC] + UTF-7 [RFC2152] ---------------------------------------------------------------- To find how (UCS-2|UTF-(16|32))(LE|BE)? differ from one another, see L. +UTF-7 is a special encoding which "re-encodes" UTF-16BE into a 7-bit +encoding. It is implemeneted seperately by Encode::Unicode::UTF7. + =head2 Encode::Byte -- Extended ASCII Encode::Byte implements most single-byte encodings except for diff --git a/ext/Encode/lib/Encode/Unicode/UTF7.pm b/ext/Encode/lib/Encode/Unicode/UTF7.pm new file mode 100644 index 0000000000..c3bcd3b1c1 --- /dev/null +++ b/ext/Encode/lib/Encode/Unicode/UTF7.pm @@ -0,0 +1,117 @@ +# +# $Id: UTF7.pm,v 0.1 2003/05/16 18:06:24 dankogai Exp dankogai $ +# +package Encode::Unicode::UTF7; +use strict; +no warnings 'redefine'; +use base qw(Encode::Encoding); +__PACKAGE__->Define('UTF-7'); +our $VERSION = do { my @r = (q$Revision: 0.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +use MIME::Base64; +use Encode; + +# +# Algorithms taken from Unicode::String by Gisle Aas +# + +our $OPTIONAL_DIRECT_CHARS = 1; +my $specials = quotemeta "\'(),-.:?"; +$OPTIONAL_DIRECT_CHARS and + $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}"; +# \s will not work because it matches U+3000 DEOGRAPHIC SPACE +# We use \x00-\x20 instead (controls + space) +my $re_asis = qr/(?:[\x00-\x20A-Za-z0-9$specials])/; +my $re_encoded = qr/(?:[^\x00-\x20A-Za-z0-9$specials])/; +my $e_utf16 = find_encoding("UTF-16BE"); + +sub needs_lines { 1 }; + +sub encode($$;$){ + my ($obj, $str, $chk) = @_; + my $len = length($str); + pos($str) = 0; + my $bytes = ''; + while (pos($str) < $len){ + if ($str =~ /\G($re_asis+)/ogc){ + $bytes .= $1; + }elsif($str =~ /\G($re_encoded+)/ogsc){ + if ($1 eq "+"){ + $bytes .= "+-"; + }else{ + my $base64 = encode_base64($e_utf16->encode($1), ''); + $base64 =~ s/=+$//; + $bytes .= "+$base64-"; + } + }else{ + die "This should not happen! (pos=" . pos($str) . ")"; + } + } + $_[1] = '' if $chk; + return $bytes; +} + +sub decode{ + my ($obj, $bytes, $chk) = @_; + my $len = length($bytes); + my $str = ""; + while (pos($bytes) < $len) { + if ($bytes =~ /\G([^+]+)/ogc) { + $str .= $1; + }elsif($bytes =~ /\G\+-/ogc) { + $str .= "+"; + }elsif($bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc) { + my $base64 = $1; + my $pad = length($base64) % 4; + $base64 .= "=" x (4 - $pad) if $pad; + $str .= $e_utf16->decode(decode_base64($base64)); + }elsif($bytes =~ /\G\+/ogc) { + $^W and warn "Bad UTF7 data escape"; + $str .= "+"; + }else{ + die "This should not happen " . pos($bytes); + } + } + $_[1] = '' if $chk; + return $str; +} +1; +__END__ + +=head1 NAME + +Encode::Unicode::UTF7 -- UTF-7 encoding + +=head1 SYNOPSIS + + use Encode qw/encode decode/; + $utf7 = encode("UTF-7", $utf8); + $utf8 = decode("UTF-7", $ucs2); + +=head1 ABSTRACT + +This module implements UTF-7 encoding documented in RFC 2152. UTF-7, +as its name suggests, is a 7-bit re-encoded version of UTF-16BE. It +is designed to be MTA-safe and expected to be a standard way to +exchange Unicoded mails via mails. But with the advent of UTF-8 and +8-bit compliant MTAs, UTF-7 is hardly ever used. + +UTF-7 was not supported by Encode until version 1.95 because of that. +But Unicode::String, a module by Gisle Aas which adds Unicode supports +to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added +so Encode can supersede Unicode::String 100%. + +=head1 In Practice + +When you want to encode Unicode for mails and web pages, however, do +not use UTF-7 unless you are sure your recipients and readers can +handle it. Very few MUAs and WWW Browsers support these days (only +Mozilla seems to support one). For general cases, use UTF-8 for +message body and MIME-Header for header instead. + +=head1 SEE ALSO + +L, L, L + +RFC 2781 L + +=cut diff --git a/ext/Encode/t/Unicode.t b/ext/Encode/t/Unicode.t index fb0ca1aad5..50e5ba0ac5 100644 --- a/ext/Encode/t/Unicode.t +++ b/ext/Encode/t/Unicode.t @@ -18,7 +18,7 @@ BEGIN { use strict; #use Test::More 'no_plan'; -use Test::More tests => 30; +use Test::More tests => 37; use Encode qw(encode decode); # @@ -103,6 +103,24 @@ SKIP: { } }; +# +# CJKT vs. UTF-7 +# +use File::Spec; +use File::Basename; + +my $dir = dirname(__FILE__); +opendir my $dh, $dir or die "$dir:$!"; +my @file = sort grep {/\.utf$/o} readdir $dh; +closedir $dh; +for my $file (@file){ + my $path = File::Spec->catfile($dir, $file); + open my $fh, '<:utf8', $path or die "$path:$!"; + my $content = join('' => <$fh>); + close $fh; + is(decode("UTF-7", encode("UTF-7", $content)), $content, + "UTF-7 RT:$file"); +} 1; __END__ -- cgit v1.2.1