summaryrefslogtreecommitdiff
path: root/ext/Encode
diff options
context:
space:
mode:
authorDan Kogai <dankogai@dan.co.jp>2003-05-18 09:45:35 +0900
committerJarkko Hietaniemi <jhi@iki.fi>2003-05-17 16:39:19 +0000
commit1485817ebf8dde6b1eba1bd4d8bf7c6e2e9fb14b (patch)
tree2bba0473a06f71a84173c1614ad1cecd19a80ac9 /ext/Encode
parentfde91635d459dc9b5c5563809bb62a4fc882e73c (diff)
downloadperl-1485817ebf8dde6b1eba1bd4d8bf7c6e2e9fb14b.tar.gz
[Encode] UTF-7 Support
Message-Id: <99C4504E-887E-11D7-840A-000393AE4244@dan.co.jp> p4raw-id: //depot/perl@19548
Diffstat (limited to 'ext/Encode')
-rw-r--r--ext/Encode/Changes7
-rw-r--r--ext/Encode/MANIFEST1
-rw-r--r--ext/Encode/Unicode/Unicode.pm10
-rw-r--r--ext/Encode/lib/Encode/Alias.pm2
-rw-r--r--ext/Encode/lib/Encode/Config.pm1
-rw-r--r--ext/Encode/lib/Encode/Supported.pod4
-rw-r--r--ext/Encode/lib/Encode/Unicode/UTF7.pm117
-rw-r--r--ext/Encode/t/Unicode.t20
8 files changed, 158 insertions, 4 deletions
diff --git a/ext/Encode/Changes b/ext/Encode/Changes
index 1e68f4374b..4edb594eeb 100644
--- a/ext/Encode/Changes
+++ b/ext/Encode/Changes
@@ -3,6 +3,13 @@
# $Id: Changes,v 1.94 2003/05/10 18:13:59 dankogai Exp $
#
$Revision: 1.94 $ $Date: 2003/05/10 18:13:59 $
++ lib/Encode/Unicode/UTF7.pm
+! lib/Encode/Config.pm lib/Encode/Alias.pm Unicode/Unicode.pm t/Unicode.t
+ lib/Encode/Supported.pod
+ UTF-7 support is now added. With this Encode now has all transcoding
+ methods in Unicode::String.
+
+1.94 2003/05/10 18:13:59
! lib/Encode/MIME/Header.pm
A more sophisticated solution for double-encoding by dankogai
! lib/Encode/MIME/Header.pm AUTHORS
diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST
index 86aaea7fca..0d08c79166 100644
--- a/ext/Encode/MANIFEST
+++ b/ext/Encode/MANIFEST
@@ -51,6 +51,7 @@ lib/Encode/KR/2022_KR.pm Encode extension
lib/Encode/MIME/Header.pm Encode extension
lib/Encode/PerlIO.pod Documents for Encode & PerlIO
lib/Encode/Supported.pod Documents for supported encodings
+lib/Encode/Unicode/UTF7.pm Encode extension
t/Aliases.t test script
t/CJKT.t test script
t/Encode.t test script
diff --git a/ext/Encode/Unicode/Unicode.pm b/ext/Encode/Unicode/Unicode.pm
index bcd698a9df..721c9f7851 100644
--- a/ext/Encode/Unicode/Unicode.pm
+++ b/ext/Encode/Unicode/Unicode.pm
@@ -287,9 +287,13 @@ for UTF-8, which is a native format in perl).
=item L<http://www.unicode.org/glossary/> says:
I<Character Encoding Scheme> A character encoding form plus byte
-serialization. There are seven character encoding schemes in Unicode:
+serialization. There are Seven character encoding schemes in Unicode:
UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and
-UTF-32LE (UCS-4LE).
+UTF-32LE (UCS-4LE), and UTF-7.
+
+Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, It is not part of
+Unicode's Character Encoding Scheme. It is separately implemented in
+Encode::Unicode::UTF7. For details see L<Encode::Unicode::UTF7>.
=item Quick Reference
@@ -434,7 +438,7 @@ every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
=head1 SEE ALSO
-L<Encode>, L<http://www.unicode.org/glossary/>,
+L<Encode>, L<Encode::Unicode::UTF7>, L<http://www.unicode.org/glossary/>,
L<http://www.unicode.org/unicode/faq/utf_bom.html>,
RFC 2781 L<http://rfc.net/rfc2781.html>,
diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm
index 7dbc47badb..b29bfd9871 100644
--- a/ext/Encode/lib/Encode/Alias.pm
+++ b/ext/Encode/lib/Encode/Alias.pm
@@ -1,5 +1,6 @@
package Encode::Alias;
use strict;
+no warnings 'redefine';
use Encode;
our $VERSION = do { my @r = (q$Revision: 1.35 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
@@ -128,6 +129,7 @@ sub init_aliases
define_alias( qr/^(.*)$/ => '"\L$1"' );
# UTF/UCS stuff
+ define_alias( qr/^UTF-?7$/i => '"UTF-7"');
define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
define_alias( qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
diff --git a/ext/Encode/lib/Encode/Config.pm b/ext/Encode/lib/Encode/Config.pm
index a834967a11..0fe77d6b95 100644
--- a/ext/Encode/lib/Encode/Config.pm
+++ b/ext/Encode/lib/Encode/Config.pm
@@ -98,6 +98,7 @@ our %ExtModule =
'UTF-32' => 'Encode::Unicode',
'UTF-32BE' => 'Encode::Unicode',
'UTF-32LE' => 'Encode::Unicode',
+ 'UTF-7' => 'Encode::Unicode::UTF7',
);
unless (ord("A") == 193){
diff --git a/ext/Encode/lib/Encode/Supported.pod b/ext/Encode/lib/Encode/Supported.pod
index e2645c4a66..d09fc0a513 100644
--- a/ext/Encode/lib/Encode/Supported.pod
+++ b/ext/Encode/lib/Encode/Supported.pod
@@ -87,11 +87,15 @@ Encode::Unicode, which will be autoloaded on demand.
UTF-32 [UC]
UTF-32BE UCS-4 [UC]
UTF-32LE [UC]
+ UTF-7 [RFC2152]
----------------------------------------------------------------
To find how (UCS-2|UTF-(16|32))(LE|BE)? differ from one another,
see L<Encode::Unicode>.
+UTF-7 is a special encoding which "re-encodes" UTF-16BE into a 7-bit
+encoding. It is implemeneted seperately by Encode::Unicode::UTF7.
+
=head2 Encode::Byte -- Extended ASCII
Encode::Byte implements most single-byte encodings except for
diff --git a/ext/Encode/lib/Encode/Unicode/UTF7.pm b/ext/Encode/lib/Encode/Unicode/UTF7.pm
new file mode 100644
index 0000000000..c3bcd3b1c1
--- /dev/null
+++ b/ext/Encode/lib/Encode/Unicode/UTF7.pm
@@ -0,0 +1,117 @@
+#
+# $Id: UTF7.pm,v 0.1 2003/05/16 18:06:24 dankogai Exp dankogai $
+#
+package Encode::Unicode::UTF7;
+use strict;
+no warnings 'redefine';
+use base qw(Encode::Encoding);
+__PACKAGE__->Define('UTF-7');
+our $VERSION = do { my @r = (q$Revision: 0.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+use MIME::Base64;
+use Encode;
+
+#
+# Algorithms taken from Unicode::String by Gisle Aas
+#
+
+our $OPTIONAL_DIRECT_CHARS = 1;
+my $specials = quotemeta "\'(),-.:?";
+$OPTIONAL_DIRECT_CHARS and
+ $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}";
+# \s will not work because it matches U+3000 DEOGRAPHIC SPACE
+# We use \x00-\x20 instead (controls + space)
+my $re_asis = qr/(?:[\x00-\x20A-Za-z0-9$specials])/;
+my $re_encoded = qr/(?:[^\x00-\x20A-Za-z0-9$specials])/;
+my $e_utf16 = find_encoding("UTF-16BE");
+
+sub needs_lines { 1 };
+
+sub encode($$;$){
+ my ($obj, $str, $chk) = @_;
+ my $len = length($str);
+ pos($str) = 0;
+ my $bytes = '';
+ while (pos($str) < $len){
+ if ($str =~ /\G($re_asis+)/ogc){
+ $bytes .= $1;
+ }elsif($str =~ /\G($re_encoded+)/ogsc){
+ if ($1 eq "+"){
+ $bytes .= "+-";
+ }else{
+ my $base64 = encode_base64($e_utf16->encode($1), '');
+ $base64 =~ s/=+$//;
+ $bytes .= "+$base64-";
+ }
+ }else{
+ die "This should not happen! (pos=" . pos($str) . ")";
+ }
+ }
+ $_[1] = '' if $chk;
+ return $bytes;
+}
+
+sub decode{
+ my ($obj, $bytes, $chk) = @_;
+ my $len = length($bytes);
+ my $str = "";
+ while (pos($bytes) < $len) {
+ if ($bytes =~ /\G([^+]+)/ogc) {
+ $str .= $1;
+ }elsif($bytes =~ /\G\+-/ogc) {
+ $str .= "+";
+ }elsif($bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc) {
+ my $base64 = $1;
+ my $pad = length($base64) % 4;
+ $base64 .= "=" x (4 - $pad) if $pad;
+ $str .= $e_utf16->decode(decode_base64($base64));
+ }elsif($bytes =~ /\G\+/ogc) {
+ $^W and warn "Bad UTF7 data escape";
+ $str .= "+";
+ }else{
+ die "This should not happen " . pos($bytes);
+ }
+ }
+ $_[1] = '' if $chk;
+ return $str;
+}
+1;
+__END__
+
+=head1 NAME
+
+Encode::Unicode::UTF7 -- UTF-7 encoding
+
+=head1 SYNOPSIS
+
+ use Encode qw/encode decode/;
+ $utf7 = encode("UTF-7", $utf8);
+ $utf8 = decode("UTF-7", $ucs2);
+
+=head1 ABSTRACT
+
+This module implements UTF-7 encoding documented in RFC 2152. UTF-7,
+as its name suggests, is a 7-bit re-encoded version of UTF-16BE. It
+is designed to be MTA-safe and expected to be a standard way to
+exchange Unicoded mails via mails. But with the advent of UTF-8 and
+8-bit compliant MTAs, UTF-7 is hardly ever used.
+
+UTF-7 was not supported by Encode until version 1.95 because of that.
+But Unicode::String, a module by Gisle Aas which adds Unicode supports
+to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added
+so Encode can supersede Unicode::String 100%.
+
+=head1 In Practice
+
+When you want to encode Unicode for mails and web pages, however, do
+not use UTF-7 unless you are sure your recipients and readers can
+handle it. Very few MUAs and WWW Browsers support these days (only
+Mozilla seems to support one). For general cases, use UTF-8 for
+message body and MIME-Header for header instead.
+
+=head1 SEE ALSO
+
+L<Encode>, L<Encode::Unicode>, L<Unicode::String>
+
+RFC 2781 L<http://www.ietf.org/rfc/rfc2152.txt>
+
+=cut
diff --git a/ext/Encode/t/Unicode.t b/ext/Encode/t/Unicode.t
index fb0ca1aad5..50e5ba0ac5 100644
--- a/ext/Encode/t/Unicode.t
+++ b/ext/Encode/t/Unicode.t
@@ -18,7 +18,7 @@ BEGIN {
use strict;
#use Test::More 'no_plan';
-use Test::More tests => 30;
+use Test::More tests => 37;
use Encode qw(encode decode);
#
@@ -103,6 +103,24 @@ SKIP: {
}
};
+#
+# CJKT vs. UTF-7
+#
+use File::Spec;
+use File::Basename;
+
+my $dir = dirname(__FILE__);
+opendir my $dh, $dir or die "$dir:$!";
+my @file = sort grep {/\.utf$/o} readdir $dh;
+closedir $dh;
+for my $file (@file){
+ my $path = File::Spec->catfile($dir, $file);
+ open my $fh, '<:utf8', $path or die "$path:$!";
+ my $content = join('' => <$fh>);
+ close $fh;
+ is(decode("UTF-7", encode("UTF-7", $content)), $content,
+ "UTF-7 RT:$file");
+}
1;
__END__