summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2001-03-27 20:24:31 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2001-03-27 20:24:31 +0000
commit60ad9f697ba2c0e755139d5afc80911e68d22894 (patch)
treeeb8b461ed09d1aa3c50f0a74a94105d9fcfcc833
parent521a8ec7fe1e7a81f500dfcdc4a8085b16566a30 (diff)
downloadperl-60ad9f697ba2c0e755139d5afc80911e68d22894.tar.gz
Integrate perlio:
[ 9384] Various EBCDIC fixes: - major revelation that swash code is encoding aware, (or thought it was) - now it is ;-) - With that out of the way fix a slab of tr/// cases. - Fix Encode 'Unicode' to be true Unicode so tests pass. - As anticipated Base64.xs needed tweaks. - Until tr/// works right avoid old_encode64 in MIME tests. p4raw-link: @9384 on //depot/perlio: 5ad8ef521b3ffc4e6bbbb9941bc4940d442b56b2 p4raw-id: //depot/perl@9389
-rw-r--r--doop.c26
-rw-r--r--ext/Encode/Encode.pm39
-rw-r--r--ext/MIME/Base64/Base64.xs24
-rw-r--r--ext/MIME/Base64/QuotedPrint.pm34
-rw-r--r--t/lib/encode.t4
-rw-r--r--t/lib/mimeb64.t78
-rw-r--r--utf8.c26
7 files changed, 166 insertions, 65 deletions
diff --git a/doop.c b/doop.c
index 823c88d18e..266411a009 100644
--- a/doop.c
+++ b/doop.c
@@ -36,7 +36,7 @@ S_do_trans_simple(pTHX_ SV *sv)
tbl = (short*)cPVOP->op_pv;
if (!tbl)
- Perl_croak(aTHX_ "panic: do_trans_simple");
+ Perl_croak(aTHX_ "panic: do_trans_simple line %d",__LINE__);
s = (U8*)SvPV(sv, len);
send = s + len;
@@ -103,7 +103,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
tbl = (short*)cPVOP->op_pv;
if (!tbl)
- Perl_croak(aTHX_ "panic: do_trans_count");
+ Perl_croak(aTHX_ "panic: do_trans_count line %d",__LINE__);
s = (U8*)SvPV(sv, len);
send = s + len;
@@ -147,7 +147,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
tbl = (short*)cPVOP->op_pv;
if (!tbl)
- Perl_croak(aTHX_ "panic: do_trans_complex");
+ Perl_croak(aTHX_ "panic: do_trans_complex line %d",__LINE__);
s = (U8*)SvPV(sv, len);
isutf8 = SvUTF8(sv);
@@ -346,7 +346,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
if ((uv = swash_fetch(rv, s)) < none) {
s += UTF8SKIP(s);
matches++;
- d = uvchr_to_utf8(d, uv);
+ d = uvuni_to_utf8(d, uv);
}
else if (uv == none) {
int i = UTF8SKIP(s);
@@ -358,7 +358,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
int i = UTF8SKIP(s);
s += i;
matches++;
- d = uvchr_to_utf8(d, final);
+ d = uvuni_to_utf8(d, final);
}
else
s += UTF8SKIP(s);
@@ -367,7 +367,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
STRLEN clen = d - dstart;
STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
if (!grows)
- Perl_croak(aTHX_ "panic: do_trans_complex_utf8");
+ Perl_croak(aTHX_ "panic: do_trans_simple_utf8 line %d",__LINE__);
Renew(dstart, nlen+UTF8_MAXLEN, U8);
d = dstart + clen;
dend = dstart + nlen;
@@ -496,7 +496,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
STRLEN clen = d - dstart;
STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
if (!grows)
- Perl_croak(aTHX_ "panic: do_trans_complex_utf8");
+ Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
Renew(dstart, nlen+UTF8_MAXLEN, U8);
d = dstart + clen;
dend = dstart + nlen;
@@ -505,7 +505,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
matches++;
s += UTF8SKIP(s);
if (uv != puv) {
- d = uvchr_to_utf8(d, uv);
+ d = uvuni_to_utf8(d, uv);
puv = uv;
}
continue;
@@ -523,13 +523,13 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
if (havefinal) {
s += UTF8SKIP(s);
if (puv != final) {
- d = uvchr_to_utf8(d, final);
+ d = uvuni_to_utf8(d, final);
puv = final;
}
}
else {
STRLEN len;
- uv = utf8_to_uvchr(s, &len);
+ uv = utf8_to_uvuni(s, &len);
if (uv != puv) {
Copy(s, d, len, U8);
d += len;
@@ -550,7 +550,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
STRLEN clen = d - dstart;
STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
if (!grows)
- Perl_croak(aTHX_ "panic: do_trans_complex_utf8");
+ Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
Renew(dstart, nlen+UTF8_MAXLEN, U8);
d = dstart + clen;
dend = dstart + nlen;
@@ -558,7 +558,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
if (uv < none) {
matches++;
s += UTF8SKIP(s);
- d = uvchr_to_utf8(d, uv);
+ d = uvuni_to_utf8(d, uv);
continue;
}
else if (uv == none) { /* "none" is unmapped character */
@@ -571,7 +571,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
else if (uv == extra && !del) {
matches++;
s += UTF8SKIP(s);
- d = uvchr_to_utf8(d, final);
+ d = uvuni_to_utf8(d, final);
continue;
}
matches++; /* "none+1" is delete character */
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index 650180647b..fde3891b6f 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -60,6 +60,7 @@ sub findAlias
my $alias = $alias[$i];
my $val = $alias[$i+1];
my $new;
+
if (ref($alias) eq 'Regexp' && $_ =~ $alias)
{
$new = eval $val;
@@ -68,7 +69,7 @@ sub findAlias
{
$new = &{$alias}($val)
}
- elsif (lc($_) eq $alias)
+ elsif (lc($_) eq lc($alias))
{
$new = $val;
}
@@ -222,13 +223,15 @@ sub new_sequence { return $_[0] }
package Encode::XS;
use base 'Encode::Encoding';
-package Encode::Unicode;
+package Encode::Internal;
use base 'Encode::Encoding';
# Dummy package that provides the encode interface but leaves data
# as UTF-X encoded. It is here so that from_to() works.
-__PACKAGE__->Define('Unicode');
+__PACKAGE__->Define('Internal');
+
+Encode::define_alias( 'Unicode' => 'Internal' ) if ord('A') == 65;
sub decode
{
@@ -240,6 +243,36 @@ sub decode
*encode = \&decode;
+package Encoding::Unicode;
+use base 'Encode::Encoding';
+
+__PACKAGE__->Define('Unicode') unless ord('A') == 65;
+
+sub decode
+{
+ my ($obj,$str,$chk) = @_;
+ my $res = '';
+ for (my $i = 0; $i < length($str); $i++)
+ {
+ $res .= chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
+ }
+ $_[1] = '' if $chk;
+ return $res;
+}
+
+sub encode
+{
+ my ($obj,$str,$chk) = @_;
+ my $res = '';
+ for (my $i = 0; $i < length($str); $i++)
+ {
+ $res .= chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
+ }
+ $_[1] = '' if $chk;
+ return $res;
+}
+
+
package Encode::utf8;
use base 'Encode::Encoding';
# package to allow long-hand
diff --git a/ext/MIME/Base64/Base64.xs b/ext/MIME/Base64/Base64.xs
index 118d170823..f77ba14eab 100644
--- a/ext/MIME/Base64/Base64.xs
+++ b/ext/MIME/Base64/Base64.xs
@@ -11,15 +11,15 @@ metamail, which comes with this message:
Copyright (c) 1991 Bell Communications Research, Inc. (Bellcore)
- Permission to use, copy, modify, and distribute this material
- for any purpose and without fee is hereby granted, provided
- that the above copyright notice and this permission notice
- appear in all copies, and that the name of Bellcore not be
- used in advertising or publicity pertaining to this
- material without the specific, prior written permission
- of an authorized representative of Bellcore. BELLCORE
- MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY
- OF THIS MATERIAL FOR ANY PURPOSE. IT IS PROVIDED "AS IS",
+ Permission to use, copy, modify, and distribute this material
+ for any purpose and without fee is hereby granted, provided
+ that the above copyright notice and this permission notice
+ appear in all copies, and that the name of Bellcore not be
+ used in advertising or publicity pertaining to this
+ material without the specific, prior written permission
+ of an authorized representative of Bellcore. BELLCORE
+ MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY
+ OF THIS MATERIAL FOR ANY PURPOSE. IT IS PROVIDED "AS IS",
WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
*/
@@ -160,7 +160,7 @@ decode_base64(sv)
PREINIT:
STRLEN len;
- register unsigned char *str = (unsigned char*)SvPV(sv, len);
+ register unsigned char *str = (unsigned char*)SvPVbyte(sv, len);
unsigned char const* end = str + len;
char *r;
unsigned char c[4];
@@ -177,7 +177,7 @@ decode_base64(sv)
while (str < end) {
int i = 0;
do {
- unsigned char uc = index_64[*str++];
+ unsigned char uc = index_64[NATIVE_TO_ASCII(*str++)];
if (uc != INVALID)
c[i++] = uc;
@@ -192,7 +192,7 @@ decode_base64(sv)
break;
}
} while (i < 4);
-
+
if (c[0] == EQ || c[1] == EQ) {
if (PL_dowarn) warn("Premature padding of base64 data");
break;
diff --git a/ext/MIME/Base64/QuotedPrint.pm b/ext/MIME/Base64/QuotedPrint.pm
index ccdee2bbfa..069f3226e9 100644
--- a/ext/MIME/Base64/QuotedPrint.pm
+++ b/ext/MIME/Base64/QuotedPrint.pm
@@ -112,4 +112,38 @@ sub decode_qp ($)
*encode = \&encode_qp;
*decode = \&decode_qp;
+# Methods for use as a PerlIO layer object
+
+sub PUSHED
+{
+ my ($class,$mode) = @_;
+ # When writing we buffer the data
+ my $write = '';
+ return bless \$write,$class;
+}
+
+sub FILL
+{
+ my ($obj,$fh) = @_;
+ my $line = <$fh>;
+ return (defined $line) ? decode_qp($line) : undef;
+ return undef;
+}
+
+sub WRITE
+{
+ my ($obj,$buf,$fh) = @_;
+ $$obj .= encode_qp($buf);
+ return length($buf);
+}
+
+sub FLUSH
+{
+ my ($obj,$fh) = @_;
+ print $fh $$obj or return -1;
+ $$obj = '';
+ return 0;
+}
+
+
1;
diff --git a/t/lib/encode.t b/t/lib/encode.t
index d4a13eeeaa..ceeb422672 100644
--- a/t/lib/encode.t
+++ b/t/lib/encode.t
@@ -30,10 +30,10 @@ $cpy = $str;
ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong");
my $sym = Encode->getEncoding('symbol');
-my $uni = $sym->decode('a');
+my $uni = $sym->decode(encode(ascii => 'a'));
ok("\N{alpha}",substr($uni,0,1),"alpha does not map to symbol 'a'");
$str = $sym->encode("\N{Beta}");
-ok("B",substr($str,0,1),"Symbol 'B' does not map to Beta");
+ok("B",decode(ascii => substr($str,0,1)),"Symbol 'B' does not map to Beta");
foreach my $enc (qw(symbol dingbats ascii),@encodings)
{
diff --git a/t/lib/mimeb64.t b/t/lib/mimeb64.t
index 5bb78b12fe..b1225521f3 100644
--- a/t/lib/mimeb64.t
+++ b/t/lib/mimeb64.t
@@ -9,6 +9,16 @@ print "1..282\n";
print "# Testing MIME::Base64-", $MIME::Base64::VERSION, "\n";
+BEGIN {
+ if (ord('A') != 193) {
+ *ASCII = sub { return $_[0] };
+ }
+ else {
+ require Encode;
+ *ASCII = sub { Encode::encode('ascii',$_[0]) };
+ }
+}
+
$testno = 1;
encodeTest();
@@ -23,27 +33,6 @@ sub encodeTest
print "# encode test\n";
my @encode_tests = (
- ['' => ''],
- ['a' => 'YQ=='],
- ['aa' => 'YWE='],
- ['aaa' => 'YWFh'],
-
- ['aaa' => 'YWFh'],
- ['aaa' => 'YWFh'],
- ['aaa' => 'YWFh'],
-
- ["\000\377" => "AP8="],
- ["\377\000" => "/wA="],
- ["\000\000\000" => "AAAA"],
-
- # from HTTP spec
- ['Aladdin:open sesame' => 'QWxhZGRpbjpvcGVuIHNlc2FtZQ=='],
-
- ['a' x 100 => 'YWFh' x 33 . 'YQ=='],
-
- ['Multipurpose Internet Mail Extensions: The Base64 Content-Transfer-Encoding is designed to represent sequences of octets in a form that is not humanly readable. '
- => "TXVsdGlwdXJwb3NlIEludGVybmV0IE1haWwgRXh0ZW5zaW9uczogVGhlIEJhc2U2NCBDb250ZW50LVRyYW5zZmVyLUVuY29kaW5nIGlzIGRlc2lnbmVkIHRvIHJlcHJlc2VudCBzZXF1ZW5jZXMgb2Ygb2N0ZXRzIGluIGEgZm9ybSB0aGF0IGlzIG5vdCBodW1hbmx5IHJlYWRhYmxlLiA="],
-
# All values
["\000" => "AA=="],
["\001" => "AQ=="],
@@ -301,6 +290,29 @@ sub encodeTest
["\375" => "/Q=="],
["\376" => "/g=="],
["\377" => "/w=="],
+
+ ["\000\377" => "AP8="],
+ ["\377\000" => "/wA="],
+ ["\000\000\000" => "AAAA"],
+
+ ['' => ''],
+ [ASCII('a') => 'YQ=='],
+ [ASCII('aa') => 'YWE='],
+ [ASCII('aaa') => 'YWFh'],
+
+ [ASCII('aaa') => 'YWFh'],
+ [ASCII('aaa') => 'YWFh'],
+ [ASCII('aaa') => 'YWFh'],
+
+
+ # from HTTP spec
+ [ASCII('Aladdin:open sesame') => 'QWxhZGRpbjpvcGVuIHNlc2FtZQ=='],
+
+ [ASCII('a') x 100 => 'YWFh' x 33 . 'YQ=='],
+
+ [ASCII('Multipurpose Internet Mail Extensions: The Base64 Content-Transfer-Encoding is designed to represent sequences of octets in a form that is not humanly readable. ')
+ => "TXVsdGlwdXJwb3NlIEludGVybmV0IE1haWwgRXh0ZW5zaW9uczogVGhlIEJhc2U2NCBDb250ZW50LVRyYW5zZmVyLUVuY29kaW5nIGlzIGRlc2lnbmVkIHRvIHJlcHJlc2VudCBzZXF1ZW5jZXMgb2Ygb2N0ZXRzIGluIGEgZm9ybSB0aGF0IGlzIG5vdCBodW1hbmx5IHJlYWRhYmxlLiA="],
+
);
for $test (@encode_tests) {
@@ -313,11 +325,12 @@ sub encodeTest
}
my $decoded = decode_base64($encoded);
if ($decoded ne $plain) {
- print "test $testno ($plain): expected $expected, got $encoded\n";
+ print "test $testno ($encoded): expected $plain, got $decoded\n";
print "not ";
}
- # Try the old C versions too
+ if (ord('A') != 193) { # perl versions broken on EBCDIC
+ # Try the old Perl versions too
if ($encoded ne MIME::Base64::old_encode_base64($plain, '')) {
print "old_encode_base64 give different result.\n";
print "not ";
@@ -326,6 +339,7 @@ sub encodeTest
print "old_decode_base64 give different result.\n";
print "not ";
}
+ }
print "ok $testno\n";
$testno++;
@@ -339,17 +353,17 @@ sub decodeTest
local $SIG{__WARN__} = sub { print $_[0] }; # avoid warnings on stderr
my @decode_tests = (
- ['YWE=' => 'aa'],
- [' YWE=' => 'aa'],
- ['Y WE=' => 'aa'],
- ['YWE= ' => 'aa'],
- ["Y\nW\r\nE=" => 'aa'],
+ ['YWE=' => ASCII('aa')],
+ [' YWE=' => ASCII('aa')],
+ ['Y WE=' => ASCII('aa')],
+ ['YWE= ' => ASCII('aa')],
+ ["Y\nW\r\nE=" => ASCII('aa')],
# These will generate some warnings
- ['YWE=====' => 'aa'], # extra padding
- ['YWE' => 'aa'], # missing padding
- ['YWFh====' => 'aaa'],
- ['YQ' => 'a'],
+ ['YWE=====' => ASCII('aa')], # extra padding
+ ['YWE' => ASCII('aa')], # missing padding
+ ['YWFh====' => ASCII('aaa')],
+ ['YQ' => ASCII('a')],
['Y' => ''],
['' => ''],
[undef() => ''],
diff --git a/utf8.c b/utf8.c
index b95c7ad164..66d3fec81c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1280,14 +1280,34 @@ UV
Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
{
HV* hv = (HV*)SvRV(sv);
+ /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
+ then the "swatch" is a vec() for al the chars which start
+ with 0xAA..0xYY
+ So the key in the hash is length of encoded char -1
+ */
U32 klen = UTF8SKIP(ptr) - 1;
- U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */
+ U32 off = ptr[klen];
STRLEN slen;
- STRLEN needents = (klen ? 64 : 128);
+ STRLEN needents;
U8 *tmps;
U32 bit;
SV *retval;
+ if (klen == 0)
+ {
+ /* If char in invariant then swatch is for all the invariant chars
+ * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
+ */
+ needents = UTF_CONTINUATION_MARK;
+ off = NATIVE_TO_UTF(ptr[klen]);
+ }
+ else
+ {
+ /* If char is encoded then swatch is for the prefix */
+ needents = (1 << UTF_ACCUMULATION_SHIFT);
+ off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
+ }
+
/*
* This single-entry cache saves about 1/3 of the utf8 overhead in test
* suite. (That is, only 7-8% overall over just a hash cache. Still,
@@ -1337,7 +1357,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
svp = hv_store(hv, (char*)ptr, klen, retval, 0);
- if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
+ if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
}