Integrate perlio:

[ 9384] Various EBCDIC fixes: - major revelation that swash code is encoding aware, (or thought it was) - now it is ;-) - With that out of the way fix a slab of tr/// cases. - Fix Encode 'Unicode' to be true Unicode so tests pass. - As anticipated Base64.xs needed tweaks. - Until tr/// works right avoid old_encode64 in MIME tests. p4raw-link: @9384 on //depot/perlio: 5ad8ef521b3ffc4e6bbbb9941bc4940d442b56b2 p4raw-id: //depot/perl@9389
author: Jarkko Hietaniemi <jhi@iki.fi> 2001-03-27 20:24:31 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2001-03-27 20:24:31 +0000
commit: 60ad9f697ba2c0e755139d5afc80911e68d22894 (patch)
tree: eb8b461ed09d1aa3c50f0a74a94105d9fcfcc833
parent: 521a8ec7fe1e7a81f500dfcdc4a8085b16566a30 (diff)
download: perl-60ad9f697ba2c0e755139d5afc80911e68d22894.tar.gz
7 files changed, 166 insertions, 65 deletions
diff --git a/doop.c b/doop.c
index 823c88d18e..266411a009 100644
--- a/doop.c
+++ b/doop.c
@@ -36,7 +36,7 @@ S_do_trans_simple(pTHX_ SV *sv)
 
     tbl = (short*)cPVOP->op_pv;
     if (!tbl)
-	Perl_croak(aTHX_ "panic: do_trans_simple");
+	Perl_croak(aTHX_ "panic: do_trans_simple line %d",__LINE__);
 
     s = (U8*)SvPV(sv, len);
     send = s + len;
@@ -103,7 +103,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
 
     tbl = (short*)cPVOP->op_pv;
     if (!tbl)
-	Perl_croak(aTHX_ "panic: do_trans_count");
+	Perl_croak(aTHX_ "panic: do_trans_count line %d",__LINE__);
 
     s = (U8*)SvPV(sv, len);
     send = s + len;
@@ -147,7 +147,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
 
     tbl = (short*)cPVOP->op_pv;
     if (!tbl)
-	Perl_croak(aTHX_ "panic: do_trans_complex");
+	Perl_croak(aTHX_ "panic: do_trans_complex line %d",__LINE__);
 
     s = (U8*)SvPV(sv, len);
     isutf8 = SvUTF8(sv);
@@ -346,7 +346,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
 	if ((uv = swash_fetch(rv, s)) < none) {
 	    s += UTF8SKIP(s);
 	    matches++;
-	    d = uvchr_to_utf8(d, uv);
+	    d = uvuni_to_utf8(d, uv);
 	}
 	else if (uv == none) {
 	    int i = UTF8SKIP(s);
@@ -358,7 +358,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
 	    int i = UTF8SKIP(s);
 	    s += i;
 	    matches++;
-	    d = uvchr_to_utf8(d, final);
+	    d = uvuni_to_utf8(d, final);
 	}
 	else
 	    s += UTF8SKIP(s);
@@ -367,7 +367,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
 	    STRLEN clen = d - dstart;
 	    STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
 	    if (!grows)
-		Perl_croak(aTHX_ "panic: do_trans_complex_utf8");
+		Perl_croak(aTHX_ "panic: do_trans_simple_utf8 line %d",__LINE__);
 	    Renew(dstart, nlen+UTF8_MAXLEN, U8);
 	    d = dstart + clen;
 	    dend = dstart + nlen;
@@ -496,7 +496,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
 	        STRLEN clen = d - dstart;
 		STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
 		if (!grows)
-		    Perl_croak(aTHX_ "panic: do_trans_complex_utf8");
+		    Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
 		Renew(dstart, nlen+UTF8_MAXLEN, U8);
 		d = dstart + clen;
 		dend = dstart + nlen;
@@ -505,7 +505,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
 		matches++;
 		s += UTF8SKIP(s);
 		if (uv != puv) {
-		    d = uvchr_to_utf8(d, uv);
+		    d = uvuni_to_utf8(d, uv);
 		    puv = uv;
 		}
 		continue;
@@ -523,13 +523,13 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
 		if (havefinal) {
 		    s += UTF8SKIP(s);
 		    if (puv != final) {
-			d = uvchr_to_utf8(d, final);
+			d = uvuni_to_utf8(d, final);
 			puv = final;
 		    }
 		}
 		else {
 		    STRLEN len;
-		    uv = utf8_to_uvchr(s, &len);
+		    uv = utf8_to_uvuni(s, &len);
 		    if (uv != puv) {
 			Copy(s, d, len, U8);
 			d += len;
@@ -550,7 +550,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
 	        STRLEN clen = d - dstart;
 		STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
 		if (!grows)
-		    Perl_croak(aTHX_ "panic: do_trans_complex_utf8");
+		    Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
 		Renew(dstart, nlen+UTF8_MAXLEN, U8);
 		d = dstart + clen;
 		dend = dstart + nlen;
@@ -558,7 +558,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
 	    if (uv < none) {
 		matches++;
 		s += UTF8SKIP(s);
-		d = uvchr_to_utf8(d, uv);
+		d = uvuni_to_utf8(d, uv);
 		continue;
 	    }
 	    else if (uv == none) {	/* "none" is unmapped character */
@@ -571,7 +571,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
 	    else if (uv == extra && !del) {
 		matches++;
 		s += UTF8SKIP(s);
-		d = uvchr_to_utf8(d, final);
+		d = uvuni_to_utf8(d, final);
 		continue;
 	    }
 	    matches++;			/* "none+1" is delete character */
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index 650180647b..fde3891b6f 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -60,6 +60,7 @@ sub findAlias
      my $alias = $alias[$i];
      my $val   = $alias[$i+1];
      my $new;
+
      if (ref($alias) eq 'Regexp' && $_ =~ $alias)
       {
        $new = eval $val;
@@ -68,7 +69,7 @@ sub findAlias
       {
        $new = &{$alias}($val)
       }
-     elsif (lc($_) eq $alias)
+     elsif (lc($_) eq lc($alias))
       {
        $new = $val;
       }
@@ -222,13 +223,15 @@ sub new_sequence { return $_[0] }
 package Encode::XS;
 use base 'Encode::Encoding';
 
-package Encode::Unicode;
+package Encode::Internal;
 use base 'Encode::Encoding';
 
 # Dummy package that provides the encode interface but leaves data
 # as UTF-X encoded. It is here so that from_to() works.
 
-__PACKAGE__->Define('Unicode');
+__PACKAGE__->Define('Internal');
+
+Encode::define_alias( 'Unicode' => 'Internal' ) if ord('A') == 65;
 
 sub decode
 {
@@ -240,6 +243,36 @@ sub decode
 
 *encode = \&decode;
 
+package Encoding::Unicode;
+use base 'Encode::Encoding';
+
+__PACKAGE__->Define('Unicode') unless ord('A') == 65;
+
+sub decode
+{
+ my ($obj,$str,$chk) = @_;
+ my $res = '';
+ for (my $i = 0; $i < length($str); $i++)
+  {
+   $res .= chr(utf8::unicode_to_native(ord(substr($str,$i,1))));
+  }
+ $_[1] = '' if $chk;
+ return $res;
+}
+
+sub encode
+{
+ my ($obj,$str,$chk) = @_;
+ my $res = '';
+ for (my $i = 0; $i < length($str); $i++)
+  {
+   $res .= chr(utf8::native_to_unicode(ord(substr($str,$i,1))));
+  }
+ $_[1] = '' if $chk;
+ return $res;
+}
+
+
 package Encode::utf8;
 use base 'Encode::Encoding';
 # package to allow long-hand
diff --git a/ext/MIME/Base64/Base64.xs b/ext/MIME/Base64/Base64.xs
index 118d170823..f77ba14eab 100644
--- a/ext/MIME/Base64/Base64.xs
+++ b/ext/MIME/Base64/Base64.xs
@@ -11,15 +11,15 @@ metamail, which comes with this message:
 
   Copyright (c) 1991 Bell Communications Research, Inc. (Bellcore)
 
-  Permission to use, copy, modify, and distribute this material 
-  for any purpose and without fee is hereby granted, provided 
-  that the above copyright notice and this permission notice 
-  appear in all copies, and that the name of Bellcore not be 
-  used in advertising or publicity pertaining to this 
-  material without the specific, prior written permission 
-  of an authorized representative of Bellcore.	BELLCORE 
-  MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY 
-  OF THIS MATERIAL FOR ANY PURPOSE.  IT IS PROVIDED "AS IS", 
+  Permission to use, copy, modify, and distribute this material
+  for any purpose and without fee is hereby granted, provided
+  that the above copyright notice and this permission notice
+  appear in all copies, and that the name of Bellcore not be
+  used in advertising or publicity pertaining to this
+  material without the specific, prior written permission
+  of an authorized representative of Bellcore.	BELLCORE
+  MAKES NO REPRESENTATIONS ABOUT THE ACCURACY OR SUITABILITY
+  OF THIS MATERIAL FOR ANY PURPOSE.  IT IS PROVIDED "AS IS",
   WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
 
 */
@@ -160,7 +160,7 @@ decode_base64(sv)
 
 	PREINIT:
 	STRLEN len;
-	register unsigned char *str = (unsigned char*)SvPV(sv, len);
+	register unsigned char *str = (unsigned char*)SvPVbyte(sv, len);
 	unsigned char const* end = str + len;
 	char *r;
 	unsigned char c[4];
@@ -177,7 +177,7 @@ decode_base64(sv)
 	while (str < end) {
 	    int i = 0;
             do {
-		unsigned char uc = index_64[*str++];
+		unsigned char uc = index_64[NATIVE_TO_ASCII(*str++)];
 		if (uc != INVALID)
 		    c[i++] = uc;
 
@@ -192,7 +192,7 @@ decode_base64(sv)
 		    break;
 		}
             } while (i < 4);
-	    
+	
 	    if (c[0] == EQ || c[1] == EQ) {
 		if (PL_dowarn) warn("Premature padding of base64 data");
 		break;
diff --git a/ext/MIME/Base64/QuotedPrint.pm b/ext/MIME/Base64/QuotedPrint.pm
index ccdee2bbfa..069f3226e9 100644
--- a/ext/MIME/Base64/QuotedPrint.pm
+++ b/ext/MIME/Base64/QuotedPrint.pm
@@ -112,4 +112,38 @@ sub decode_qp ($)
 *encode = \&encode_qp;
 *decode = \&decode_qp;
 
+# Methods for use as a PerlIO layer object
+
+sub PUSHED
+{
+ my ($class,$mode) = @_;
+ # When writing we buffer the data
+ my $write = '';
+ return bless \$write,$class;
+}
+
+sub FILL
+{
+ my ($obj,$fh) = @_;
+ my $line = <$fh>;
+ return (defined $line) ? decode_qp($line) : undef;
+ return undef;
+}
+
+sub WRITE
+{
+ my ($obj,$buf,$fh) = @_;
+ $$obj .= encode_qp($buf);
+ return length($buf);
+}
+
+sub FLUSH
+{
+ my ($obj,$fh) = @_;
+ print $fh $$obj or return -1;
+ $$obj = '';
+ return 0;
+}
+
+
 1;
diff --git a/t/lib/encode.t b/t/lib/encode.t
index d4a13eeeaa..ceeb422672 100644
--- a/t/lib/encode.t
+++ b/t/lib/encode.t
@@ -30,10 +30,10 @@ $cpy = $str;
 ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong");
 
 my $sym = Encode->getEncoding('symbol');
-my $uni = $sym->decode('a');
+my $uni = $sym->decode(encode(ascii => 'a'));
 ok("\N{alpha}",substr($uni,0,1),"alpha does not map to symbol 'a'");
 $str = $sym->encode("\N{Beta}");
-ok("B",substr($str,0,1),"Symbol 'B' does not map to Beta");
+ok("B",decode(ascii => substr($str,0,1)),"Symbol 'B' does not map to Beta");
 
 foreach my $enc (qw(symbol dingbats ascii),@encodings)
  {
diff --git a/t/lib/mimeb64.t b/t/lib/mimeb64.t
index 5bb78b12fe..b1225521f3 100644
--- a/t/lib/mimeb64.t
+++ b/t/lib/mimeb64.t
@@ -9,6 +9,16 @@ print "1..282\n";
 
 print "# Testing MIME::Base64-", $MIME::Base64::VERSION, "\n";
 
+BEGIN {
+ if (ord('A') != 193) {
+  *ASCII = sub { return $_[0] };
+ }
+ else {
+  require Encode;
+  *ASCII = sub { Encode::encode('ascii',$_[0]) };
+ }
+}
+
 $testno = 1;
 
 encodeTest();
@@ -23,27 +33,6 @@ sub encodeTest
     print "# encode test\n";
 
     my @encode_tests = (
-        [''    => ''],
-	['a'   => 'YQ=='],
-	['aa'  => 'YWE='],
-	['aaa' => 'YWFh'],
-
-	['aaa' => 'YWFh'],
-	['aaa' => 'YWFh'],
-	['aaa' => 'YWFh'],
-
-	["\000\377" => "AP8="],
-	["\377\000" => "/wA="],
-	["\000\000\000" => "AAAA"],
-
-	# from HTTP spec
-	['Aladdin:open sesame' => 'QWxhZGRpbjpvcGVuIHNlc2FtZQ=='],
-
-	['a' x 100 => 'YWFh' x 33 . 'YQ=='],
-
-	['Multipurpose Internet Mail Extensions: The Base64 Content-Transfer-Encoding is designed to represent sequences of octets in a form that is not humanly readable. '
-	=> "TXVsdGlwdXJwb3NlIEludGVybmV0IE1haWwgRXh0ZW5zaW9uczogVGhlIEJhc2U2NCBDb250ZW50LVRyYW5zZmVyLUVuY29kaW5nIGlzIGRlc2lnbmVkIHRvIHJlcHJlc2VudCBzZXF1ZW5jZXMgb2Ygb2N0ZXRzIGluIGEgZm9ybSB0aGF0IGlzIG5vdCBodW1hbmx5IHJlYWRhYmxlLiA="],
-
 	# All values
 	["\000" => "AA=="],
 	["\001" => "AQ=="],
@@ -301,6 +290,29 @@ sub encodeTest
 	["\375" => "/Q=="],
 	["\376" => "/g=="],
 	["\377" => "/w=="],
+
+	["\000\377" => "AP8="],
+	["\377\000" => "/wA="],
+	["\000\000\000" => "AAAA"],
+
+        [''    => ''],
+	[ASCII('a')   => 'YQ=='],
+	[ASCII('aa')  => 'YWE='],
+	[ASCII('aaa') => 'YWFh'],
+
+	[ASCII('aaa') => 'YWFh'],
+	[ASCII('aaa') => 'YWFh'],
+	[ASCII('aaa') => 'YWFh'],
+
+
+	# from HTTP spec
+	[ASCII('Aladdin:open sesame') => 'QWxhZGRpbjpvcGVuIHNlc2FtZQ=='],
+
+	[ASCII('a') x 100 => 'YWFh' x 33 . 'YQ=='],
+
+	[ASCII('Multipurpose Internet Mail Extensions: The Base64 Content-Transfer-Encoding is designed to represent sequences of octets in a form that is not humanly readable. ')
+	=> "TXVsdGlwdXJwb3NlIEludGVybmV0IE1haWwgRXh0ZW5zaW9uczogVGhlIEJhc2U2NCBDb250ZW50LVRyYW5zZmVyLUVuY29kaW5nIGlzIGRlc2lnbmVkIHRvIHJlcHJlc2VudCBzZXF1ZW5jZXMgb2Ygb2N0ZXRzIGluIGEgZm9ybSB0aGF0IGlzIG5vdCBodW1hbmx5IHJlYWRhYmxlLiA="],
+
     );
 
     for $test (@encode_tests) {
@@ -313,11 +325,12 @@ sub encodeTest
 	}
 	my $decoded = decode_base64($encoded);
 	if ($decoded ne $plain) {
-	    print "test $testno ($plain): expected $expected, got $encoded\n";
+	    print "test $testno ($encoded): expected $plain, got $decoded\n";
             print "not ";
 	}
 
-	# Try the old C versions too
+	if (ord('A') != 193) { # perl versions broken on EBCDIC
+	# Try the old Perl versions too
 	if ($encoded ne MIME::Base64::old_encode_base64($plain, '')) {
 	    print "old_encode_base64 give different result.\n";
 	    print "not ";
@@ -326,6 +339,7 @@ sub encodeTest
 	    print "old_decode_base64 give different result.\n";
 	    print "not ";
         }
+	}
 		
 	print "ok $testno\n";
 	$testno++;
@@ -339,17 +353,17 @@ sub decodeTest
     local $SIG{__WARN__} = sub { print $_[0] };  # avoid warnings on stderr
 
     my @decode_tests = (
-	['YWE='   => 'aa'],
-	[' YWE='  => 'aa'],
-	['Y WE='  => 'aa'],
-	['YWE= '  => 'aa'],
-	["Y\nW\r\nE=" => 'aa'],
+	['YWE='   => ASCII('aa')],
+	[' YWE='  =>  ASCII('aa')],
+	['Y WE='  =>  ASCII('aa')],
+	['YWE= '  =>  ASCII('aa')],
+	["Y\nW\r\nE=" =>  ASCII('aa')],
 
 	# These will generate some warnings
-        ['YWE=====' => 'aa'],    # extra padding
-	['YWE'      => 'aa'],    # missing padding
-        ['YWFh====' => 'aaa'],
-        ['YQ'       => 'a'],
+        ['YWE=====' =>  ASCII('aa')],    # extra padding
+	['YWE'      =>  ASCII('aa')],    # missing padding
+        ['YWFh====' =>  ASCII('aaa')],
+        ['YQ'       =>  ASCII('a')],
         ['Y'        => ''],
         [''         => ''],
         [undef()    => ''],
diff --git a/utf8.c b/utf8.c
index b95c7ad164..66d3fec81c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1280,14 +1280,34 @@ UV
 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
 {
     HV* hv = (HV*)SvRV(sv);
+    /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
+       then the "swatch" is a vec() for al the chars which start
+       with 0xAA..0xYY
+       So the key in the hash is length of encoded char -1
+     */
     U32 klen = UTF8SKIP(ptr) - 1;
-    U32 off = ptr[klen] & 127;  /* NB: 64 bit always 0 when len > 1 */
+    U32 off = ptr[klen];
     STRLEN slen;
-    STRLEN needents = (klen ? 64 : 128);
+    STRLEN needents;
     U8 *tmps;
     U32 bit;
     SV *retval;
 
+    if (klen == 0)
+     {
+      /* If char in invariant then swatch is for all the invariant chars
+       * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
+       */
+      needents = UTF_CONTINUATION_MARK;
+      off      = NATIVE_TO_UTF(ptr[klen]);
+     }
+    else
+     {
+      /* If char is encoded then swatch is for the prefix */
+      needents = (1 << UTF_ACCUMULATION_SHIFT);
+      off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
+     }
+
     /*
      * This single-entry cache saves about 1/3 of the utf8 overhead in test
      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
@@ -1337,7 +1357,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
 
 	    svp = hv_store(hv, (char*)ptr, klen, retval, 0);
 
-	    if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
+	    if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
 		Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
 	}
author	Jarkko Hietaniemi <jhi@iki.fi>	2001-03-27 20:24:31 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	2001-03-27 20:24:31 +0000
commit	60ad9f697ba2c0e755139d5afc80911e68d22894 (patch)
tree	eb8b461ed09d1aa3c50f0a74a94105d9fcfcc833
parent	521a8ec7fe1e7a81f500dfcdc4a8085b16566a30 (diff)
download	perl-60ad9f697ba2c0e755139d5afc80911e68d22894.tar.gz