11 files changed, 216 insertions, 80 deletions
diff --git a/MANIFEST b/MANIFEST
index 957246ee90..42743f795d 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -879,6 +879,8 @@ lib/dotsh.pl			Code to "dot" in a shell script
 lib/Dumpvalue.pm		Screen dump of perl values
 lib/Dumpvalue.t			See if Dumpvalue works
 lib/dumpvar.pl			A variable dumper
+lib/encoding.pm			Encoding of legacy data into Unicode
+lib/encoding.t			Test for the encoding pragma
 lib/English.pm			Readable aliases for short variables
 lib/English.t			See if English works
 lib/Env.pm			Map environment into ordinary variables
diff --git a/embedvar.h b/embedvar.h
index 2eb540791e..95550e6d60 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -231,6 +231,7 @@
 #define PL_dowarn		(PERL_GET_INTERP->Idowarn)
 #define PL_e_script		(PERL_GET_INTERP->Ie_script)
 #define PL_egid			(PERL_GET_INTERP->Iegid)
+#define PL_encoding		(PERL_GET_INTERP->Iencoding)
 #define PL_endav		(PERL_GET_INTERP->Iendav)
 #define PL_envgv		(PERL_GET_INTERP->Ienvgv)
 #define PL_errgv		(PERL_GET_INTERP->Ierrgv)
@@ -522,6 +523,7 @@
 #define PL_dowarn		(vTHX->Idowarn)
 #define PL_e_script		(vTHX->Ie_script)
 #define PL_egid			(vTHX->Iegid)
+#define PL_encoding		(vTHX->Iencoding)
 #define PL_endav		(vTHX->Iendav)
 #define PL_envgv		(vTHX->Ienvgv)
 #define PL_errgv		(vTHX->Ierrgv)
@@ -816,6 +818,7 @@
 #define PL_Idowarn		PL_dowarn
 #define PL_Ie_script		PL_e_script
 #define PL_Iegid		PL_egid
+#define PL_Iencoding		PL_encoding
 #define PL_Iendav		PL_endav
 #define PL_Ienvgv		PL_envgv
 #define PL_Ierrgv		PL_errgv
diff --git a/gv.c b/gv.c
index e99b15c58e..53af8a513b 100644
--- a/gv.c
+++ b/gv.c
@@ -887,7 +887,6 @@ Perl_gv_fetchpv(pTHX_ const char *nambeg, I32 add, I32 sv_type)
     case '\001':	/* $^A */
     case '\003':	/* $^C */
     case '\004':	/* $^D */
-    case '\005':	/* $^E */
     case '\006':	/* $^F */
     case '\010':	/* $^H */
     case '\011':	/* $^I, NOT \t in EBCDIC */
@@ -901,6 +900,11 @@ Perl_gv_fetchpv(pTHX_ const char *nambeg, I32 add, I32 sv_type)
 	    break;
 	sv_setiv(GvSV(gv), (IV)(IoFLAGS(GvIOp(PL_defoutgv)) & IOf_FLUSH) != 0);
 	goto magicalize;
+    case '\005':	/* $^E && $^ENCODING */
+	if (len > 1 && strNE(name, "\005NCODING"))
+	    break;
+	goto magicalize;
+
     case '\017':	/* $^O & $^OPEN */
 	if (len > 1 && strNE(name, "\017PEN"))
 	    break;
diff --git a/intrpvar.h b/intrpvar.h
index c224ff7c5d..63c9397c55 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -499,6 +499,8 @@ PERLVARI(Iknown_layers, PerlIO_list_t *,NULL)
 PERLVARI(Idef_layerlist, PerlIO_list_t *,NULL)
 #endif
 
+PERLVARI(Iencoding,	SV*, Nullsv)		/* character encoding */
+
 /* New variables must be added to the very end for binary compatibility.
  * XSUB.h provides wrapper functions via perlapi.h that make this
  * irrelevant, but not all code may be expected to #include XSUB.h. */
diff --git a/lib/encoding.pm b/lib/encoding.pm
new file mode 100644
index 0000000000..472a10a008
--- /dev/null
+++ b/lib/encoding.pm
@@ -0,0 +1,54 @@
+package encoding;
+
+use Encode;
+
+sub import {
+    my ($class, $name) = @_;
+    $name = $ENV{PERL_ENCODING} if @_ < 2;
+    my $enc = find_encoding($name);
+    unless (defined $enc) {
+	require Carp;
+	Carp::croak "Unknown encoding '$name'";
+    }
+    ${^ENCODING} = $enc;
+}
+
+=pod
+
+=head1 NAME
+
+encoding - pragma to control the conversion of legacy data into Unicode
+
+=head1 SYNOPSIS
+
+    use encoding "iso 8859-7";
+
+    $a = "\xDF";
+    $b = "\x{100}";
+
+    $c = $a . $b;
+
+    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
+    # The \xDF of ISO 8859-7 is \x{3af} in Unicode.
+
+=head1 DESCRIPTION
+
+Normally when legacy 8-bit data is converted to Unicode the data is
+expected to be Latin-1 (or EBCDIC in EBCDIC platforms).  With the
+encoding pragma you can change this default.
+
+The pragma is a per script, not a per block lexical.  Only the last
+'use encoding' seen matters.
+
+=head1 FUTURE POSSIBILITIES
+
+The C<\x..> and C<\0...> in literals and regular expressions are not
+affected by this pragma.  They probably should.
+
+=head1 SEE ALSO
+
+L<perlunicode>
+
+=cut
+
+1;
diff --git a/lib/encoding.t b/lib/encoding.t
new file mode 100644
index 0000000000..6e18d34fe2
--- /dev/null
+++ b/lib/encoding.t
@@ -0,0 +1,24 @@
+print "1..3\n";
+
+use encoding "latin1"; # ignored (overwritten by the next line)
+use encoding "greek";
+
+$a = "\xDF";
+$b = "\x{100}";
+
+my $c = $a . $b;
+
+# "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is
+# \x3AF in Unicode (GREEK SMALL LETTER IOTA WITH TONOS),
+# instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S)
+
+print "not " unless ord($c) == 0x3af;
+print "ok 1\n";
+
+print "not " unless length($c) == 2;
+print "ok 2\n";
+
+print "not " unless ord(substr($c, 1, 1)) == 0x100;
+print "ok 3\n";
+
+
diff --git a/mg.c b/mg.c
index 793035d9a8..3608e6ab63 100644
--- a/mg.c
+++ b/mg.c
@@ -519,62 +519,66 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg)
 #endif
 	break;
     case '\005':  /* ^E */
+	 if (*(mg->mg_ptr+1) == '\0') {
 #ifdef MACOS_TRADITIONAL
-	{
-	    char msg[256];
+	     {
+		  char msg[256];
 	
-	    sv_setnv(sv,(double)gMacPerl_OSErr);
-	    sv_setpv(sv, gMacPerl_OSErr ? GetSysErrText(gMacPerl_OSErr, msg) : "");	
-	}
+		  sv_setnv(sv,(double)gMacPerl_OSErr);
+		  sv_setpv(sv, gMacPerl_OSErr ? GetSysErrText(gMacPerl_OSErr, msg) : "");	
+	     }
 #else	
 #ifdef VMS
-	{
-#	    include <descrip.h>
-#	    include <starlet.h>
-	    char msg[255];
-	    $DESCRIPTOR(msgdsc,msg);
-	    sv_setnv(sv,(NV) vaxc$errno);
-	    if (sys$getmsg(vaxc$errno,&msgdsc.dsc$w_length,&msgdsc,0,0) & 1)
-		sv_setpvn(sv,msgdsc.dsc$a_pointer,msgdsc.dsc$w_length);
-	    else
-		sv_setpv(sv,"");
-	}
+	     {
+#	          include <descrip.h>
+#	          include <starlet.h>
+		  char msg[255];
+		  $DESCRIPTOR(msgdsc,msg);
+		  sv_setnv(sv,(NV) vaxc$errno);
+		  if (sys$getmsg(vaxc$errno,&msgdsc.dsc$w_length,&msgdsc,0,0) & 1)
+		       sv_setpvn(sv,msgdsc.dsc$a_pointer,msgdsc.dsc$w_length);
+		  else
+		       sv_setpv(sv,"");
+	     }
 #else
 #ifdef OS2
-	if (!(_emx_env & 0x200)) {	/* Under DOS */
-	    sv_setnv(sv, (NV)errno);
-	    sv_setpv(sv, errno ? Strerror(errno) : "");
-	} else {
-	    if (errno != errno_isOS2) {
-		int tmp = _syserrno();
-		if (tmp)	/* 2nd call to _syserrno() makes it 0 */
-		    Perl_rc = tmp;
-	    }
-	    sv_setnv(sv, (NV)Perl_rc);
-	    sv_setpv(sv, os2error(Perl_rc));
-	}
+	     if (!(_emx_env & 0x200)) {	/* Under DOS */
+		  sv_setnv(sv, (NV)errno);
+		  sv_setpv(sv, errno ? Strerror(errno) : "");
+	     } else {
+		  if (errno != errno_isOS2) {
+		       int tmp = _syserrno();
+		       if (tmp)	/* 2nd call to _syserrno() makes it 0 */
+			    Perl_rc = tmp;
+		  }
+		  sv_setnv(sv, (NV)Perl_rc);
+		  sv_setpv(sv, os2error(Perl_rc));
+	     }
 #else
 #ifdef WIN32
-	{
-	    DWORD dwErr = GetLastError();
-	    sv_setnv(sv, (NV)dwErr);
-	    if (dwErr)
-	    {
-		PerlProc_GetOSError(sv, dwErr);
-	    }
-	    else
-		sv_setpv(sv, "");
-	    SetLastError(dwErr);
-	}
+	     {
+		  DWORD dwErr = GetLastError();
+		  sv_setnv(sv, (NV)dwErr);
+		  if (dwErr)
+		  {
+		       PerlProc_GetOSError(sv, dwErr);
+		  }
+		  else
+		       sv_setpv(sv, "");
+		  SetLastError(dwErr);
+	     }
 #else
-	sv_setnv(sv, (NV)errno);
-	sv_setpv(sv, errno ? Strerror(errno) : "");
+	     sv_setnv(sv, (NV)errno);
+	     sv_setpv(sv, errno ? Strerror(errno) : "");
 #endif
 #endif
 #endif
 #endif
-	SvNOK_on(sv);	/* what a wonderful hack! */
-	break;
+	     SvNOK_on(sv);	/* what a wonderful hack! */
+	 }
+	 else if (strEQ(mg->mg_ptr+1, "NCODING"))
+	      sv_setsv(sv, PL_encoding);
+	 break;
     case '\006':		/* ^F */
 	sv_setiv(sv, (IV)PL_maxsysfd);
 	break;
@@ -625,7 +629,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg)
     case '\027':		/* ^W  & $^WARNING_BITS & ^WIDE_SYSTEM_CALLS */
 	if (*(mg->mg_ptr+1) == '\0')
 	    sv_setiv(sv, (IV)((PL_dowarn & G_WARN_ON) ? TRUE : FALSE));
-	else if (strEQ(mg->mg_ptr, "\027ARNING_BITS")) {
+	else if (strEQ(mg->mg_ptr+1, "ARNING_BITS")) {
 	    if (PL_compiling.cop_warnings == pWARN_NONE ||
 	        PL_compiling.cop_warnings == pWARN_STD)
 	    {
@@ -639,7 +643,7 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg)
 	    }
 	    SvPOK_only(sv);
 	}
-	else if (strEQ(mg->mg_ptr, "\027IDE_SYSTEM_CALLS"))
+	else if (strEQ(mg->mg_ptr+1, "IDE_SYSTEM_CALLS"))
 	    sv_setiv(sv, (IV)PL_widesyscalls);
 	break;
     case '1': case '2': case '3': case '4':
@@ -1742,25 +1746,32 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg)
 	DEBUG_x(dump_all());
 	break;
     case '\005':  /* ^E */
+	 if (*(mg->mg_ptr+1) == '\0') {
 #ifdef MACOS_TRADITIONAL
-	gMacPerl_OSErr = SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv);
+	      gMacPerl_OSErr = SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv);
 #else
 #  ifdef VMS
-	set_vaxc_errno(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv));
+	      set_vaxc_errno(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv));
 #  else
 #    ifdef WIN32
-	SetLastError( SvIV(sv) );
+	      SetLastError( SvIV(sv) );
 #    else
 #      ifdef OS2
-	os2_setsyserrno(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv));
+	      os2_setsyserrno(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv));
 #      else
-	/* will anyone ever use this? */
-	SETERRNO(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv), 4);
+	      /* will anyone ever use this? */
+	      SETERRNO(SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv), 4);
 #      endif
 #    endif
 #  endif
 #endif
-	break;
+	 }
+	 else if (strEQ(mg->mg_ptr+1, "NCODING")) {
+	     if (PL_encoding)
+	         sv_setsv(PL_encoding, sv);
+	     else
+	         PL_encoding = newSVsv(sv);
+	 }
     case '\006':	/* ^F */
 	PL_maxsysfd = SvIOK(sv) ? SvIVX(sv) : sv_2iv(sv);
 	break;
@@ -1811,7 +1822,7 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg)
 		    		| (i ? G_WARN_ON : G_WARN_OFF) ;
 	    }
 	}
-	else if (strEQ(mg->mg_ptr, "\027ARNING_BITS")) {
+	else if (strEQ(mg->mg_ptr+1, "ARNING_BITS")) {
 	    if ( ! (PL_dowarn & G_WARN_ALL_MASK)) {
 		if (!SvPOK(sv) && PL_localizing) {
 	            sv_setpvn(sv, WARN_NONEstring, WARNsize);
@@ -1845,7 +1856,7 @@ Perl_magic_set(pTHX_ SV *sv, MAGIC *mg)
 		}
 	    }
 	}
-	else if (strEQ(mg->mg_ptr, "\027IDE_SYSTEM_CALLS"))
+	else if (strEQ(mg->mg_ptr+1, "IDE_SYSTEM_CALLS"))
 	    PL_widesyscalls = SvTRUE(sv);
 	break;
     case '.':
diff --git a/perlapi.h b/perlapi.h
index 2811a44650..0592374d1a 100644
--- a/perlapi.h
+++ b/perlapi.h
@@ -199,6 +199,8 @@ END_EXTERN_C
 #define PL_e_script		(*Perl_Ie_script_ptr(aTHX))
 #undef  PL_egid
 #define PL_egid			(*Perl_Iegid_ptr(aTHX))
+#undef  PL_encoding
+#define PL_encoding		(*Perl_Iencoding_ptr(aTHX))
 #undef  PL_endav
 #define PL_endav		(*Perl_Iendav_ptr(aTHX))
 #undef  PL_envgv
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 0b52afa808..9205fdf77c 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -102,10 +102,11 @@ literal UTF-8 string constant in the program), character semantics
 apply; otherwise, byte semantics are in effect.  To force byte semantics
 on Unicode data, the C<bytes> pragma should be used.
 
-Notice that if you have a string with byte semantics and you then
-add character data into it, the bytes will be upgraded I<as if they
-were ISO 8859-1 (Latin-1)> (or if in EBCDIC, after a translation
-to ISO 8859-1).
+Notice that if you concatenate strings with byte semantics and strings
+with Unicode character data, the bytes will by default be upgraded
+I<as if they were ISO 8859-1 (Latin-1)> (or if in EBCDIC, after a
+translation to ISO 8859-1).  To change this, use the C<encoding>
+pragma, see L<encoding>.
 
 Under character semantics, many operations that formerly operated on
 bytes change to operating on characters.  For ASCII data this makes no
diff --git a/pod/perlvar.pod b/pod/perlvar.pod
index 64fc7fd654..d34daa6580 100644
--- a/pod/perlvar.pod
+++ b/pod/perlvar.pod
@@ -654,6 +654,11 @@ status; see L<perlvms/$?> for details.
 
 Also see L<Error Indicators>.
 
+=item ${^ENCODING}
+
+The encoding used to interpret native eight-bit encodings to Unicode,
+see L<encode>.  An opaque C<Encode::XS> object.
+
 =item $OS_ERROR
 
 =item $ERRNO
diff --git a/sv.c b/sv.c
index 5885b8e019..520734cf55 100644
--- a/sv.c
+++ b/sv.c
@@ -3302,27 +3302,54 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags)
 	sv_force_normal(sv);
     }
 
-    /* This function could be much more efficient if we had a FLAG in SVs
-     * to signal if there are any hibit chars in the PV.
-     * Given that there isn't make loop fast as possible
-     */
-    s = (U8 *) SvPVX(sv);
-    e = (U8 *) SvEND(sv);
-    t = s;
-    while (t < e) {
-	U8 ch = *t++;
-	if ((hibit = !NATIVE_IS_INVARIANT(ch)))
-	    break;
-    }
-    if (hibit) {
-	STRLEN len;
-
-	len = SvCUR(sv) + 1; /* Plus the \0 */
-	SvPVX(sv) = (char*)bytes_to_utf8((U8*)s, &len);
-	SvCUR(sv) = len - 1;
-	if (SvLEN(sv) != 0)
-	    Safefree(s); /* No longer using what was there before. */
-	SvLEN(sv) = len; /* No longer know the real size. */
+    if (PL_encoding) {
+         SV *uni;
+	 STRLEN len;
+	 char *s;
+	 dSP;
+	 ENTER;
+	 SAVETMPS;
+	 PUSHMARK(sp);
+	 EXTEND(SP, 3);
+	 XPUSHs(PL_encoding);
+	 XPUSHs(sv);
+	 XPUSHs(&PL_sv_yes);
+	 PUTBACK;
+	 call_method("decode", G_SCALAR);
+	 SPAGAIN;
+	 uni = POPs;
+	 PUTBACK;
+	 s = SvPVutf8(uni, len);
+	 if (s != SvPVX(sv)) {
+	      SvGROW(sv, len);
+	      Move(s, SvPVX(sv), len, char);
+	      SvCUR_set(sv, len);
+	 }
+	 FREETMPS;
+	 LEAVE;
+    } else { /* Assume Latin-1/EBCDIC */
+	 /* This function could be much more efficient if we
+	  * had a FLAG in SVs to signal if there are any hibit
+	  * chars in the PV.  Given that there isn't such a flag
+	  * make the loop as fast as possible. */
+	 s = (U8 *) SvPVX(sv);
+	 e = (U8 *) SvEND(sv);
+	 t = s;
+	 while (t < e) {
+	      U8 ch = *t++;
+	      if ((hibit = !NATIVE_IS_INVARIANT(ch)))
+		   break;
+	 }
+	 if (hibit) {
+	      STRLEN len;
+	      
+	      len = SvCUR(sv) + 1; /* Plus the \0 */
+	      SvPVX(sv) = (char*)bytes_to_utf8((U8*)s, &len);
+	      SvCUR(sv) = len - 1;
+	      if (SvLEN(sv) != 0)
+		   Safefree(s); /* No longer using what was there before. */
+	      SvLEN(sv) = len; /* No longer know the real size. */
+	 }
     }
     /* Mark as UTF-8 even if no hibit - saves scanning loop */
     SvUTF8_on(sv);
@@ -9827,6 +9854,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
 #ifdef VMS
     PL_statusvalue_vms	= proto_perl->Istatusvalue_vms;
 #endif
+    PL_encoding		= sv_dup(proto_perl->Iencoding, param);
 
     /* Clone the regex array */
     PL_regex_padav = newAV();