Batch of UTF-8 patches from Simon Cozens.

p4raw-id: //depot/perl@7075
author: Jarkko Hietaniemi <jhi@iki.fi> 2000-09-14 14:40:40 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2000-09-14 14:40:40 +0000
commit: 67e989fb549091286d76fd8d29f1ec03b9da175d (patch)
tree: b435bb5d55ee1fd063a1afe459e143ab597037ba
parent: de6193504aa249326a30bbe962866c18d77ea85d (diff)
download: perl-67e989fb549091286d76fd8d29f1ec03b9da175d.tar.gz
15 files changed, 214 insertions, 156 deletions
diff --git a/doop.c b/doop.c
index 77c7324e31..80cc0f6337 100644
--- a/doop.c
+++ b/doop.c
@@ -77,7 +77,7 @@ S_do_trans_simple(pTHX_ SV *sv)
 
         ulen = 1;
         /* Need to check this, otherwise 128..255 won't match */
-	c = utf8_to_uv(s, &ulen);
+	c = utf8_to_uv(s, &ulen, 0);
         if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
             matches++;
             if (ch < 0x80) 
@@ -125,7 +125,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
             I32 ulen;
             ulen = 1;
             if (hasutf)
-                c = utf8_to_uv(s,&ulen);
+                c = utf8_to_uv(s,&ulen, 0);
             else
                 c = *s;
             if (c < 0x100 && tbl[c] >= 0)
@@ -364,7 +364,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
 	    }
 	    else if (uv == none) {	/* "none" is unmapped character */
 		I32 ulen;
-		*d++ = (U8)utf8_to_uv(s, &ulen);
+		*d++ = (U8)utf8_to_uv(s, &ulen, 0);
 		s += ulen;
 		puv = 0xfeedface;
 		continue;
@@ -405,7 +405,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
 	    }
 	    else if (uv == none) {	/* "none" is unmapped character */
 		I32 ulen;
-		*d++ = (U8)utf8_to_uv(s, &ulen);
+		*d++ = (U8)utf8_to_uv(s, &ulen, 0);
 		s += ulen;
 		continue;
 	    }
@@ -969,10 +969,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
 	switch (optype) {
 	case OP_BIT_AND:
 	    while (lulen && rulen) {
-		luc = utf8_to_uv((U8*)lc, &ulen);
+		luc = utf8_to_uv((U8*)lc, &ulen, 0);
 		lc += ulen;
 		lulen -= ulen;
-		ruc = utf8_to_uv((U8*)rc, &ulen);
+		ruc = utf8_to_uv((U8*)rc, &ulen, 0);
 		rc += ulen;
 		rulen -= ulen;
 		duc = luc & ruc;
@@ -984,10 +984,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
 	    break;
 	case OP_BIT_XOR:
 	    while (lulen && rulen) {
-		luc = utf8_to_uv((U8*)lc, &ulen);
+		luc = utf8_to_uv((U8*)lc, &ulen, 0);
 		lc += ulen;
 		lulen -= ulen;
-		ruc = utf8_to_uv((U8*)rc, &ulen);
+		ruc = utf8_to_uv((U8*)rc, &ulen, 0);
 		rc += ulen;
 		rulen -= ulen;
 		duc = luc ^ ruc;
@@ -996,10 +996,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
 	    goto mop_up_utf;
 	case OP_BIT_OR:
 	    while (lulen && rulen) {
-		luc = utf8_to_uv((U8*)lc, &ulen);
+		luc = utf8_to_uv((U8*)lc, &ulen, 0);
 		lc += ulen;
 		lulen -= ulen;
-		ruc = utf8_to_uv((U8*)rc, &ulen);
+		ruc = utf8_to_uv((U8*)rc, &ulen, 0);
 		rc += ulen;
 		rulen -= ulen;
 		duc = luc | ruc;
diff --git a/embed.h b/embed.h
index 9bd72ad9b8..7e030a9199 100644
--- a/embed.h
+++ b/embed.h
@@ -2186,7 +2186,7 @@
 #define utf8_hop(a,b)		Perl_utf8_hop(aTHX_ a,b)
 #define utf8_to_bytes(a,b)	Perl_utf8_to_bytes(aTHX_ a,b)
 #define bytes_to_utf8(a,b)	Perl_bytes_to_utf8(aTHX_ a,b)
-#define utf8_to_uv(a,b)		Perl_utf8_to_uv(aTHX_ a,b)
+#define utf8_to_uv(a,b,c)	Perl_utf8_to_uv(aTHX_ a,b,c)
 #define uv_to_utf8(a,b)		Perl_uv_to_utf8(aTHX_ a,b)
 #define vivify_defelem(a)	Perl_vivify_defelem(aTHX_ a)
 #define vivify_ref(a,b)		Perl_vivify_ref(aTHX_ a,b)
diff --git a/embed.pl b/embed.pl
index c8e83f85ce..23214a3f26 100755
--- a/embed.pl
+++ b/embed.pl
@@ -2074,7 +2074,7 @@ Ap	|I32	|utf8_distance	|U8 *a|U8 *b
 Ap	|U8*	|utf8_hop	|U8 *s|I32 off
 ApM	|U8*	|utf8_to_bytes	|U8 *s|STRLEN *len
 ApM	|U8*	|bytes_to_utf8	|U8 *s|STRLEN *len
-Ap	|UV	|utf8_to_uv	|U8 *s|I32* retlen
+Ap	|UV	|utf8_to_uv	|U8 *s|I32* retlen|bool checking
 Ap	|U8*	|uv_to_utf8	|U8 *d|UV uv
 p	|void	|vivify_defelem	|SV* sv
 p	|void	|vivify_ref	|SV* sv|U32 to_what
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index cc0a86a117..5f4a77e6af 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -2,29 +2,104 @@
 #include "perl.h"
 #include "XSUB.h"
 
-MODULE = Encode		PACKAGE = Encode
+#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {   \
+                         Perl_croak("panic_unimplemented"); \
+                         } 
+UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
+UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
+
+void call_failure (SV *routine, U8* done, U8* dest, U8* orig);
+
+MODULE = Encode         PACKAGE = Encode
 
 PROTOTYPES: ENABLE
 
-SV *
+I32
 _bytes_to_utf8(sv, ...)
-	SV *	sv
+        SV *    sv
       CODE:
-	{
-	  SV * encoding = 2 ? ST(1) : Nullsv;
-	  RETVAL = &PL_sv_undef;
-	}
+        {
+          SV * encoding = items == 2 ? ST(1) : Nullsv;
+
+          if (encoding)
+            RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
+          else {
+            STRLEN len;
+            U8*    s = SvPV(sv, len);
+            U8*    converted;
+
+            converted = bytes_to_utf8(s, &len); /* This allocs */
+            sv_setpvn(sv, converted, len);
+            SvUTF8_on(sv); /* XXX Should we? */
+            Safefree(converted);                /* ... so free it */
+            RETVAL = len;
+          }
+        }
       OUTPUT:
-	RETVAL
+        RETVAL
 
-SV *
+I32
 _utf8_to_bytes(sv, ...)
-	SV *	sv
+        SV *    sv
       CODE:
-	{
-	  SV * to    = items > 1 ? ST(1) : Nullsv;
-	  SV * check = items > 2 ? ST(2) : Nullsv;
-	  RETVAL = &PL_sv_undef;
+        {
+          SV * to    = items > 1 ? ST(1) : Nullsv;
+          SV * check = items > 2 ? ST(2) : Nullsv;
+          
+          if (to)
+            RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
+          else {
+            U8 *s;
+            STRLEN len;
+            s = SvPV(sv, len);
+
+            if (SvTRUE(check)) {
+              /* Must do things the slow way */
+              U8 *dest;
+              U8 *src  = savepv(s); /* We need a copy to pass to check() */ 
+              U8 *send = s + len;
+
+              New(83, dest, len, U8); /* I think */
+
+              while (s < send) {
+                if (*s < 0x80)
+                  *dest++ = *s++;
+                else {
+                  I32 ulen;
+                  I32 byte;
+		  I32 uv = *s++;
+                  
+                  /* Have to do it all ourselves because of error routine,
+		     aargh. */
+		  if (!(uv & 0x40))
+		    goto failure;
+		  if      (!(uv & 0x20)) { ulen = 2;  uv &= 0x1f; }
+		  else if (!(uv & 0x10)) { ulen = 3;  uv &= 0x0f; }
+		  else if (!(uv & 0x08)) { ulen = 4;  uv &= 0x07; }
+		  else if (!(uv & 0x04)) { ulen = 5;  uv &= 0x03; }
+		  else if (!(uv & 0x02)) { ulen = 6;  uv &= 0x01; }
+		  else if (!(uv & 0x01)) { ulen = 7;  uv = 0; }
+		  else                   { ulen = 13; uv = 0; }
+		  
+		  /* Note change to utf8.c variable naming, for variety */
+		  while (ulen--) {
+		    if ((*s & 0xc0) != 0x80)
+		      goto failure;
+		    
+		    else
+		      uv = (uv << 6) | (*s++ & 0x3f);
+		  } 
+		  if (uv > 256) {
+		  failure:
+		    call_failure(check, s, dest, src);
+		    /* Now what happens? */
+		  }
+		  *dest++ = (U8)uv;
+               }
+               }
+	    } else
+	      RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
+	  }
 	}
       OUTPUT:
 	RETVAL
diff --git a/handy.h b/handy.h
index d82b1c609b..c240c4220c 100644
--- a/handy.h
+++ b/handy.h
@@ -448,21 +448,21 @@ Converts the specified character to lowercase.
 #define isPSXSPC_utf8(c)	(isSPACE_utf8(c) ||(c) == '\f')
 #define isBLANK_utf8(c)		isBLANK(c) /* could be wrong */
 
-#define isALNUM_LC_utf8(p)	isALNUM_LC_uni(utf8_to_uv(p, 0))
-#define isIDFIRST_LC_utf8(p)	isIDFIRST_LC_uni(utf8_to_uv(p, 0))
-#define isALPHA_LC_utf8(p)	isALPHA_LC_uni(utf8_to_uv(p, 0))
-#define isSPACE_LC_utf8(p)	isSPACE_LC_uni(utf8_to_uv(p, 0))
-#define isDIGIT_LC_utf8(p)	isDIGIT_LC_uni(utf8_to_uv(p, 0))
-#define isUPPER_LC_utf8(p)	isUPPER_LC_uni(utf8_to_uv(p, 0))
-#define isLOWER_LC_utf8(p)	isLOWER_LC_uni(utf8_to_uv(p, 0))
-#define isALNUMC_LC_utf8(p)	isALNUMC_LC_uni(utf8_to_uv(p, 0))
-#define isCNTRL_LC_utf8(p)	isCNTRL_LC_uni(utf8_to_uv(p, 0))
-#define isGRAPH_LC_utf8(p)	isGRAPH_LC_uni(utf8_to_uv(p, 0))
-#define isPRINT_LC_utf8(p)	isPRINT_LC_uni(utf8_to_uv(p, 0))
-#define isPUNCT_LC_utf8(p)	isPUNCT_LC_uni(utf8_to_uv(p, 0))
-#define toUPPER_LC_utf8(p)	toUPPER_LC_uni(utf8_to_uv(p, 0))
-#define toTITLE_LC_utf8(p)	toTITLE_LC_uni(utf8_to_uv(p, 0))
-#define toLOWER_LC_utf8(p)	toLOWER_LC_uni(utf8_to_uv(p, 0))
+#define isALNUM_LC_utf8(p)	isALNUM_LC_uni(utf8_to_uv(p, 0, 0))
+#define isIDFIRST_LC_utf8(p)	isIDFIRST_LC_uni(utf8_to_uv(p, 0, 0))
+#define isALPHA_LC_utf8(p)	isALPHA_LC_uni(utf8_to_uv(p, 0, 0))
+#define isSPACE_LC_utf8(p)	isSPACE_LC_uni(utf8_to_uv(p, 0, 0))
+#define isDIGIT_LC_utf8(p)	isDIGIT_LC_uni(utf8_to_uv(p, 0, 0))
+#define isUPPER_LC_utf8(p)	isUPPER_LC_uni(utf8_to_uv(p, 0, 0))
+#define isLOWER_LC_utf8(p)	isLOWER_LC_uni(utf8_to_uv(p, 0, 0))
+#define isALNUMC_LC_utf8(p)	isALNUMC_LC_uni(utf8_to_uv(p, 0, 0))
+#define isCNTRL_LC_utf8(p)	isCNTRL_LC_uni(utf8_to_uv(p, 0, 0))
+#define isGRAPH_LC_utf8(p)	isGRAPH_LC_uni(utf8_to_uv(p, 0, 0))
+#define isPRINT_LC_utf8(p)	isPRINT_LC_uni(utf8_to_uv(p, 0, 0))
+#define isPUNCT_LC_utf8(p)	isPUNCT_LC_uni(utf8_to_uv(p, 0, 0))
+#define toUPPER_LC_utf8(p)	toUPPER_LC_uni(utf8_to_uv(p, 0, 0))
+#define toTITLE_LC_utf8(p)	toTITLE_LC_uni(utf8_to_uv(p, 0, 0))
+#define toLOWER_LC_utf8(p)	toLOWER_LC_uni(utf8_to_uv(p, 0, 0))
 
 #define isPSXSPC_LC_utf8(c)	(isSPACE_LC_utf8(c) ||(c) == '\f')
 #define isBLANK_LC_utf8(c)	isBLANK(c) /* could be wrong */
diff --git a/op.c b/op.c
index 74d67e38f0..d24396a8d3 100644
--- a/op.c
+++ b/op.c
@@ -2656,7 +2656,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	    qsort(cp, i, sizeof(U8*), utf8compare);
 	    for (j = 0; j < i; j++) {
 		U8 *s = cp[j];
-		UV val = utf8_to_uv(s, &ulen);
+		UV val = utf8_to_uv(s, &ulen, 0);
 		s += ulen;
 		diff = val - nextmin;
 		if (diff > 0) {
@@ -2669,7 +2669,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 		    }
 	        }
 		if (*s == 0xff)
-		    val = utf8_to_uv(s+1, &ulen);
+		    val = utf8_to_uv(s+1, &ulen, 0);
 		if (val >= nextmin)
 		    nextmin = val + 1;
 	    }
@@ -2696,10 +2696,10 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	while (t < tend || tfirst <= tlast) {
 	    /* see if we need more "t" chars */
 	    if (tfirst > tlast) {
-		tfirst = (I32)utf8_to_uv(t, &ulen);
+		tfirst = (I32)utf8_to_uv(t, &ulen, 0);
 		t += ulen;
 		if (t < tend && *t == 0xff) {	/* illegal utf8 val indicates range */
-		    tlast = (I32)utf8_to_uv(++t, &ulen);
+		    tlast = (I32)utf8_to_uv(++t, &ulen, 0);
 		    t += ulen;
 		}
 		else
@@ -2709,10 +2709,10 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	    /* now see if we need more "r" chars */
 	    if (rfirst > rlast) {
 		if (r < rend) {
-		    rfirst = (I32)utf8_to_uv(r, &ulen);
+		    rfirst = (I32)utf8_to_uv(r, &ulen, 0);
 		    r += ulen;
 		    if (r < rend && *r == 0xff) {	/* illegal utf8 val indicates range */
-			rlast = (I32)utf8_to_uv(++r, &ulen);
+			rlast = (I32)utf8_to_uv(++r, &ulen, 0);
 			r += ulen;
 		    }
 		    else
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index b1feed3197..ca2ba7c834 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -3182,10 +3182,18 @@ Found in file handy.h
 
 =item U8 *s
 
-Returns true if first C<len> bytes of the given string form valid a UTF8
-string, false otherwise.
+Returns the character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character, and the pointer C<s> will be
+advanced to the end of the character.
 
-	is_utf8_string	U8 *s(STRLEN len)
+If C<s> does not point to a well-formed UTF8 character, the behaviour
+is dependent on the value of C<checking>: if this is true, it is
+assumed that the caller will raise a warning, and this function will
+set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
+warning is produced.
+
+	utf8_to_uv	U8 *s(I32 *retlen, I32 checking)
 
 =for hackers
 Found in file utf8.c
@@ -3195,7 +3203,7 @@ Found in file utf8.c
 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 updates len to contain the new length.
-Returns zero on failure leaving the string and len unchanged
+Returns zero on failure, setting C<len> to -1.
 
 	U8 *	utf8_to_bytes(U8 *s, STRLEN *len)
 
diff --git a/pp.c b/pp.c
index d5d5dd88b0..1c5a9638e9 100644
--- a/pp.c
+++ b/pp.c
@@ -2195,7 +2195,7 @@ PP(pp_ord)
     I32 retlen;
 
     if ((*tmps & 0x80) && DO_UTF8(tmpsv))
-	value = utf8_to_uv(tmps, &retlen);
+	value = utf8_to_uv(tmps, &retlen, 0);
     else
 	value = (UV)(*tmps & 255);
     XPUSHu(value);
@@ -2262,7 +2262,7 @@ PP(pp_ucfirst)
 	I32 ulen;
 	U8 tmpbuf[UTF8_MAXLEN];
 	U8 *tend;
-	UV uv = utf8_to_uv(s, &ulen);
+	UV uv = utf8_to_uv(s, &ulen, 0);
 
 	if (PL_op->op_private & OPpLOCALE) {
 	    TAINT;
@@ -2321,7 +2321,7 @@ PP(pp_lcfirst)
 	I32 ulen;
 	U8 tmpbuf[UTF8_MAXLEN];
 	U8 *tend;
-	UV uv = utf8_to_uv(s, &ulen);
+	UV uv = utf8_to_uv(s, &ulen, 0);
 
 	if (PL_op->op_private & OPpLOCALE) {
 	    TAINT;
@@ -2398,7 +2398,7 @@ PP(pp_uc)
 		TAINT;
 		SvTAINTED_on(TARG);
 		while (s < send) {
-		    d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, &ulen)));
+		    d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, &ulen, 0)));
 		    s += ulen;
 		}
 	    }
@@ -2472,7 +2472,7 @@ PP(pp_lc)
 		TAINT;
 		SvTAINTED_on(TARG);
 		while (s < send) {
-		    d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, &ulen)));
+		    d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, &ulen, 0)));
 		    s += ulen;
 		}
 	    }
@@ -3614,7 +3614,7 @@ PP(pp_unpack)
 		len = strend - s;
 	    if (checksum) {
 		while (len-- > 0 && s < strend) {
-		    auint = utf8_to_uv((U8*)s, &along);
+		    auint = utf8_to_uv((U8*)s, &along, 0);
 		    s += along;
 		    if (checksum > 32)
 			cdouble += (NV)auint;
@@ -3626,7 +3626,7 @@ PP(pp_unpack)
 		EXTEND(SP, len);
 		EXTEND_MORTAL(len);
 		while (len-- > 0 && s < strend) {
-		    auint = utf8_to_uv((U8*)s, &along);
+		    auint = utf8_to_uv((U8*)s, &along, 0);
 		    s += along;
 		    sv = NEWSV(37, 0);
 		    sv_setuv(sv, (UV)auint);
diff --git a/pp_ctl.c b/pp_ctl.c
index 8981bb88c2..3cc74e5281 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -2959,13 +2959,13 @@ PP(pp_require)
 	    U8 *s = (U8*)SvPVX(sv);
 	    U8 *end = (U8*)SvPVX(sv) + SvCUR(sv);
 	    if (s < end) {
-		rev = utf8_to_uv(s, &len);
+		rev = utf8_to_uv(s, &len, 0);
 		s += len;
 		if (s < end) {
-		    ver = utf8_to_uv(s, &len);
+		    ver = utf8_to_uv(s, &len, 0);
 		    s += len;
 		    if (s < end)
-			sver = utf8_to_uv(s, &len);
+			sver = utf8_to_uv(s, &len, 0);
 		}
 	    }
 	    if (PERL_REVISION < rev
diff --git a/proto.h b/proto.h
index ed08b20c8e..6a0229a7fc 100644
--- a/proto.h
+++ b/proto.h
@@ -817,7 +817,7 @@ PERL_CALLCONV I32	Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
 PERL_CALLCONV U8*	Perl_utf8_hop(pTHX_ U8 *s, I32 off);
 PERL_CALLCONV U8*	Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
 PERL_CALLCONV U8*	Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
-PERL_CALLCONV UV	Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen);
+PERL_CALLCONV UV	Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen, bool checking);
 PERL_CALLCONV U8*	Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
 PERL_CALLCONV void	Perl_vivify_defelem(pTHX_ SV* sv);
 PERL_CALLCONV void	Perl_vivify_ref(pTHX_ SV* sv, U32 to_what);
diff --git a/regcomp.c b/regcomp.c
index 723cbbe298..c60ab84409 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2881,7 +2881,7 @@ tryagain:
 		default:
 		  normal_default:
 		    if ((*p & 0xc0) == 0xc0 && UTF) {
-			ender = utf8_to_uv((U8*)p, &numlen);
+			ender = utf8_to_uv((U8*)p, &numlen, 0);
 			p += numlen;
 		    }
 		    else
@@ -3635,12 +3635,12 @@ S_regclassutf8(pTHX)
 	namedclass = OOB_NAMEDCLASS;
 	if (!range)
 	    rangebegin = PL_regcomp_parse;
-	value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
+	value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen, 0);
 	PL_regcomp_parse += numlen;
 	if (value == '[')
 	    namedclass = regpposixcc(value);
 	else if (value == '\\') {
-	    value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
+	    value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, &numlen, 0);
 	    PL_regcomp_parse += numlen;
 	    /* Some compilers cannot handle switching on 64-bit integer
 	     * values, therefore value cannot be an UV.  Yes, this will
diff --git a/regexec.c b/regexec.c
index 6401710b99..990791bd6b 100644
--- a/regexec.c
+++ b/regexec.c
@@ -914,7 +914,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	    PL_reg_flags |= RF_tainted;
 	    /* FALL THROUGH */
 	case BOUNDUTF8:
-	    tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
+	    tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0, 0) : '\n';
 	    tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
 	    while (s < strend) {
 		if (tmp == !(OP(c) == BOUNDUTF8 ?
@@ -950,7 +950,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 	    PL_reg_flags |= RF_tainted;
 	    /* FALL THROUGH */
 	case NBOUNDUTF8:
-	    tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
+	    tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0, 0) : '\n';
 	    tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
 	    while (s < strend) {
 		if (tmp == !(OP(c) == NBOUNDUTF8 ?
@@ -1995,7 +1995,7 @@ S_regmatch(pTHX_ regnode *prog)
 		while (s < e) {
 		    if (l >= PL_regeol)
 			sayNO;
-		    if (utf8_to_uv((U8*)s, 0) != (c1 ?
+		    if (utf8_to_uv((U8*)s, 0, 0) != (c1 ?
 						  toLOWER_utf8((U8*)l) :
 						  toLOWER_LC_utf8((U8*)l)))
 		    {
@@ -2133,7 +2133,7 @@ S_regmatch(pTHX_ regnode *prog)
 	case NBOUNDUTF8:
 	    /* was last char in word? */
 	    ln = (locinput != PL_regbol)
-		? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
+		? utf8_to_uv(reghop((U8*)locinput, -1), 0, 0) : PL_regprev;
 	    if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
 		ln = isALNUM_uni(ln);
 		n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
diff --git a/sv.c b/sv.c
index 4da49cc86d..2c45cae0bd 100644
--- a/sv.c
+++ b/sv.c
@@ -2398,6 +2398,7 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
 {
     int hicount;
     char *c;
+    char *s;
 
     if (!sv || !SvPOK(sv) || SvUTF8(sv))
 	return;
@@ -2406,30 +2407,16 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
      * to signal if there are any hibit chars in the string
      */
     hicount = 0;
-    for (c = SvPVX(sv); c < SvEND(sv); c++) {
+    for (c = s = SvPVX(sv); c < SvEND(sv); c++) {
 	if (*c & 0x80)
 	    hicount++;
     }
 
     if (hicount) {
-	char *src, *dst;
-	SvGROW(sv, SvCUR(sv) + hicount + 1);
-
-	src = SvEND(sv) - 1;
-	SvCUR_set(sv, SvCUR(sv) + hicount);
-	dst = SvEND(sv) - 1;
-
-	while (src < dst) {
-	    if (*src & 0x80) {
-		dst--;
-		uv_to_utf8((U8*)dst, (U8)*src--);
-		dst--;
-	    }
-	    else {
-		*dst-- = *src--;
-	    }
-	}
-
+	STRLEN len = SvCUR(sv) + 1; /* Plus the \0 */
+	SvPVX(sv) = bytes_to_utf8(s, &len);
+	SvCUR(sv) = len - 1;
+	Safefree(s); /* No longer using what was there before */
 	SvUTF8_on(sv);
     }
 }
@@ -2450,46 +2437,14 @@ Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok)
 {
     if (SvPOK(sv) && SvUTF8(sv)) {
         char *c = SvPVX(sv);
-        char *first_hi = 0;
-        /* need to figure out if this is possible at all first */
-        while (c < SvEND(sv)) {
-            if (*c & 0x80) {
-                I32 len;
-                UV uv = utf8_to_uv((U8*)c, &len);
-                if (uv >= 256) {
-		    if (fail_ok)
-			return FALSE;
-		    else {
-			/* XXX might want to make a callback here instead */
-			Perl_croak(aTHX_ "Big byte");
-		    }
-		}
-                if (!first_hi)
-                    first_hi = c;
-                c += len;
-            }
-            else {
-                c++;
-            }
-        }
-
-        if (first_hi) {
-            char *src = first_hi;
-            char *dst = first_hi;
-            while (src < SvEND(sv)) {
-                if (*src & 0x80) {
-                    I32 len;
-                    U8 u = (U8)utf8_to_uv((U8*)src, &len);
-                    *dst++ = u;
-                    src += len;
-                }
-                else {
-                    *dst++ = *src++;
-                }
-            }
-            SvCUR_set(sv, dst - SvPVX(sv));
-        }
-        SvUTF8_off(sv);
+	STRLEN len = SvCUR(sv);
+        if (!utf8_to_bytes(c, &len)) {
+	    if (fail_ok)
+		return FALSE;
+	    else
+		Perl_croak("big byte");
+	}
+	SvCUR(sv) = len - 1;
     }
     return TRUE;
 }
@@ -2523,24 +2478,15 @@ Perl_sv_utf8_decode(pTHX_ register SV *sv)
          * we want to make sure everything inside is valid utf8 first.
          */
         c = SvPVX(sv);
+	if (!is_utf8_string(c,SvCUR(c)+1))
+	    return FALSE;
+
         while (c < SvEND(sv)) {
-            if (*c & 0x80) {
-                I32 len;
-                (void)utf8_to_uv((U8*)c, &len);
-                if (len == 1) {
-                    /* bad utf8 */
-                    return FALSE;
-                }
-                c += len;
-                has_utf = TRUE;
-            }
-            else {
-                c++;
-            }
+            if (*c++ & 0x80) {
+		SvUTF8_on(sv);
+		break;
+	    }
         }
-
-        if (has_utf)
-            SvUTF8_on(sv);
     }
     return TRUE;
 }
@@ -6373,7 +6319,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 		    break;
 		}
 		if (utf)
-		    iv = (IV)utf8_to_uv(vecstr, &ulen);
+		    iv = (IV)utf8_to_uv(vecstr, &ulen, 0);
 		else {
 		    iv = *vecstr;
 		    ulen = 1;
@@ -6455,7 +6401,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 		    break;
 		}
 		if (utf)
-		    uv = utf8_to_uv(vecstr, &ulen);
+		    uv = utf8_to_uv(vecstr, &ulen, 0);
 		else {
 		    uv = *vecstr;
 		    ulen = 1;
diff --git a/toke.c b/toke.c
index e5f737b8c7..31f5f0a903 100644
--- a/toke.c
+++ b/toke.c
@@ -812,7 +812,7 @@ Perl_str_to_version(pTHX_ SV *sv)
 	I32 skip;
 	UV n;
 	if (utf)
-	    n = utf8_to_uv((U8*)start, &skip);
+	    n = utf8_to_uv((U8*)start, &skip, 0);
 	else {
 	    n = *(U8*)start;
 	    skip = 1;
@@ -1323,7 +1323,7 @@ S_scan_const(pTHX_ char *start)
 	/* (now in tr/// code again) */
 
 	if (*s & 0x80 && thisutf) {
-	   (void)utf8_to_uv((U8*)s, &len);
+	   (void)utf8_to_uv((U8*)s, &len, 0);
 	   if (len == 1) {
 	       /* illegal UTF8, make it valid */
 	       char *old_pvx = SvPVX(sv);
diff --git a/utf8.c b/utf8.c
index a9600e95b9..d97a8b003d 100644
--- a/utf8.c
+++ b/utf8.c
@@ -158,8 +158,25 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
     return 1;
 }
 
+/*
+=for apidoc Am|utf8_to_uv|U8 *s|I32 *retlen|I32 checking
+
+Returns the character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character, and the pointer C<s> will be
+advanced to the end of the character.
+
+If C<s> does not point to a well-formed UTF8 character, the behaviour
+is dependent on the value of C<checking>: if this is true, it is
+assumed that the caller will raise a warning, and this function will
+set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
+warning is produced.
+
+=cut
+*/
+
 UV
-Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
+Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen, bool checking)
 {
     UV uv = *s;
     int len;
@@ -170,6 +187,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
     }
     if (!(uv & 0x40)) {
         dTHR;
+	if (checking && retlen) {
+	    *retlen = -1;
+	    return 0;
+	}
+
 	if (ckWARN_d(WARN_UTF8))     
 	    Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
 	if (retlen)
@@ -192,6 +214,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
     while (len--) {
 	if ((*s & 0xc0) != 0x80) {
             dTHR;
+	    if (checking && retlen) {
+		*retlen = -1;
+		return 0;
+            }
+
 	    if (ckWARN_d(WARN_UTF8))     
 	        Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
 	    if (retlen)
@@ -253,7 +280,7 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 updates len to contain the new length.
-Returns zero on failure leaving the string and len unchanged
+Returns zero on failure, setting C<len> to -1.
 
 =cut
 */
@@ -273,8 +300,10 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
     while (s < send) {
 	U8 c = *s++;
         if (c >= 0x80 &&
-	    ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2)))
+	    ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
+	    *len = -1;
 	    return 0;    
+	}
     }
     s = save;
     while (s < send) {
@@ -282,7 +311,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
             *d++ = *s++;
         else {
             I32 ulen;
-            *d++ = (U8)utf8_to_uv(s, &ulen);
+            *d++ = (U8)utf8_to_uv(s, &ulen, 0);
             s += ulen;
         }
     }
@@ -810,7 +839,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p)
     if (!PL_utf8_toupper)
 	PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_toupper, p);
-    return uv ? uv : utf8_to_uv(p,0);
+    return uv ? uv : utf8_to_uv(p,0,0);
 }
 
 UV
@@ -821,7 +850,7 @@ Perl_to_utf8_title(pTHX_ U8 *p)
     if (!PL_utf8_totitle)
 	PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_totitle, p);
-    return uv ? uv : utf8_to_uv(p,0);
+    return uv ? uv : utf8_to_uv(p,0,0);
 }
 
 UV
@@ -832,7 +861,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p)
     if (!PL_utf8_tolower)
 	PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
     uv = swash_fetch(PL_utf8_tolower, p);
-    return uv ? uv : utf8_to_uv(p,0);
+    return uv ? uv : utf8_to_uv(p,0,0);
 }
 
 /* a "swash" is a swatch hash */
@@ -922,7 +951,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
 	    PUSHMARK(SP);
 	    EXTEND(SP,3);
 	    PUSHs((SV*)sv);
-	    PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, 0) & ~(needents - 1))));
+	    PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, 0, 0) & ~(needents - 1))));
 	    PUSHs(sv_2mortal(newSViv(needents)));
 	    PUTBACK;
 	    if (call_method("SWASHGET", G_SCALAR))
author	Jarkko Hietaniemi <jhi@iki.fi>	2000-09-14 14:40:40 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	2000-09-14 14:40:40 +0000
commit	67e989fb549091286d76fd8d29f1ec03b9da175d (patch)
tree	b435bb5d55ee1fd063a1afe459e143ab597037ba
parent	de6193504aa249326a30bbe962866c18d77ea85d (diff)
download	perl-67e989fb549091286d76fd8d29f1ec03b9da175d.tar.gz