runtime now looks at the SVf_UTF8 bit on the SV to decide

whether to use widechar semantics; lexer and RE engine continue to need "use utf8" to enable unicode awareness in literals and patterns (TODO: this needs to be fixed); $1 et al are marked SvUTF8 if the pattern was compiled for utf8 (TODO: propagating it from the data is probably better) p4raw-id: //depot/perl@4930
author: Gurusamy Sarathy <gsar@cpan.org> 2000-01-31 04:57:42 +0000
committer: Gurusamy Sarathy <gsar@cpan.org> 2000-01-31 04:57:42 +0000
commit: 7e2040f0b7c6fc88ec07b6e169aa2f75fc0130a4 (patch)
tree: de43e349e9f70e27ef30b2a0de9de2df628cc1c3
parent: 8004f2ac219abdd8660c02a4a46ed97695dc379d (diff)
download: perl-7e2040f0b7c6fc88ec07b6e169aa2f75fc0130a4.tar.gz
16 files changed, 178 insertions, 129 deletions
diff --git a/doop.c b/doop.c
index 990898deba..26ac87dbca 100644
--- a/doop.c
+++ b/doop.c
@@ -933,7 +933,7 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv)
     s = SvPV(sv, len);
     if (len && !SvPOK(sv))
 	s = SvPV_force(sv, len);
-    if (IN_UTF8) {
+    if (DO_UTF8(sv)) {
 	if (s && len) {
 	    char *send = s + len;
 	    char *start = s;
@@ -946,12 +946,12 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv)
 	    *s = '\0';
 	    SvCUR_set(sv, s - start);
 	    SvNIOK_off(sv);
+	    SvUTF8_on(astr);
 	}
 	else
 	    sv_setpvn(astr, "", 0);
     }
-    else
-    if (s && len) {
+    else if (s && len) {
 	s += --len;
 	sv_setpvn(astr, s, 1);
 	*s = '\0';
@@ -961,7 +961,7 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv)
     else
 	sv_setpvn(astr, "", 0);
     SvSETMAGIC(sv);
-} 
+}
 
 I32
 Perl_do_chomp(pTHX_ register SV *sv)
diff --git a/gv.c b/gv.c
index 5a91c08fab..acd85012e7 100644
--- a/gv.c
+++ b/gv.c
@@ -581,9 +581,7 @@ Perl_gv_fetchpv(pTHX_ const char *nambeg, I32 add, I32 sv_type)
     /* No stash in name, so see how we can default */
 
     if (!stash) {
-	if (isIDFIRST(*name)
-	    || (IN_UTF8 && ((*name & 0xc0) == 0xc0) && isIDFIRST_utf8((U8*)name)))
-	{
+	if (isIDFIRST_lazy(name)) {
 	    bool global = FALSE;
 
 	    if (isUPPER(*name)) {
diff --git a/mg.c b/mg.c
index edabb11562..3ba3d08883 100644
--- a/mg.c
+++ b/mg.c
@@ -609,6 +609,10 @@ Perl_magic_get(pTHX_ SV *sv, MAGIC *mg)
 			PL_tainted = FALSE;
 		    }
 		    sv_setpvn(sv, s, i);
+		    if ((PL_curpm->op_pmdynflags & PMdf_UTF8) && !IN_BYTE)
+			SvUTF8_on(sv);
+		    else
+			SvUTF8_off(sv);
 		    if (PL_tainting)
 			PL_tainted = (was_tainted || RX_MATCH_TAINTED(rx));
 		    break;
@@ -1286,7 +1290,7 @@ Perl_magic_getpos(pTHX_ SV *sv, MAGIC *mg)
 	if (mg && mg->mg_len >= 0) {
 	    dTHR;
 	    I32 i = mg->mg_len;
-	    if (IN_UTF8)
+	    if (DO_UTF8(lsv))
 		sv_pos_b2u(lsv, &i);
 	    sv_setiv(sv, i + PL_curcop->cop_arybase);
 	    return 0;
@@ -1323,7 +1327,7 @@ Perl_magic_setpos(pTHX_ SV *sv, MAGIC *mg)
 
     pos = SvIV(sv) - PL_curcop->cop_arybase;
 
-    if (IN_UTF8) {
+    if (DO_UTF8(lsv)) {
 	ulen = sv_len_utf8(lsv);
 	if (ulen)
 	    len = ulen;
diff --git a/op.c b/op.c
index 02edc8d9f1..fdfdf27b93 100644
--- a/op.c
+++ b/op.c
@@ -2867,6 +2867,8 @@ Perl_pmruntime(pTHX_ OP *o, OP *expr, OP *repl)
 	    p = SvPV(pat, plen);
 	    pm->op_pmflags |= PMf_SKIPWHITE;
 	}
+	else if (DO_UTF8(pat))
+	    pm->op_pmdynflags |= PMdf_UTF8;
 	pm->op_pmregexp = CALLREGCOMP(aTHX_ p, p + plen, pm);
 	if (strEQ("\\s+", pm->op_pmregexp->precomp))
 	    pm->op_pmflags |= PMf_WHITE;
diff --git a/op.h b/op.h
index fc38b9cc6b..8bc82769e7 100644
--- a/op.h
+++ b/op.h
@@ -234,6 +234,7 @@ struct pmop {
 
 #define PMdf_USED	0x01		/* pm has been used once already */
 #define PMdf_TAINTED	0x02		/* pm compiled from tainted pattern */
+#define PMdf_UTF8	0x04		/* pm compiled from utf8 data */
 
 #define PMf_RETAINT	0x0001		/* taint $1 etc. if target tainted */
 #define PMf_ONCE	0x0002		/* use pattern only once per reset */
diff --git a/pp.c b/pp.c
index 13fbec8faa..45654a9445 100644
--- a/pp.c
+++ b/pp.c
@@ -389,7 +389,7 @@ PP(pp_pos)
 	    mg = mg_find(sv, 'g');
 	    if (mg && mg->mg_len >= 0) {
 		I32 i = mg->mg_len;
-		if (IN_UTF8)
+		if (DO_UTF8(sv))
 		    sv_pos_b2u(sv, &i);
 		PUSHi(i + PL_curcop->cop_arybase);
 		RETURN;
@@ -1426,7 +1426,7 @@ PP(pp_negate)
 		sv_setsv(TARG, sv);
 		*SvPV_force(TARG, len) = *s == '-' ? '+' : '-';
 	    }
-	    else if (IN_UTF8 && *(U8*)s >= 0xc0 && isIDFIRST_utf8((U8*)s)) {
+	    else if (DO_UTF8(sv) && *(U8*)s >= 0xc0 && isIDFIRST_utf8((U8*)s)) {
 		sv_setpvn(TARG, "-", 1);
 		sv_catsv(TARG, sv);
 	    }
@@ -1930,13 +1930,12 @@ PP(pp_oct)
 PP(pp_length)
 {
     djSP; dTARGET;
+    SV *sv = TOPs;
 
-    if (IN_UTF8) {
-	SETi( sv_len_utf8(TOPs) );
-	RETURN;
-    }
-
-    SETi( sv_len(TOPs) );
+    if (DO_UTF8(sv))
+	SETi(sv_len_utf8(sv));
+    else
+	SETi(sv_len(sv));
     RETURN;
 }
 
@@ -1957,6 +1956,7 @@ PP(pp_substr)
     STRLEN repl_len;
 
     SvTAINTED_off(TARG);			/* decontaminate */
+    SvUTF8_off(TARG);				/* decontaminate */
     if (MAXARG > 2) {
 	if (MAXARG > 3) {
 	    sv = POPs;
@@ -1968,7 +1968,7 @@ PP(pp_substr)
     sv = POPs;
     PUTBACK;
     tmps = SvPV(sv, curlen);
-    if (IN_UTF8) {
+    if (DO_UTF8(sv)) {
         utfcurlen = sv_len_utf8(sv);
 	if (utfcurlen == curlen)
 	    utfcurlen = 0;
@@ -2017,8 +2017,10 @@ PP(pp_substr)
 	RETPUSHUNDEF;
     }
     else {
-        if (utfcurlen)
+        if (utfcurlen) {
 	    sv_pos_u2b(sv, &pos, &rem);
+	    SvUTF8_on(TARG);
+	}
 	tmps += pos;
 	sv_setpvn(TARG, tmps, rem);
 	if (repl)
@@ -2106,7 +2108,7 @@ PP(pp_index)
     little = POPs;
     big = POPs;
     tmps = SvPV(big, biglen);
-    if (IN_UTF8 && offset > 0)
+    if (offset > 0 && DO_UTF8(big))
 	sv_pos_u2b(big, &offset, 0);
     if (offset < 0)
 	offset = 0;
@@ -2117,7 +2119,7 @@ PP(pp_index)
 	retval = -1;
     else
 	retval = tmps2 - tmps;
-    if (IN_UTF8 && retval > 0)
+    if (retval > 0 && DO_UTF8(big))
 	sv_pos_b2u(big, &retval);
     PUSHi(retval + arybase);
     RETURN;
@@ -2145,7 +2147,7 @@ PP(pp_rindex)
     if (MAXARG < 3)
 	offset = blen;
     else {
-	if (IN_UTF8 && offset > 0)
+	if (offset > 0 && DO_UTF8(big))
 	    sv_pos_u2b(big, &offset, 0);
 	offset = offset - arybase + llen;
     }
@@ -2158,7 +2160,7 @@ PP(pp_rindex)
 	retval = -1;
     else
 	retval = tmps2 - tmps;
-    if (IN_UTF8 && retval > 0)
+    if (retval > 0 && DO_UTF8(big))
 	sv_pos_b2u(big, &retval);
     PUSHi(retval + arybase);
     RETURN;
@@ -2179,10 +2181,11 @@ PP(pp_ord)
     djSP; dTARGET;
     UV value;
     STRLEN n_a;
-    U8 *tmps = (U8*)POPpx;
+    SV *tmpsv = POPs;
+    U8 *tmps = (U8*)SvPVx(tmpsv,n_a);
     I32 retlen;
 
-    if (IN_UTF8 && (*tmps & 0x80))
+    if ((*tmps & 0x80) && DO_UTF8(tmpsv))
 	value = utf8_to_uv(tmps, &retlen);
     else
 	value = (UV)(*tmps & 255);
@@ -2196,14 +2199,16 @@ PP(pp_chr)
     char *tmps;
     U32 value = POPu;
 
+    SvUTF8_off(TARG);				/* decontaminate */
     (void)SvUPGRADE(TARG,SVt_PV);
 
-    if (IN_UTF8 && value >= 128) {
+    if (value >= 128 && !IN_BYTE) {
 	SvGROW(TARG,8);
 	tmps = SvPVX(TARG);
 	tmps = (char*)uv_to_utf8((U8*)tmps, (UV)value);
 	SvCUR_set(TARG, tmps - SvPVX(TARG));
 	*tmps = '\0';
+	SvUTF8_on(TARG);
 	(void)SvPOK_only(TARG);
 	XPUSHs(TARG);
 	RETURN;
@@ -2245,7 +2250,7 @@ PP(pp_ucfirst)
     register U8 *s;
     STRLEN slen;
 
-    if (IN_UTF8 && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
+    if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
 	I32 ulen;
 	U8 tmpbuf[10];
 	U8 *tend;
@@ -2265,6 +2270,7 @@ PP(pp_ucfirst)
 	    dTARGET;
 	    sv_setpvn(TARG, (char*)tmpbuf, tend - tmpbuf);
 	    sv_catpvn(TARG, (char*)(s + ulen), slen - ulen);
+	    SvUTF8_on(TARG);
 	    SETs(TARG);
 	}
 	else {
@@ -2275,6 +2281,7 @@ PP(pp_ucfirst)
     else {
 	if (!SvPADTMP(sv) || SvREADONLY(sv)) {
 	    dTARGET;
+	    SvUTF8_off(TARG);				/* decontaminate */
 	    sv_setsv(TARG, sv);
 	    sv = TARG;
 	    SETs(sv);
@@ -2302,7 +2309,7 @@ PP(pp_lcfirst)
     register U8 *s;
     STRLEN slen;
 
-    if (IN_UTF8 && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
+    if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
 	I32 ulen;
 	U8 tmpbuf[10];
 	U8 *tend;
@@ -2322,6 +2329,7 @@ PP(pp_lcfirst)
 	    dTARGET;
 	    sv_setpvn(TARG, (char*)tmpbuf, tend - tmpbuf);
 	    sv_catpvn(TARG, (char*)(s + ulen), slen - ulen);
+	    SvUTF8_on(TARG);
 	    SETs(TARG);
 	}
 	else {
@@ -2332,6 +2340,7 @@ PP(pp_lcfirst)
     else {
 	if (!SvPADTMP(sv) || SvREADONLY(sv)) {
 	    dTARGET;
+	    SvUTF8_off(TARG);				/* decontaminate */
 	    sv_setsv(TARG, sv);
 	    sv = TARG;
 	    SETs(sv);
@@ -2346,7 +2355,6 @@ PP(pp_lcfirst)
 	    else
 		*s = toLOWER(*s);
 	}
-	SETs(sv);
     }
     if (SvSMAGICAL(sv))
 	mg_set(sv);
@@ -2360,7 +2368,7 @@ PP(pp_uc)
     register U8 *s;
     STRLEN len;
 
-    if (IN_UTF8) {
+    if (DO_UTF8(sv)) {
 	dTARGET;
 	I32 ulen;
 	register U8 *d;
@@ -2368,6 +2376,7 @@ PP(pp_uc)
 
 	s = (U8*)SvPV(sv,len);
 	if (!len) {
+	    SvUTF8_off(TARG);				/* decontaminate */
 	    sv_setpvn(TARG, "", 0);
 	    SETs(TARG);
 	}
@@ -2392,6 +2401,7 @@ PP(pp_uc)
 		}
 	    }
 	    *d = '\0';
+	    SvUTF8_on(TARG);
 	    SvCUR_set(TARG, d - (U8*)SvPVX(TARG));
 	    SETs(TARG);
 	}
@@ -2399,6 +2409,7 @@ PP(pp_uc)
     else {
 	if (!SvPADTMP(sv) || SvREADONLY(sv)) {
 	    dTARGET;
+	    SvUTF8_off(TARG);				/* decontaminate */
 	    sv_setsv(TARG, sv);
 	    sv = TARG;
 	    SETs(sv);
@@ -2431,7 +2442,7 @@ PP(pp_lc)
     register U8 *s;
     STRLEN len;
 
-    if (IN_UTF8) {
+    if (DO_UTF8(sv)) {
 	dTARGET;
 	I32 ulen;
 	register U8 *d;
@@ -2439,6 +2450,7 @@ PP(pp_lc)
 
 	s = (U8*)SvPV(sv,len);
 	if (!len) {
+	    SvUTF8_off(TARG);				/* decontaminate */
 	    sv_setpvn(TARG, "", 0);
 	    SETs(TARG);
 	}
@@ -2463,6 +2475,7 @@ PP(pp_lc)
 		}
 	    }
 	    *d = '\0';
+	    SvUTF8_on(TARG);
 	    SvCUR_set(TARG, d - (U8*)SvPVX(TARG));
 	    SETs(TARG);
 	}
@@ -2470,6 +2483,7 @@ PP(pp_lc)
     else {
 	if (!SvPADTMP(sv) || SvREADONLY(sv)) {
 	    dTARGET;
+	    SvUTF8_off(TARG);				/* decontaminate */
 	    sv_setsv(TARG, sv);
 	    sv = TARG;
 	    SETs(sv);
@@ -2504,11 +2518,12 @@ PP(pp_quotemeta)
     register char *s = SvPV(sv,len);
     register char *d;
 
+    SvUTF8_off(TARG);				/* decontaminate */
     if (len) {
 	(void)SvUPGRADE(TARG, SVt_PV);
 	SvGROW(TARG, (len * 2) + 1);
 	d = SvPVX(TARG);
-	if (IN_UTF8) {
+	if (DO_UTF8(sv)) {
 	    while (len) {
 		if (*s & 0x80) {
 		    STRLEN ulen = UTF8SKIP(s);
@@ -2525,6 +2540,7 @@ PP(pp_quotemeta)
 		    len--;
 		}
 	    }
+	    SvUTF8_on(TARG);
 	}
 	else {
 	    while (len--) {
@@ -3184,13 +3200,14 @@ PP(pp_reverse)
 	dTARGET;
 	STRLEN len;
 
+	SvUTF8_off(TARG);				/* decontaminate */
 	if (SP - MARK > 1)
 	    do_join(TARG, &PL_sv_no, MARK, SP);
 	else
 	    sv_setsv(TARG, (SP > MARK) ? *SP : DEFSV);
 	up = SvPV_force(TARG, len);
 	if (len > 1) {
-	    if (IN_UTF8) {	/* first reverse each character */
+	    if (DO_UTF8(TARG)) {	/* first reverse each character */
 		U8* s = (U8*)SvPVX(TARG);
 		U8* send = (U8*)(s + len);
 		while (s < send) {
diff --git a/pp_ctl.c b/pp_ctl.c
index 8e41646ccd..fd725a347c 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -114,6 +114,10 @@ PP(pp_regcomp)
 		PL_reginterp_cnt = I32_MAX; /* Mark as safe.  */
 
 	    pm->op_pmflags = pm->op_pmpermflags;	/* reset case sensitivity */
+	    if (DO_UTF8(tmpstr))
+		pm->op_pmdynflags |= PMdf_UTF8;
+	    else
+		pm->op_pmdynflags &= ~PMdf_UTF8;
 	    pm->op_pmregexp = CALLREGCOMP(aTHX_ t, t + len, pm);
 	    PL_reginterp_cnt = 0;		/* XXXX Be extra paranoid - needed
 					   inside tie/overload accessors.  */
@@ -296,7 +300,8 @@ PP(pp_formline)
     NV value;
     bool gotsome;
     STRLEN len;
-    STRLEN fudge = SvCUR(tmpForm) * (IN_UTF8 ? 3 : 1) + 1;
+    STRLEN fudge = SvCUR(tmpForm) * (IN_BYTE ? 1 : 3) + 1;
+    bool item_is_utf = FALSE;
 
     if (!SvMAGICAL(tmpForm) || !SvCOMPILED(tmpForm)) {
 	SvREADONLY_off(tmpForm);
@@ -374,7 +379,7 @@ PP(pp_formline)
 	case FF_CHECKNL:
 	    item = s = SvPV(sv, len);
 	    itemsize = len;
-	    if (IN_UTF8) {
+	    if (DO_UTF8(sv)) {
 		itemsize = sv_len_utf8(sv);
 		if (itemsize != len) {
 		    I32 itembytes;
@@ -393,11 +398,13 @@ PP(pp_formline)
 			    break;
 			s++;
 		    }
+		    item_is_utf = TRUE;
 		    itemsize = s - item;
 		    sv_pos_b2u(sv, &itemsize);
 		    break;
 		}
 	    }
+	    item_is_utf = FALSE;
 	    if (itemsize > fieldsize)
 		itemsize = fieldsize;
 	    send = chophere = s + itemsize;
@@ -414,7 +421,7 @@ PP(pp_formline)
 	case FF_CHECKCHOP:
 	    item = s = SvPV(sv, len);
 	    itemsize = len;
-	    if (IN_UTF8) {
+	    if (DO_UTF8(sv)) {
 		itemsize = sv_len_utf8(sv);
 		if (itemsize != len) {
 		    I32 itembytes;
@@ -452,9 +459,11 @@ PP(pp_formline)
 			itemsize = chophere - item;
 			sv_pos_b2u(sv, &itemsize);
 		    }
+		    item_is_utf = TRUE;
 		    break;
 		}
 	    }
+	    item_is_utf = FALSE;
 	    if (itemsize <= fieldsize) {
 		send = chophere = s + itemsize;
 		while (s < send) {
@@ -510,7 +519,7 @@ PP(pp_formline)
 	case FF_ITEM:
 	    arg = itemsize;
 	    s = item;
-	    if (IN_UTF8) {
+	    if (item_is_utf) {
 		while (arg--) {
 		    if (*s & 0x80) {
 			switch (UTF8SKIP(s)) {
@@ -553,6 +562,7 @@ PP(pp_formline)
 	case FF_LINEGLOB:
 	    item = s = SvPV(sv, len);
 	    itemsize = len;
+	    item_is_utf = FALSE;		/* XXX is this correct? */
 	    if (itemsize) {
 		gotsome = TRUE;
 		send = s + itemsize;
diff --git a/pp_hot.c b/pp_hot.c
index 18d717b356..6ebde09c29 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -2737,7 +2737,7 @@ S_method_common(pTHX_ SV* meth, U32* hashp)
 	    !(ob=(SV*)GvIO(iogv)))
 	{
 	    if (!packname || 
-		((*(U8*)packname >= 0xc0 && IN_UTF8)
+		((*(U8*)packname >= 0xc0 && DO_UTF8(sv))
 		    ? !isIDFIRST_utf8((U8*)packname)
 		    : !isIDFIRST(*packname)
 		))
diff --git a/regcomp.c b/regcomp.c
index 77a4bfc156..5972724410 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1339,8 +1339,10 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     if (exp == NULL)
 	FAIL("NULL regexp argument");
 
-    if (PL_curcop == &PL_compiling ? (PL_hints & HINT_UTF8) : IN_UTF8)
+    if (PL_curcop == &PL_compiling ? (PL_hints & HINT_UTF8) : IN_UTF8) {
 	PL_reg_flags |= RF_utf8;
+	pm->op_pmdynflags |= PMdf_UTF8;
+    }
     else
 	PL_reg_flags = 0;
 
diff --git a/sv.c b/sv.c
index 834dac3bd1..d76752fcf9 100644
--- a/sv.c
+++ b/sv.c
@@ -2769,7 +2769,7 @@ Perl_sv_setsv(pTHX_ SV *dstr, register SV *sstr)
 	    *SvEND(dstr) = '\0';
 	    (void)SvPOK_only(dstr);
 	}
-	if (SvUTF8(sstr))
+	if (DO_UTF8(sstr))
 	    SvUTF8_on(dstr);
 	/*SUPPRESS 560*/
 	if (sflags & SVp_NOK) {
@@ -5638,6 +5638,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
     STRLEN origlen;
     I32 svix = 0;
     static char nullstr[] = "(null)";
+    SV *argsv;
 
     /* no matter what, this is a string now */
     (void)SvPV_force(sv, origlen);
@@ -5652,12 +5653,18 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 		char *s = va_arg(*args, char*);
 		sv_catpv(sv, s ? s : nullstr);
 	    }
-	    else if (svix < svmax)
+	    else if (svix < svmax) {
 		sv_catsv(sv, *svargs);
+		if (DO_UTF8(*svargs))
+		    SvUTF8_on(sv);
+	    }
 	    return;
 	case '_':
 	    if (args) {
-		sv_catsv(sv, va_arg(*args, SV*));
+		argsv = va_arg(*args, SV*);
+		sv_catsv(sv, argsv);
+		if (DO_UTF8(argsv))
+		    SvUTF8_on(sv);
 		return;
 	    }
 	    /* See comment on '_' below */
@@ -5676,6 +5683,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 	STRLEN zeros = 0;
 	bool has_precis = FALSE;
 	STRLEN precis = 0;
+	bool is_utf = FALSE;
 
 	char esignbuf[4];
 	U8 utf8buf[10];
@@ -5816,22 +5824,20 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 	    goto string;
 
 	case 'c':
-	    if (IN_UTF8) {
-		if (args)
-		    uv = va_arg(*args, int);
-		else
-		    uv = (svix < svmax) ? SvIVx(svargs[svix++]) : 0;
-
+	    if (args)
+		uv = va_arg(*args, int);
+	    else
+		uv = (svix < svmax) ? SvIVx(svargs[svix++]) : 0;
+	    if (uv >= 128 && !IN_BYTE) {
 		eptr = (char*)utf8buf;
 		elen = uv_to_utf8((U8*)eptr, uv) - utf8buf;
-		goto string;
+		is_utf = TRUE;
+	    }
+	    else {
+		c = (char)uv;
+		eptr = &c;
+		elen = 1;
 	    }
-	    if (args)
-		c = va_arg(*args, int);
-	    else
-		c = (svix < svmax) ? SvIVx(svargs[svix++]) : 0;
-	    eptr = &c;
-	    elen = 1;
 	    goto string;
 
 	case 's':
@@ -5851,16 +5857,18 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 		}
 	    }
 	    else if (svix < svmax) {
-		eptr = SvPVx(svargs[svix++], elen);
-		if (IN_UTF8) {
+		argsv = svargs[svix++];
+		eptr = SvPVx(argsv, elen);
+		if (DO_UTF8(argsv)) {
 		    if (has_precis && precis < elen) {
 			I32 p = precis;
-			sv_pos_u2b(svargs[svix - 1], &p, 0); /* sticks at end */
+			sv_pos_u2b(argsv, &p, 0); /* sticks at end */
 			precis = p;
 		    }
 		    if (width) { /* fudge width (can't fudge elen) */
-			width += elen - sv_len_utf8(svargs[svix - 1]);
+			width += elen - sv_len_utf8(argsv);
 		    }
+		    is_utf = TRUE;
 		}
 	    }
 	    goto string;
@@ -5873,7 +5881,10 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 	     */
 	    if (!args)
 		goto unknown;
-	    eptr = SvPVx(va_arg(*args, SV*), elen);
+	    argsv = va_arg(*args,SV*);
+	    eptr = SvPVx(argsv, elen);
+	    if (DO_UTF8(argsv))
+		is_utf = TRUE;
 
 	string:
 	    if (has_precis && elen > precis)
@@ -6216,6 +6227,8 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
 	    memset(p, ' ', gap);
 	    p += gap;
 	}
+	if (is_utf)
+	    SvUTF8_on(sv);
 	*p = '\0';
 	SvCUR(sv) = p - SvPVX(sv);
     }
diff --git a/t/pragma/warn/doop b/t/pragma/warn/doop
index cce6bdc07c..c16e24f919 100644
--- a/t/pragma/warn/doop
+++ b/t/pragma/warn/doop
@@ -1,6 +1,6 @@
   doop.c	AOK
 
-  Malformed UTF-8 character
+  \x%s will produce malformed UTF-8 character; use \x{%s} for that
 
 
 __END__
@@ -9,7 +9,6 @@ use utf8 ;
 $_ = "\x80  \xff" ;
 chop ;
 EXPECT
-Malformed UTF-8 character at - line 4.
 ########
 # doop.c
 BEGIN {
@@ -28,4 +27,3 @@ chop ;
 EXPECT
 \x80 will produce malformed UTF-8 character; use \x{80} for that at - line 10.
 \xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10.
-Malformed UTF-8 character at - line 11.
diff --git a/t/pragma/warn/pp b/t/pragma/warn/pp
index eb09e059ba..4c70fd5d6f 100644
--- a/t/pragma/warn/pp
+++ b/t/pragma/warn/pp
@@ -30,7 +30,8 @@
 
   Mandatory Warnings
   ------------------
-  Malformed UTF-8 character
+  Malformed UTF-8 character (not tested: difficult to produce with
+                             perl now)
 
 __END__
 # pp.c
@@ -109,7 +110,6 @@ use utf8 ;
 $_ = "\x80  \xff" ;
 reverse ;
 EXPECT
-Malformed UTF-8 character at - line 4.
 ########
 # pp.c
 BEGIN {
@@ -128,4 +128,3 @@ reverse ;
 EXPECT
 \x80 will produce malformed UTF-8 character; use \x{80} for that at - line 10.
 \xff will produce malformed UTF-8 character; use \x{ff} for that at - line 10.
-Malformed UTF-8 character at - line 11.
diff --git a/t/pragma/warn/sv b/t/pragma/warn/sv
index 97d61bca17..cdec48e2c2 100644
--- a/t/pragma/warn/sv
+++ b/t/pragma/warn/sv
@@ -38,8 +38,8 @@
 
   Mandatory Warnings
   ------------------
-  Malformed UTF-8 character [sv_pos_b2u]
-      my $a = rindex "a\xff bc ", "bc" ;
+  Malformed UTF-8 character [sv_pos_b2u] (not tested: difficult to produce
+                                          with perl now)
 
   Mandatory Warnings TODO
   ------------------
@@ -286,8 +286,6 @@ $^W =0 ;
 my $a = rindex "a\xff bc ", "bc" ;
 EXPECT
 \xff will produce malformed UTF-8 character; use \x{ff} for that at - line 12.
-Malformed UTF-8 character at - line 12.
-Malformed UTF-8 character at - line 16.
 ########
 # sv.c
 use warnings 'misc';
diff --git a/t/pragma/warn/utf8 b/t/pragma/warn/utf8
index 19b8d1db3a..cb1f202b8d 100644
--- a/t/pragma/warn/utf8
+++ b/t/pragma/warn/utf8
@@ -1,14 +1,13 @@
 
   utf8.c AOK
 
-  All Mandatory warnings
- 
      [utf8_to_uv]
      Malformed UTF-8 character
 	my $a = ord "\x80" ;
 
      Malformed UTF-8 character
 	my $a = ord "\xf080" ;
+     <<<<<< this warning can't be easily triggered from perl anymore
 
      [utf16_to_utf8]
      Malformed UTF-16 surrogate		
@@ -19,7 +18,6 @@ __END__
 use utf8 ;
 my $a = ord "\x80" ;
 EXPECT
-Malformed UTF-8 character at - line 3.
 ########
 # utf8.c [utf8_to_uv]
 BEGIN {
@@ -37,15 +35,12 @@ my $a = ord "\x80" ;
     my $a = ord "\x80" ;
 }
 EXPECT
-Malformed UTF-8 character at - line 9.
 \x80 will produce malformed UTF-8 character; use \x{80} for that at - line 12.
-Malformed UTF-8 character at - line 12.
 ########
 # utf8.c [utf8_to_uv]
 use utf8 ;
 my $a = ord "\xf080" ;
 EXPECT
-Malformed UTF-8 character at - line 3.
 ########
 # utf8.c [utf8_to_uv]
 BEGIN {
@@ -63,6 +58,4 @@ my $a = ord "\xf080" ;
     my $a = ord "\xf080" ;
 }
 EXPECT
-Malformed UTF-8 character at - line 9.
 \xf0 will produce malformed UTF-8 character; use \x{f0} for that at - line 12.
-Malformed UTF-8 character at - line 12.
diff --git a/toke.c b/toke.c
index e7e217473e..cc370bc245 100644
--- a/toke.c
+++ b/toke.c
@@ -32,19 +32,8 @@ static void restore_rsfp(pTHXo_ void *f);
 #define XFAKEBRACK 128
 #define XENUMMASK 127
 
+/*#define UTF (SvUTF8(PL_linestr) && !(PL_hints & HINT_BYTE))*/
 #define UTF (PL_hints & HINT_UTF8)
-/*
- * Note: we try to be careful never to call the isXXX_utf8() functions
- * unless we're pretty sure we've seen the beginning of a UTF-8 character
- * (that is, the two high bits are set).  Otherwise we risk loading in the
- * heavy-duty SWASHINIT and SWASHGET routines unnecessarily.
- */
-#define isIDFIRST_lazy(p) ((!UTF || (*((U8*)p) < 0xc0)) \
-				? isIDFIRST(*(p)) \
-				: isIDFIRST_utf8((U8*)p))
-#define isALNUM_lazy(p) ((!UTF || (*((U8*)p) < 0xc0)) \
-				? isALNUM(*(p)) \
-				: isALNUM_utf8((U8*)p))
 
 /* In variables name $^X, these are the legal values for X.  
  * 1999-02-27 mjd-perl-patch@plover.com */
@@ -223,9 +212,9 @@ S_no_op(pTHX_ char *what, char *s)
     yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
     if (is_first)
 	Perl_warn(aTHX_ "\t(Missing semicolon on previous line?)\n");
-    else if (PL_oldoldbufptr && isIDFIRST_lazy(PL_oldoldbufptr)) {
+    else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
 	char *t;
-	for (t = PL_oldoldbufptr; *t && (isALNUM_lazy(t) || *t == ':'); t++) ;
+	for (t = PL_oldoldbufptr; *t && (isALNUM_lazy_if(t,UTF) || *t == ':'); t++) ;
 	if (t < PL_bufptr && isSPACE(*t))
 	    Perl_warn(aTHX_ "\t(Do you need to predeclare %.*s?)\n",
 		t - PL_oldoldbufptr, PL_oldoldbufptr);
@@ -633,7 +622,7 @@ S_check_uni(pTHX)
 	return;
     while (isSPACE(*PL_last_uni))
 	PL_last_uni++;
-    for (s = PL_last_uni; isALNUM_lazy(s) || *s == '-'; s++) ;
+    for (s = PL_last_uni; isALNUM_lazy_if(s,UTF) || *s == '-'; s++) ;
     if ((t = strchr(s, '(')) && t < PL_bufptr)
 	return;
     if (ckWARN_d(WARN_AMBIGUOUS)){
@@ -756,7 +745,7 @@ S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow
     
     start = skipspace(start);
     s = start;
-    if (isIDFIRST_lazy(s) ||
+    if (isIDFIRST_lazy_if(s,UTF) ||
 	(allow_pack && *s == ':') ||
 	(allow_initial_tick && *s == '\'') )
     {
@@ -1159,6 +1148,7 @@ S_scan_const(pTHX_ char *start)
     register char *s = start;			/* start of the constant */
     register char *d = SvPVX(sv);		/* destination for copies */
     bool dorange = FALSE;			/* are we in a translit range? */
+    bool has_utf = FALSE;			/* embedded \x{} */
     I32 len;					/* ? */
     I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
 	? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
@@ -1264,7 +1254,8 @@ S_scan_const(pTHX_ char *start)
 	}
 
 	/* check for embedded arrays (@foo, @:foo, @'foo, @{foo}, @$foo) */
-	else if (*s == '@' && s[1] && (isALNUM_lazy(s+1) || strchr(":'{$", s[1])))
+	else if (*s == '@' && s[1]
+		 && (isALNUM_lazy_if(s+1,UTF) || strchr(":'{$", s[1])))
 	    break;
 
 	/* check for embedded scalars.  only stop if we're sure it's a
@@ -1368,6 +1359,7 @@ S_scan_const(pTHX_ char *start)
 		    d = (char*)uv_to_utf8((U8*)d,
 					  (UV)scan_hex(s + 1, e - s - 1, &len));
 		    s = e + 1;
+		    has_utf = TRUE;
 		}
 		else {
 		    UV uv = (UV)scan_hex(s, 2, &len);
@@ -1375,6 +1367,7 @@ S_scan_const(pTHX_ char *start)
 			utf != (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
 		    {
 			d = (char*)uv_to_utf8((U8*)d, uv);	/* doing a CU or UC */
+			has_utf = TRUE;
 		    }
 		    else {
 			if (uv >= 127 && UTF) {
@@ -1485,6 +1478,8 @@ S_scan_const(pTHX_ char *start)
     *d = '\0';
     SvCUR_set(sv, d - SvPVX(sv));
     SvPOK_on(sv);
+    if (has_utf)
+	SvUTF8_on(sv);
 
     /* shrink the sv if we allocated more than we used */
     if (SvCUR(sv) + 5 < SvLEN(sv)) {
@@ -1593,7 +1588,7 @@ S_intuit_more(pTHX_ register char *s)
 	    case '&':
 	    case '$':
 		weight -= seen[un_char] * 10;
-		if (isALNUM_lazy(s+1)) {
+		if (isALNUM_lazy_if(s+1,UTF)) {
 		    scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
 		    if ((int)strlen(tmpbuf) > 1 && gv_fetchpv(tmpbuf,FALSE, SVt_PV))
 			weight -= 100;
@@ -2285,7 +2280,7 @@ Perl_yylex(pTHX)
   retry:
     switch (*s) {
     default:
-	if (isIDFIRST_lazy(s))
+	if (isIDFIRST_lazy_if(s,UTF))
 	    goto keylookup;
 	Perl_croak(aTHX_ "Unrecognized character \\x%02X", *s & 255);
     case 4:
@@ -2653,7 +2648,7 @@ Perl_yylex(pTHX)
 	else if (*s == '>') {
 	    s++;
 	    s = skipspace(s);
-	    if (isIDFIRST_lazy(s)) {
+	    if (isIDFIRST_lazy_if(s,UTF)) {
 		s = force_word(s,METHOD,FALSE,TRUE,FALSE);
 		TOKEN(ARROW);
 	    }
@@ -2749,7 +2744,7 @@ Perl_yylex(pTHX)
 	 grabattrs:
 	    s = skipspace(s);
 	    attrs = Nullop;
-	    while (isIDFIRST_lazy(s)) {
+	    while (isIDFIRST_lazy_if(s,UTF)) {
 		d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len);
 		if (isLOWER(*s) && (tmp = keyword(PL_tokenbuf, len))) {
 		    if (tmp < 0) tmp = -tmp;
@@ -2894,7 +2889,7 @@ Perl_yylex(pTHX)
 		while (d < PL_bufend && (*d == ' ' || *d == '\t'))
 		    d++;
 	    }
-	    if (d < PL_bufend && isIDFIRST_lazy(d)) {
+	    if (d < PL_bufend && isIDFIRST_lazy_if(d,UTF)) {
 		d = scan_word(d, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1,
 			      FALSE, &len);
 		while (d < PL_bufend && (*d == ' ' || *d == '\t'))
@@ -2985,9 +2980,9 @@ Perl_yylex(pTHX)
 		    }
 		    t++;
 		}
-		else if (isALNUM_lazy(t)) {
+		else if (isALNUM_lazy_if(t,UTF)) {
 		    t += UTF8SKIP(t);
-		    while (t < PL_bufend && isALNUM_lazy(t))
+		    while (t < PL_bufend && isALNUM_lazy_if(t,UTF))
 			 t += UTF8SKIP(t);
 		}
 		while (t < PL_bufend && isSPACE(*t))
@@ -3047,7 +3042,9 @@ Perl_yylex(pTHX)
 	    AOPERATOR(ANDAND);
 	s--;
 	if (PL_expect == XOPERATOR) {
-	    if (ckWARN(WARN_SEMICOLON) && isIDFIRST_lazy(s) && PL_bufptr == PL_linestart) {
+	    if (ckWARN(WARN_SEMICOLON)
+		&& isIDFIRST_lazy_if(s,UTF) && PL_bufptr == PL_linestart)
+	    {
 		CopLINE_dec(PL_curcop);
 		Perl_warner(aTHX_ WARN_SEMICOLON, PL_warn_nosemi);
 		CopLINE_inc(PL_curcop);
@@ -3177,7 +3174,7 @@ Perl_yylex(pTHX)
 	    }
 	}
 
-	if (s[1] == '#' && (isIDFIRST_lazy(s+2) || strchr("{$:+-", s[2]))) {
+	if (s[1] == '#' && (isIDFIRST_lazy_if(s+2,UTF) || strchr("{$:+-", s[2]))) {
 	    PL_tokenbuf[0] = '@';
 	    s = scan_ident(s + 1, PL_bufend, PL_tokenbuf + 1,
 			   sizeof PL_tokenbuf - 1, FALSE);
@@ -3220,7 +3217,7 @@ Perl_yylex(pTHX)
 		PL_tokenbuf[0] = '@';
 		if (ckWARN(WARN_SYNTAX)) {
 		    for(t = s + 1;
-			isSPACE(*t) || isALNUM_lazy(t) || *t == '$';
+			isSPACE(*t) || isALNUM_lazy_if(t,UTF) || *t == '$';
 			t++) ;
 		    if (*t++ == ',') {
 			PL_bufptr = skipspace(PL_bufptr);
@@ -3240,7 +3237,7 @@ Perl_yylex(pTHX)
 		    char tmpbuf[sizeof PL_tokenbuf];
 		    STRLEN len;
 		    for (t++; isSPACE(*t); t++) ;
-		    if (isIDFIRST_lazy(t)) {
+		    if (isIDFIRST_lazy_if(t,UTF)) {
 			t = scan_word(t, tmpbuf, sizeof tmpbuf, TRUE, &len);
 		        for (; isSPACE(*t); t++) ;
 			if (*t == ';' && get_cv(tmpbuf, FALSE))
@@ -3258,9 +3255,9 @@ Perl_yylex(pTHX)
 		PL_expect = XOPERATOR;
 	    else if (strchr("$@\"'`q", *s))
 		PL_expect = XTERM;		/* e.g. print $fh "foo" */
-	    else if (strchr("&*<%", *s) && isIDFIRST_lazy(s+1))
+	    else if (strchr("&*<%", *s) && isIDFIRST_lazy_if(s+1,UTF))
 		PL_expect = XTERM;		/* e.g. print $fh &sub */
-	    else if (isIDFIRST_lazy(s)) {
+	    else if (isIDFIRST_lazy_if(s,UTF)) {
 		char tmpbuf[sizeof PL_tokenbuf];
 		scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
 		if (tmp = keyword(tmpbuf, len)) {
@@ -3318,7 +3315,7 @@ Perl_yylex(pTHX)
 	    if (ckWARN(WARN_SYNTAX)) {
 		if (*s == '[' || *s == '{') {
 		    char *t = s + 1;
-		    while (*t && (isALNUM_lazy(t) || strchr(" \t$#+-'\"", *t)))
+		    while (*t && (isALNUM_lazy_if(t,UTF) || strchr(" \t$#+-'\"", *t)))
 			t++;
 		    if (*t == '}' || *t == ']') {
 			t++;
@@ -3339,7 +3336,8 @@ Perl_yylex(pTHX)
 	    /* Disable warning on "study /blah/" */
 	    if (PL_oldoldbufptr == PL_last_uni 
 		&& (*PL_last_uni != 's' || s - PL_last_uni < 5 
-		    || memNE(PL_last_uni, "study", 5) || isALNUM_lazy(PL_last_uni+5)))
+		    || memNE(PL_last_uni, "study", 5)
+		    || isALNUM_lazy_if(PL_last_uni+5,UTF)))
 		check_uni();
 	    s = scan_pat(s,OP_MATCH);
 	    TERM(sublex_start());
@@ -3665,7 +3663,7 @@ Perl_yylex(pTHX)
 
 		    /* Two barewords in a row may indicate method call. */
 
-		    if ((isIDFIRST_lazy(s) || *s == '$') && (tmp=intuit_method(s,gv)))
+		    if ((isIDFIRST_lazy_if(s,UTF) || *s == '$') && (tmp=intuit_method(s,gv)))
 			return tmp;
 
 		    /* If not a declared subroutine, it's an indirect object. */
@@ -3711,7 +3709,7 @@ Perl_yylex(pTHX)
 
 		/* If followed by a bareword, see if it looks like indir obj. */
 
-		if ((isIDFIRST_lazy(s) || *s == '$') && (tmp = intuit_method(s,gv)))
+		if ((isIDFIRST_lazy_if(s,UTF) || *s == '$') && (tmp = intuit_method(s,gv)))
 		    return tmp;
 
 		/* Not a method, so call it a subroutine (if defined) */
@@ -4045,7 +4043,7 @@ Perl_yylex(pTHX)
 	case KEY_foreach:
 	    yylval.ival = CopLINE(PL_curcop);
 	    s = skipspace(s);
-	    if (PL_expect == XSTATE && isIDFIRST_lazy(s)) {
+	    if (PL_expect == XSTATE && isIDFIRST_lazy_if(s,UTF)) {
 		char *p = s;
 		if ((PL_bufend - p) >= 3 &&
 		    strnEQ(p, "my", 2) && isSPACE(*(p + 2)))
@@ -4054,7 +4052,7 @@ Perl_yylex(pTHX)
 		    strnEQ(p, "our", 3) && isSPACE(*(p + 3)))
 		    p += 3;
 		p = skipspace(p);
-		if (isIDFIRST_lazy(p)) {
+		if (isIDFIRST_lazy_if(p,UTF)) {
 		    p = scan_ident(p, PL_bufend,
 			PL_tokenbuf, sizeof PL_tokenbuf, TRUE);
 		    p = skipspace(p);
@@ -4269,7 +4267,7 @@ Perl_yylex(pTHX)
 	case KEY_my:
 	    PL_in_my = tmp;
 	    s = skipspace(s);
-	    if (isIDFIRST_lazy(s)) {
+	    if (isIDFIRST_lazy_if(s,UTF)) {
 		s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
 		if (len == 3 && strnEQ(PL_tokenbuf, "sub", 3))
 		    goto really_sub;
@@ -4307,9 +4305,9 @@ Perl_yylex(pTHX)
 
 	case KEY_open:
 	    s = skipspace(s);
-	    if (isIDFIRST_lazy(s)) {
+	    if (isIDFIRST_lazy_if(s,UTF)) {
 		char *t;
-		for (d = s; isALNUM_lazy(d); d++) ;
+		for (d = s; isALNUM_lazy_if(d,UTF); d++) ;
 		t = skipspace(d);
 		if (strchr("|&*+-=!?:.", *t) && ckWARN_d(WARN_AMBIGUOUS))
 		    Perl_warner(aTHX_ WARN_AMBIGUOUS,
@@ -4448,7 +4446,7 @@ Perl_yylex(pTHX)
 	    else {
 		*PL_tokenbuf = '\0';
 		s = force_word(s,WORD,TRUE,TRUE,FALSE);
-		if (isIDFIRST_lazy(PL_tokenbuf))
+		if (isIDFIRST_lazy_if(PL_tokenbuf,UTF))
 		    gv_stashpvn(PL_tokenbuf, strlen(PL_tokenbuf), TRUE);
 		else if (*s == '<')
 		    yyerror("<> should be quotes");
@@ -4639,7 +4637,7 @@ Perl_yylex(pTHX)
 
 		s = skipspace(s);
 
-		if (isIDFIRST_lazy(s) || *s == '\'' ||
+		if (isIDFIRST_lazy_if(s,UTF) || *s == '\'' ||
 		    (*s == ':' && s[1] == ':'))
 		{
 		    PL_expect = XBLOCK;
@@ -5529,9 +5527,9 @@ S_checkcomma(pTHX_ register char *s, char *name, char *what)
 	s++;
     while (s < PL_bufend && isSPACE(*s))
 	s++;
-    if (isIDFIRST_lazy(s)) {
+    if (isIDFIRST_lazy_if(s,UTF)) {
 	w = s++;
-	while (isALNUM_lazy(s))
+	while (isALNUM_lazy_if(s,UTF))
 	    s++;
 	while (s < PL_bufend && isSPACE(*s))
 	    s++;
@@ -5653,7 +5651,7 @@ S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_packag
 	    Perl_croak(aTHX_ ident_too_long);
 	if (isALNUM(*s))	/* UTF handled below */
 	    *d++ = *s++;
-	else if (*s == '\'' && allow_package && isIDFIRST_lazy(s+1)) {
+	else if (*s == '\'' && allow_package && isIDFIRST_lazy_if(s+1,UTF)) {
 	    *d++ = ':';
 	    *d++ = ':';
 	    s++;
@@ -5705,7 +5703,7 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des
 		Perl_croak(aTHX_ ident_too_long);
 	    if (isALNUM(*s))	/* UTF handled below */
 		*d++ = *s++;
-	    else if (*s == '\'' && isIDFIRST_lazy(s+1)) {
+	    else if (*s == '\'' && isIDFIRST_lazy_if(s+1,UTF)) {
 		*d++ = ':';
 		*d++ = ':';
 		s++;
@@ -5736,7 +5734,7 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des
 	return s;
     }
     if (*s == '$' && s[1] &&
-	(isALNUM_lazy(s+1) || strchr("${", s[1]) || strnEQ(s+1,"::",2)) )
+	(isALNUM_lazy_if(s+1,UTF) || strchr("${", s[1]) || strnEQ(s+1,"::",2)) )
     {
 	return s;
     }
@@ -5763,11 +5761,11 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des
 		}
 	    }
 	}
-	if (isIDFIRST_lazy(d)) {
+	if (isIDFIRST_lazy_if(d,UTF)) {
 	    d++;
 	    if (UTF) {
 		e = s;
-		while (e < send && isALNUM_lazy(e) || *e == ':') {
+		while (e < send && isALNUM_lazy_if(e,UTF) || *e == ':') {
 		    e += UTF8SKIP(e);
 		    while (e < send && *e & 0x80 && is_utf8_mark((U8*)e))
 			e += UTF8SKIP(e);
@@ -6071,9 +6069,9 @@ S_scan_heredoc(pTHX_ register char *s)
 	    s++, term = '\'';
 	else
 	    term = '"';
-	if (!isALNUM_lazy(s))
+	if (!isALNUM_lazy_if(s,UTF))
 	    deprecate("bare << to mean <<\"\"");
-	for (; isALNUM_lazy(s); s++) {
+	for (; isALNUM_lazy_if(s,UTF); s++) {
 	    if (d < e)
 		*d++ = *s;
 	}
@@ -6284,7 +6282,7 @@ S_scan_inputsymbol(pTHX_ char *start)
     if (*d == '$' && d[1]) d++;
 
     /* allow <Pkg'VALUE> or <Pkg::VALUE> */
-    while (*d && (isALNUM_lazy(d) || *d == '\'' || *d == ':'))
+    while (*d && (isALNUM_lazy_if(d,UTF) || *d == '\'' || *d == ':'))
 	d++;
 
     /* If we've tried to read what we allow filehandles to look like, and
diff --git a/utf8.h b/utf8.h
index e71264c35c..578abc7783 100644
--- a/utf8.h
+++ b/utf8.h
@@ -28,5 +28,21 @@ END_EXTERN_C
 
 #define IN_UTF8 (PL_curcop->op_private & HINT_UTF8)
 #define IN_BYTE (PL_curcop->op_private & HINT_BYTE)
+#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE)
 
 #define UTF8SKIP(s) PL_utf8skip[*(U8*)s]
+
+/*
+ * Note: we try to be careful never to call the isXXX_utf8() functions
+ * unless we're pretty sure we've seen the beginning of a UTF-8 character
+ * (that is, the two high bits are set).  Otherwise we risk loading in the
+ * heavy-duty SWASHINIT and SWASHGET routines unnecessarily.
+ */
+#define isIDFIRST_lazy_if(p,c) ((!c || (*((U8*)p) < 0xc0)) \
+				? isIDFIRST(*(p)) \
+				: isIDFIRST_utf8((U8*)p))
+#define isALNUM_lazy_if(p,c)   ((!c || (*((U8*)p) < 0xc0)) \
+				? isALNUM(*(p)) \
+				: isALNUM_utf8((U8*)p))
+#define isIDFIRST_lazy(p)	isIDFIRST_lazy_if(p,1)
+#define isALNUM_lazy(p)		isALNUM_lazy_if(p,1)
author	Gurusamy Sarathy <gsar@cpan.org>	2000-01-31 04:57:42 +0000
committer	Gurusamy Sarathy <gsar@cpan.org>	2000-01-31 04:57:42 +0000
commit	7e2040f0b7c6fc88ec07b6e169aa2f75fc0130a4 (patch)
tree	de43e349e9f70e27ef30b2a0de9de2df628cc1c3
parent	8004f2ac219abdd8660c02a4a46ed97695dc379d (diff)
download	perl-7e2040f0b7c6fc88ec07b6e169aa2f75fc0130a4.tar.gz