More EBCDIC stuff:

- Loose the extra level of function on ASCII. - spotted a chr(0) issue in sv.c - re-work of UTF-X tr/// ranges to work in Unicode space. Still issues with the "0xff is illegal UTF-8" hack. - Yet another ad. hoc. utf8 'upgrade' in op.c recoded (why do it once when you can do it all over the place :-( - Enable HINTS_UTF8 on EBCDIC - then ignore it in toke.c, need utf8.pm for swashes. - Simplified and commented scan_const() in toke.c Still something wrong regexp and tr (swashes?). p4raw-id: //depot/perlio@9267
author: Nick Ing-Simmons <nik@tiuk.ti.com> 2001-03-20 20:04:39 +0000
committer: Nick Ing-Simmons <nik@tiuk.ti.com> 2001-03-20 20:04:39 +0000
commit: 2b9d42f0ba1bb562fe21327dc7948ab1a5397a19 (patch)
tree: aee45626e3738deabafbe610cedef159d4c82d3b
parent: f2f6ab5ed2d2f824b4f6c3085a4a2275c2f8500a (diff)
download: perl-2b9d42f0ba1bb562fe21327dc7948ab1a5397a19.tar.gz
10 files changed, 188 insertions, 157 deletions
diff --git a/embed.pl b/embed.pl
index 9557a2b369..cb2b4a71db 100755
--- a/embed.pl
+++ b/embed.pl
@@ -1260,6 +1260,8 @@ _EOE_
 
 close(DOC);
 
+unlink "pod/perlintern.pod";
+
 open(GUTS, ">pod/perlintern.pod") or
 		die "Unable to create pod/perlintern.pod: $!\n";
 print GUTS <<'END';
diff --git a/lib/utf8.pm b/lib/utf8.pm
index 7c9a7dfb4e..f32c0bc5b7 100644
--- a/lib/utf8.pm
+++ b/lib/utf8.pm
@@ -1,6 +1,5 @@
 package utf8;
 
-if (ord('A') != 193) { # make things more pragmatic for EBCDIC folk
 
 $utf8::hint_bits = 0x00800000;
 
@@ -21,8 +20,6 @@ sub AUTOLOAD {
     Carp::croak("Undefined subroutine $AUTOLOAD called");
 }
 
-}
-
 1;
 __END__
 
diff --git a/op.c b/op.c
index 5cdb3df2f7..59643e78b1 100644
--- a/op.c
+++ b/op.c
@@ -114,12 +114,12 @@ S_trlist_upgrade(pTHX_ U8** sp, U8** ep)
     *sp = d;
 
     while (s < e) {
-        if (*s < 0x80 || *s == 0xff)
+        if (NATIVE_IS_INVARIANT(*s) || NATIVE_TO_UTF(*s) == 0xff)
             *d++ = *s++;
 	else {
-            U8 c = *s++;
-            *d++ = ((c >> 6)         | 0xc0);
-            *d++ = ((c       & 0x3f) | 0x80);
+	    U8 c = NATIVE_TO_ASCII(*s++);
+	    *d++ = UTF8_EIGHT_BIT_HI(c);
+	    *d++ = UTF8_EIGHT_BIT_LO(c);
         }
     }
     *ep = d;
@@ -2650,15 +2650,16 @@ Perl_newBINOP(pTHX_ I32 type, I32 flags, OP *first, OP *last)
 }
 
 static int
-utf8compare(const void *a, const void *b)
-{
-    int i;
-    for (i = 0; i < 10; i++) {
-	if ((*(U8**)a)[i] < (*(U8**)b)[i])
-	    return -1;
-	if ((*(U8**)a)[i] > (*(U8**)b)[i])
-	    return 1;
-    }
+uvcompare(const void *a, const void *b)
+{
+    if (*((UV *)a) < (*(UV *)b))
+	return -1;
+    if (*((UV *)a) > (*(UV *)b))
+	return 1;
+    if (*((UV *)a+1) < (*(UV *)b+1))
+	return -1;
+    if (*((UV *)a+1) > (*(UV *)b+1))
+	return 1;
     return 0;
 }
 
@@ -2712,47 +2713,57 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	U8* tsave = from_utf ? NULL : trlist_upgrade(&t, &tend);
 	U8* rsave = (to_utf || !rlen) ? NULL : trlist_upgrade(&r, &rend);
 
+/* There are several snags with this code on EBCDIC:
+   1. 0xFF is a legal UTF-EBCDIC byte (there are no illegal bytes).
+   2. scan_const() in toke.c has encoded chars in native encoding which makes
+      ranges at least in EBCDIC 0..255 range the bottom odd.
+*/
+
 	if (complement) {
 	    U8 tmpbuf[UTF8_MAXLEN+1];
-	    U8** cp;
+	    UV *cp;
 	    UV nextmin = 0;
-	    New(1109, cp, tlen, U8*);
+	    New(1109, cp, 2*tlen, UV);
 	    i = 0;
 	    transv = newSVpvn("",0);
 	    while (t < tend) {
-		cp[i++] = t;
-		t += UTF8SKIP(t);
-		if (t < tend && *t == 0xff) {
+		cp[2*i] = utf8n_to_uvuni(t, tend-t, &ulen, 0);
+		t += ulen;
+		if (t < tend && NATIVE_TO_UTF(*t) == 0xff) {
 		    t++;
-		    t += UTF8SKIP(t);
+		    cp[2*i+1] = utf8n_to_uvuni(t, tend-t, &ulen, 0);
+		    t += ulen;
 		}
+		else {
+		 cp[2*i+1] = cp[2*i];
+		}
+		i++;
 	    }
-	    qsort(cp, i, sizeof(U8*), utf8compare);
+	    qsort(cp, i, 2*sizeof(UV), uvcompare);
 	    for (j = 0; j < i; j++) {
-		U8 *s = cp[j];
-		I32 cur = j < i - 1 ? cp[j+1] - s : tend - s;
-		/* CHECKME: Use unicode code points for ranges - needs more thought ... NI-S */
-		UV  val = utf8n_to_uvuni(s, cur, &ulen, 0);
-		s += ulen;
+		UV  val = cp[2*j];
 		diff = val - nextmin;
 		if (diff > 0) {
 		    t = uvuni_to_utf8(tmpbuf,nextmin);
 		    sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
 		    if (diff > 1) {
+			U8  range_mark = UTF_TO_NATIVE(0xff);
 			t = uvuni_to_utf8(tmpbuf, val - 1);
-			sv_catpvn(transv, "\377", 1);
+			sv_catpvn(transv, (char *)&range_mark, 1);
 			sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
 		    }
 	        }
-		if (s < tend && *s == 0xff)
-		    val = utf8n_to_uvuni(s+1, cur - 1, &ulen, 0);
+		val = cp[2*j+1];
 		if (val >= nextmin)
 		    nextmin = val + 1;
 	    }
 	    t = uvuni_to_utf8(tmpbuf,nextmin);
 	    sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
+	    {
+		U8 range_mark = UTF_TO_NATIVE(0xff);
+		sv_catpvn(transv, (char *)&range_mark, 1);
+	    }
 	    t = uvuni_to_utf8(tmpbuf, 0x7fffffff);
-	    sv_catpvn(transv, "\377", 1);
 	    sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
 	    t = (U8*)SvPVX(transv);
 	    tlen = SvCUR(transv);
@@ -2775,7 +2786,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 	    if (tfirst > tlast) {
 		tfirst = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0);
 		t += ulen;
-		if (t < tend && *t == 0xff) {	/* illegal utf8 val indicates range */
+		if (t < tend && NATIVE_TO_UTF(*t) == 0xff) {	/* illegal utf8 val indicates range */
 		    t++;
 		    tlast = (I32)utf8n_to_uvuni(t, tend - t, &ulen, 0);
 		    t += ulen;
@@ -2789,7 +2800,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 		if (r < rend) {
 		    rfirst = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0);
 		    r += ulen;
-		    if (r < rend && *r == 0xff) {	/* illegal utf8 val indicates range */
+		    if (r < rend && NATIVE_TO_UTF(*r) == 0xff) {	/* illegal utf8 val indicates range */
 			r++;
 			rlast = (I32)utf8n_to_uvuni(r, rend - r, &ulen, 0);
 			r += ulen;
@@ -7040,3 +7051,4 @@ const_sv_xsub(pTHXo_ CV* cv)
     ST(0) = (SV*)XSANY.any_ptr;
     XSRETURN(1);
 }
+
diff --git a/regcomp.c b/regcomp.c
index 19ad253929..33765fff9d 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2920,9 +2920,7 @@ tryagain:
 		if (ISMULT2(p)) { /* Back off on ?+*. */
 		    if (len)
 			p = oldp;
-		    /* ender is a Unicode value so it can be > 0xff --
-		     * in other words, do not use UTF8_IS_CONTINUED(). */
-		    else if (NATIVE_TO_ASCII(ender) >= 0x80 && UTF) {
+		    else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
 			reguni(pRExC_state, ender, s, &numlen);
 			s += numlen;
 			len += numlen;
@@ -2933,9 +2931,7 @@ tryagain:
 		    }
 		    break;
 		}
-		/* ender is a Unicode value so it can be > 0xff --
-		 * in other words, do not use UTF8_IS_CONTINUED(). */
-		if (NATIVE_TO_ASCII(ender) >= 0x80 && UTF) {
+		if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
 		    reguni(pRExC_state, ender, s, &numlen);
 		    s += numlen;
 		    len += numlen - 1;
@@ -4251,7 +4247,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
 		    U8 s[UTF8_MAXLEN+1];
 		
 		    for (i = 0; i <= 256; i++) { /* just the first 256 */
-			U8 *e = uvuni_to_utf8(s, i);
+			U8 *e = uvchr_to_utf8(s, i);
 			
 			if (i < 256 && swash_fetch(sw, s)) {
 			    if (rangestart == -1)
@@ -4261,14 +4257,14 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
 			
 			    if (i <= rangestart + 3)
 				for (; rangestart < i; rangestart++) {
-				    for(e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
+				    for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
 					put_byte(sv, *p);
 				}
 			    else {
-				for (e = uvuni_to_utf8(s, rangestart), p = s; p < e; p++)
+				for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
 				    put_byte(sv, *p);
 				sv_catpv(sv, "-");
-				    for (e = uvuni_to_utf8(s, i - 1), p = s; p < e; p++)
+				    for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
 					put_byte(sv, *p);
 				}
 				rangestart = -1;
diff --git a/regexec.c b/regexec.c
index 2420f8d3c9..cfe77f5c35 100644
--- a/regexec.c
+++ b/regexec.c
@@ -950,7 +950,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		else {
 		    U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
 		
-		    tmp = (I32)utf8n_to_uvuni(r, s - (char*)r, 0, 0);
+		    tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
 		}
 		tmp = ((OP(c) == BOUND ?
 			isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
@@ -993,7 +993,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		else {
 		    U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
 		
-		    tmp = (I32)utf8n_to_uvuni(r, s - (char*)r, 0, 0);
+		    tmp = (I32)utf8n_to_uvchr(r, s - (char*)r, 0, 0);
 		}
 		tmp = ((OP(c) == NBOUND ?
 			isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
@@ -1433,7 +1433,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
     else {
         if (prog->reganch & ROPT_UTF8 && do_utf8) {
 	    U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg);
-	    PL_regprev = utf8n_to_uvuni(s, (U8*)stringarg - s, NULL, 0);
+	    PL_regprev = utf8n_to_uvchr(s, (U8*)stringarg - s, NULL, 0);
 	}
 	else
 	    PL_regprev = (U32)stringarg[-1];
@@ -2161,7 +2161,7 @@ S_regmatch(pTHX_ regnode *prog)
 		    if (l >= PL_regeol) {
 			sayNO;
 		    }
-		    if ((UTF ? utf8n_to_uvuni((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
+		    if ((UTF ? utf8n_to_uvchr((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
 			(c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
 			    sayNO;
 		    s += UTF ? UTF8SKIP(s) : 1;
@@ -2263,7 +2263,7 @@ S_regmatch(pTHX_ regnode *prog)
 		else {
 		    U8 *r = reghop((U8*)locinput, -1);
 		
-		    ln = utf8n_to_uvuni(r, s - (char*)r, 0, 0);
+		    ln = utf8n_to_uvchr(r, s - (char*)r, 0, 0);
 		}
 		if (OP(scan) == BOUND || OP(scan) == NBOUND) {
 		    ln = isALNUM_uni(ln);
diff --git a/sv.c b/sv.c
index 6d6d39e9f5..315c85b1a0 100644
--- a/sv.c
+++ b/sv.c
@@ -4733,8 +4733,9 @@ Perl_sv_pos_b2u(pTHX_ register SV *sv, I32* offsetp)
     len = 0;
     while (s < send) {
 	STRLEN n;
-        /* We can use low level directly here as we are not looking at the values */
-	if (utf8n_to_uvuni(s, UTF8SKIP(s), &n, 0)) {
+	/* Call utf8n_to_uvchr() to validate the sequence */
+	utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0);
+	if (n > 0) {
 	    s += n;
 	    len++;
 	}
diff --git a/toke.c b/toke.c
index ea0f65097f..00fe0b5504 100644
--- a/toke.c
+++ b/toke.c
@@ -36,8 +36,12 @@ static I32 utf16rev_textfilter(pTHXo_ int idx, SV *sv, int maxlen);
 #define XFAKEBRACK 128
 #define XENUMMASK 127
 
-/*#define UTF (SvUTF8(PL_linestr) && !(PL_hints & HINT_BYTE))*/
-#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || PL_hints & HINT_UTF8)
+#ifdef EBCDIC
+/* For now 'use utf8' does not affect tokenizer on EBCDIC */
+#define UTF (PL_linestr && DO_UTF8(PL_linestr))
+#else
+#define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || (PL_hints & HINT_UTF8))
+#endif
 
 /* In variables name $^X, these are the legal values for X.
  * 1999-02-27 mjd-perl-patch@plover.com */
@@ -1218,22 +1222,22 @@ S_scan_const(pTHX_ char *start)
     register char *d = SvPVX(sv);		/* destination for copies */
     bool dorange = FALSE;			/* are we in a translit range? */
     bool didrange = FALSE;		        /* did we just finish a range? */
-    bool has_utf8 = (PL_linestr && SvUTF8(PL_linestr));
-						/* the constant is UTF8 */
+    I32  has_utf8 = FALSE;			/* Output constant is UTF8 */
+    I32  this_utf8 = UTF;			/* The source string is assumed to be UTF8 */
     UV uv;
 
-    I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
-	? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
-	: UTF;
-    I32 this_utf8 = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
-	? (PL_sublex_info.sub_op->op_private & (PL_lex_repl ?
-						OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF))
-	: UTF;
     const char *leaveit =	/* set of acceptably-backslashed characters */
 	PL_lex_inpat
 	    ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#"
 	    : "";
 
+    if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
+	/* If we are doing a trans and we know we want UTF8 set expectation */
+	has_utf8   = PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF);
+	this_utf8  = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
+    }
+
+
     while (s < send || dorange) {
         /* get transliterations out of the way (they're most literal) */
 	if (PL_lex_inwhat == OP_TRANS) {
@@ -1243,17 +1247,18 @@ S_scan_const(pTHX_ char *start)
 		I32 min;			/* first character in range */
 		I32 max;			/* last character in range */
 
-		if (utf) {
+		if (has_utf8) {
 		    char *c = (char*)utf8_hop((U8*)d, -1);
 		    char *e = d++;
 		    while (e-- > c)
 			*(e + 1) = *e;
-		    *c = (char)0xff;
+		    *c = UTF_TO_NATIVE(0xff);
 		    /* mark the range as done, and continue */
 		    dorange = FALSE;
 		    didrange = TRUE;
 		    continue;
 		}
+
 		i = d - SvPVX(sv);		/* remember current offset */
 		SvGROW(sv, SvLEN(sv) + 256);	/* never more than 256 chars in a range */
 		d = SvPVX(sv) + i;		/* refresh d after realloc */
@@ -1297,8 +1302,8 @@ S_scan_const(pTHX_ char *start)
 		if (didrange) {
 		    Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
 		}
-		if (utf) {
-		    *d++ = (char)0xff;	/* use illegal utf8 byte--see pmtrans */
+		if (has_utf8) {
+		    *d++ = UTF_TO_NATIVE(0xff);	/* use illegal utf8 byte--see pmtrans */
 		    s++;
 		    continue;
 		}
@@ -1368,6 +1373,8 @@ S_scan_const(pTHX_ char *start)
 		break;		/* in regexp, $ might be tail anchor */
 	}
 
+	/* End of else if chain - OP_TRANS rejoin rest */
+
 	/* backslashes */
 	if (*s == '\\' && s+1 < send) {
 	    s++;
@@ -1502,7 +1509,6 @@ S_scan_const(pTHX_ char *start)
 			    PL_sublex_info.sub_op->op_private |=
 				(PL_lex_repl ? OPpTRANS_FROM_UTF
 					     : OPpTRANS_TO_UTF);
-			    utf = TRUE;
 			}
                     }
 		    else {
@@ -1603,40 +1609,40 @@ S_scan_const(pTHX_ char *start)
 	} /* end if (backslash) */
 
     default_action:
-       if (!UTF8_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
-           STRLEN len = (STRLEN) -1;
-           UV uv;
-           if (this_utf8) {
-               uv = utf8n_to_uvchr((U8*)s, send - s, &len, 0);
-           }
-           if (len == (STRLEN)-1) {
-               /* Illegal UTF8 (a high-bit byte), make it valid. */
-               char *old_pvx = SvPVX(sv);
-               /* need space for one extra char (NOTE: SvCUR() not set here) */
-               d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
-               d = (char*)uvchr_to_utf8((U8*)d, (U8)*s++);
-           }
-           else {
-               while (len--)
-                   *d++ = *s++;
-           }
-           has_utf8 = TRUE;
-	   if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
-	       PL_sublex_info.sub_op->op_private |=
-		   (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
-	       utf = TRUE;
-	   }
-           continue;
-       }
-       *d++ = NATIVE_TO_NEED(has_utf8,*s++);
+	/* If we started with encoded form, or already know we want it
+	   and then encode the next character */
+	if ((has_utf8 || this_utf8) && !NATIVE_IS_INVARIANT((U8)(*s))) {
+	    STRLEN len  = 1;
+	    UV uv       = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s);
+	    STRLEN need = UNISKIP(NATIVE_TO_UNI(uv));
+	    s += len;
+	    if (need > len) {
+		/* encoded value larger than old, need extra space (NOTE: SvCUR() not set here) */
+		STRLEN off = d - SvPVX(sv);
+		d = SvGROW(sv, SvLEN(sv) + (need-len)) + off;
+	    }
+	    d = (char*)uvchr_to_utf8((U8*)d, uv);
+	    has_utf8 = TRUE;
+	}
+	else {
+	    *d++ = NATIVE_TO_NEED(has_utf8,*s++);
+	}
     } /* while loop to process each character */
 
     /* terminate the string and set up the sv */
     *d = '\0';
     SvCUR_set(sv, d - SvPVX(sv));
+    if (SvCUR(sv) >= SvLEN(sv))
+      Perl_croak(aTHX_ "panic:constant overflowed allocated space");
+
     SvPOK_on(sv);
-    if (has_utf8)
+    if (has_utf8) {
 	SvUTF8_on(sv);
+	if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
+		PL_sublex_info.sub_op->op_private |=
+		    (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
+	}
+    }
 
     /* shrink the sv if we allocated more than we used */
     if (SvCUR(sv) + 5 < SvLEN(sv)) {
diff --git a/utf8.c b/utf8.c
index 01afa010be..b95c7ad164 100644
--- a/utf8.c
+++ b/utf8.c
@@ -131,28 +131,6 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 #endif /* Loop style */
 }
 
-/*
-=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
-
-Adds the UTF8 representation of the Native codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
-
-    d = uvchr_to_utf8(d, uv);
-
-is the recommended wide native character-aware way of saying
-
-    *(d++) = uv;
-
-=cut
-*/
-
-U8 *
-Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
-{
-    return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
-}
 
 
 /*
@@ -461,25 +439,6 @@ malformed:
 }
 
 /*
-=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
-
-Returns the native character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-Allows length and flags to be passed to low level routine.
-
-=cut
-*/
-
-UV
-Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
-{
-    UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
-    return UNI_TO_NATIVE(uv);
-}
-
-/*
 =for apidoc A|U8* s|utf8_to_uvchr|STRLEN *retlen
 
 Returns the native character value of the first character in the string C<s>
@@ -835,7 +794,7 @@ bool
 Perl_is_uni_alnum(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_alnum(tmpbuf);
 }
 
@@ -843,7 +802,7 @@ bool
 Perl_is_uni_alnumc(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_alnumc(tmpbuf);
 }
 
@@ -851,7 +810,7 @@ bool
 Perl_is_uni_idfirst(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_idfirst(tmpbuf);
 }
 
@@ -859,7 +818,7 @@ bool
 Perl_is_uni_alpha(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_alpha(tmpbuf);
 }
 
@@ -867,7 +826,7 @@ bool
 Perl_is_uni_ascii(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_ascii(tmpbuf);
 }
 
@@ -875,7 +834,7 @@ bool
 Perl_is_uni_space(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_space(tmpbuf);
 }
 
@@ -883,7 +842,7 @@ bool
 Perl_is_uni_digit(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_digit(tmpbuf);
 }
 
@@ -891,7 +850,7 @@ bool
 Perl_is_uni_upper(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_upper(tmpbuf);
 }
 
@@ -899,7 +858,7 @@ bool
 Perl_is_uni_lower(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_lower(tmpbuf);
 }
 
@@ -907,7 +866,7 @@ bool
 Perl_is_uni_cntrl(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_cntrl(tmpbuf);
 }
 
@@ -915,7 +874,7 @@ bool
 Perl_is_uni_graph(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_graph(tmpbuf);
 }
 
@@ -923,7 +882,7 @@ bool
 Perl_is_uni_print(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_print(tmpbuf);
 }
 
@@ -931,7 +890,7 @@ bool
 Perl_is_uni_punct(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_punct(tmpbuf);
 }
 
@@ -939,7 +898,7 @@ bool
 Perl_is_uni_xdigit(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return is_utf8_xdigit(tmpbuf);
 }
 
@@ -947,7 +906,7 @@ U32
 Perl_to_uni_upper(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return to_utf8_upper(tmpbuf);
 }
 
@@ -955,7 +914,7 @@ U32
 Perl_to_uni_title(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return to_utf8_title(tmpbuf);
 }
 
@@ -963,7 +922,7 @@ U32
 Perl_to_uni_lower(pTHX_ U32 c)
 {
     U8 tmpbuf[UTF8_MAXLEN+1];
-    uvuni_to_utf8(tmpbuf, (UV)c);
+    uvchr_to_utf8(tmpbuf, (UV)c);
     return to_utf8_lower(tmpbuf);
 }
 
@@ -1352,6 +1311,10 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
 	/* If not cached, generate it via utf8::SWASHGET */
 	if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
 	    dSP;
+	    /* We use utf8n_to_uvuni() as we want an index into
+	       Unicode tables, not a native character number.
+	     */
+	    UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
 	    ENTER;
 	    SAVETMPS;
 	    save_re_context();
@@ -1359,10 +1322,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
 	    PUSHMARK(SP);
 	    EXTEND(SP,3);
 	    PUSHs((SV*)sv);
-	    /* We call utf8_to_uni as we want and index into Unicode tables,
-	       not a native character number.
-	     */
-	    PUSHs(sv_2mortal(newSViv(utf8_to_uvuni(ptr, 0) & ~(needents - 1))));
+	    PUSHs(sv_2mortal(newSViv(code_point & ~(needents - 1))));
 	    PUSHs(sv_2mortal(newSViv(needents)));
 	    PUTBACK;
 	    if (call_method("SWASHGET", G_SCALAR))
@@ -1406,3 +1366,56 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
     Perl_croak(aTHX_ "panic: swash_fetch");
     return 0;
 }
+
+
+/*
+=for apidoc A|U8*|uvchr_to_utf8|U8 *d|UV uv
+
+Adds the UTF8 representation of the Native codepoint C<uv> to the end
+of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
+
+    d = uvchr_to_utf8(d, uv);
+
+is the recommended wide native character-aware way of saying
+
+    *(d++) = uv;
+
+=cut
+*/
+
+/* On ASCII machines this is normally a macro but we want a
+   real function in case XS code wants it
+*/
+#undef Perl_uvchr_to_utf8
+U8 *
+Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
+{
+    return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
+}
+
+
+/*
+=for apidoc A|U8* s|utf8n_to_uvchr|STRLEN curlen, STRLEN *retlen, U32 flags
+
+Returns the native character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+Allows length and flags to be passed to low level routine.
+
+=cut
+*/
+/* On ASCII machines this is normally a macro but we want a
+   real function in case XS code wants it
+*/
+#undef Perl_utf8n_to_uvchr
+UV
+Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
+{
+    UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
+    return UNI_TO_NATIVE(uv);
+}
+
+
diff --git a/utf8.h b/utf8.h
index 46bc8289e9..95fe5706f5 100644
--- a/utf8.h
+++ b/utf8.h
@@ -48,6 +48,10 @@ END_EXTERN_C
 #define NATIVE_TO_NEED(enc,ch)   (ch)
 #define ASCII_TO_NEED(enc,ch)    (ch)
 
+/* As there are no translations avoid the function wrapper */
+#define Perl_utf8n_to_uvchr Perl_utf8n_to_uvuni
+#define Perl_uvchr_to_utf8  Perl_uvuni_to_utf8
+
 /*
 
  The following table is from Unicode 3.1.
diff --git a/utfebcdic.h b/utfebcdic.h
index ef67cb2c35..0dd73d2bb0 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -274,7 +274,7 @@ END_EXTERN_C
 #define NATIVE_IS_INVARIANT(c)		UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))
 #define UTF8_IS_INVARIANT(c)		UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
 #define UTF8_IS_START(c)		(NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) != 0xA0)
-#define UTF8_IS_CONTINUATION(c)		(NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) == 0xA0)
+#define UTF8_IS_CONTINUATION(c)		((NATIVE_TO_UTF(c) & 0xE0) == 0xA0)
 #define UTF8_IS_CONTINUED(c) 		(NATIVE_TO_UTF(c) >= 0xA0)
 #define UTF8_IS_DOWNGRADEABLE_START(c)	(NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xF8) == 0xC0)
author	Nick Ing-Simmons <nik@tiuk.ti.com>	2001-03-20 20:04:39 +0000
committer	Nick Ing-Simmons <nik@tiuk.ti.com>	2001-03-20 20:04:39 +0000
commit	2b9d42f0ba1bb562fe21327dc7948ab1a5397a19 (patch)
tree	aee45626e3738deabafbe610cedef159d4c82d3b
parent	f2f6ab5ed2d2f824b4f6c3085a4a2275c2f8500a (diff)
download	perl-2b9d42f0ba1bb562fe21327dc7948ab1a5397a19.tar.gz