What started as a small nit (the charnames test, nit found by Hugo), ballooned a bit... the goal is Larry's wish that illegal Unicode (such as U+FFFF) by default doesn't warn, since what if somebody WANTS to create illegal Unicode? Now getting close to this in the regex runtime. (Also, fix more of my fixation that BOM would be U+FFFE.)

p4raw-id: //depot/perl@15689
author: Jarkko Hietaniemi <jhi@iki.fi> 2002-04-02 20:35:13 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2002-04-02 20:35:13 +0000
commit: 872c91ae155f6880f8bf2b15c143bda5279a5794 (patch)
tree: 663505dbece632b3e75b0158fb36720c2d54829f /regexec.c
parent: b929be1d26aeff4c8c3d4bbd655731a23247ecc8 (diff)
download: perl-872c91ae155f6880f8bf2b15c143bda5279a5794.tar.gz
1 files changed, 51 insertions, 17 deletions
diff --git a/regexec.c b/regexec.c
index 8db2dc27b5..29b870436a 100644
--- a/regexec.c
+++ b/regexec.c
@@ -999,8 +999,10 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
 		to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
 
-		c1 = utf8_to_uvchr(tmpbuf1, 0);
-		c2 = utf8_to_uvchr(tmpbuf2, 0);
+		c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, 
+				    0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+		c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
+				    0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 	    }
 	    else {
 		c1 = *(U8*)m;
@@ -1037,7 +1039,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		
 		if (c1 == c2) {
 		    while (s <= e) {
-		        c = utf8_to_uvchr((U8*)s, &len);
+		        c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY);
 			if ( c == c1
 			     && (ln == len ||
 				 ibcmp_utf8(s, (char **)0, 0,  do_utf8,
@@ -1062,7 +1066,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		}
 		else {
 		    while (s <= e) {
-		        c = utf8_to_uvchr((U8*)s, &len);
+		      c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY);
 
 			/* Handle some of the three Greek sigmas cases.
 			 * Note that not all the possible combinations
@@ -2390,7 +2396,9 @@ S_regmatch(pTHX_ regnode *prog)
 			if (l >= PL_regeol)
 			     sayNO;
 			if (NATIVE_TO_UNI(*(U8*)s) !=
-			    utf8_to_uvuni((U8*)l, &ulen))
+			    utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY))
 			     sayNO;
 			l += ulen;
 			s ++;
@@ -2402,7 +2410,9 @@ S_regmatch(pTHX_ regnode *prog)
 			if (l >= PL_regeol)
 			    sayNO;
 			if (NATIVE_TO_UNI(*((U8*)l)) !=
-			    utf8_to_uvuni((U8*)s, &ulen))
+			    utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY))
 			    sayNO;
 			s += ulen;
 			l ++;
@@ -3545,11 +3555,17 @@ S_regmatch(pTHX_ regnode *prog)
 			     to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
 			     to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
 
-			     c1 = utf8_to_uvuni(tmpbuf1, 0);
-			     c2 = utf8_to_uvuni(tmpbuf2, 0);
+			     c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
+						 ckWARN(WARN_UTF8) ?
+						 0 : UTF8_ALLOW_ANY);
+			     c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
+						 ckWARN(WARN_UTF8) ?
+						 0 : UTF8_ALLOW_ANY);
 			}
 			else {
-			    c2 = c1 = utf8_to_uvchr(s, NULL);
+			    c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
+						     ckWARN(WARN_UTF8) ?
+						     0 : UTF8_ALLOW_ANY);
 			}
 		    }
 		}
@@ -3605,16 +3621,24 @@ S_regmatch(pTHX_ regnode *prog)
 			else {
 			    STRLEN len;
 			    if (c1 == c2) {
-				/* count initialised to utf8_distance(old, locinput) */
+				/* count initialised to
+				 * utf8_distance(old, locinput) */
 				while (locinput <= e &&
-				       utf8_to_uvchr((U8*)locinput, &len) != c1) {
+				       utf8n_to_uvchr((U8*)locinput,
+						      UTF8_MAXLEN, &len,
+						      ckWARN(WARN_UTF8) ?
+						      0 : UTF8_ALLOW_ANY) != c1) {
 				    locinput += len;
 				    count++;
 				}
 			    } else {
-				/* count initialised to utf8_distance(old, locinput) */
+				/* count initialised to
+				 * utf8_distance(old, locinput) */
 				while (locinput <= e) {
-				    UV c = utf8_to_uvchr((U8*)locinput, &len);
+				    UV c = utf8n_to_uvchr((U8*)locinput,
+							  UTF8_MAXLEN, &len,
+							  ckWARN(WARN_UTF8) ?
+							  0 : UTF8_ALLOW_ANY);
 				    if (c == c1 || c == c2)
 					break;
 				    locinput += len;
@@ -3648,7 +3672,10 @@ S_regmatch(pTHX_ regnode *prog)
 		    UV c;
 		    if (c1 != -1000) {
 			if (do_utf8)
-			    c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+			    c = utf8n_to_uvchr((U8*)PL_reginput,
+					       UTF8_MAXLEN, 0,
+					       ckWARN(WARN_UTF8) ?
+					       0 : UTF8_ALLOW_ANY);
 			else
 			    c = UCHARAT(PL_reginput);
 			/* If it could work, try it. */
@@ -3695,7 +3722,10 @@ S_regmatch(pTHX_ regnode *prog)
 		    while (n >= ln) {
 			if (c1 != -1000) {
 			    if (do_utf8)
-				c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+				c = utf8n_to_uvchr((U8*)PL_reginput,
+						   UTF8_MAXLEN, 0,
+						   ckWARN(WARN_UTF8) ?
+						   0 : UTF8_ALLOW_ANY);
 			    else
 				c = UCHARAT(PL_reginput);
 			}
@@ -3715,7 +3745,10 @@ S_regmatch(pTHX_ regnode *prog)
 		    while (n >= ln) {
 			if (c1 != -1000) {
 			    if (do_utf8)
-				c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+				c = utf8n_to_uvchr((U8*)PL_reginput,
+						   UTF8_MAXLEN, 0,
+						   ckWARN(WARN_UTF8) ?
+						   0 : UTF8_ALLOW_ANY);
 			    else
 				c = UCHARAT(PL_reginput);
 			}
@@ -4297,7 +4330,8 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
     STRLEN len = 0;
     STRLEN plen;
 
-    c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
+    c = do_utf8 ? utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
+				 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY) : *p;
 
     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
     if (do_utf8 || (flags & ANYOF_UNICODE)) {
author	Jarkko Hietaniemi <jhi@iki.fi>	2002-04-02 20:35:13 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	2002-04-02 20:35:13 +0000
commit	872c91ae155f6880f8bf2b15c143bda5279a5794 (patch)
tree	663505dbece632b3e75b0158fb36720c2d54829f /regexec.c
parent	b929be1d26aeff4c8c3d4bbd655731a23247ecc8 (diff)
download	perl-872c91ae155f6880f8bf2b15c143bda5279a5794.tar.gz