Misapplied regex optimizations when \C is present.

Fixes 20001230.002. What still remains broken is that the submatches that have \C in them get their UTF8 flag on because their parent SV has it on. This will result in malformed UTF8 if a \C happened to match a non-ASCII byte. p4raw-id: //depot/perl@8836
author: Jarkko Hietaniemi <jhi@iki.fi> 2001-02-18 22:11:20 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2001-02-18 22:11:20 +0000
commit: 3baa4c62cda542368be1e7e1f7af8bd8257c2ff4 (patch)
tree: 36c534fdd1b4be5046f1c67dcdf0be0722f4dc97
parent: 5da9da9e9f46681684e0c487fd55df8db6f9de67 (diff)
download: perl-3baa4c62cda542368be1e7e1f7af8bd8257c2ff4.tar.gz
6 files changed, 86 insertions, 24 deletions
diff --git a/regcomp.c b/regcomp.c
index 547f756a30..69d114e9e9 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1930,6 +1930,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
 	r->reganch |= ROPT_LOOKBEHIND_SEEN;
     if (RExC_seen & REG_SEEN_EVAL)
 	r->reganch |= ROPT_EVAL_SEEN;
+    if (RExC_seen & REG_SEEN_SANY)
+	r->reganch |= ROPT_SANY_SEEN;
     Newz(1002, r->startp, RExC_npar, I32);
     Newz(1002, r->endp, RExC_npar, I32);
     PL_regdata = r->data; /* for regprop() */
@@ -2638,6 +2640,7 @@ tryagain:
 	    break;
 	case 'C':
 	    ret = reg_node(pRExC_state, SANY);
+	    RExC_seen |= REG_SEEN_SANY;
 	    *flagp |= HASWIDTH|SIMPLE;
 	    nextchar(pRExC_state);
 	    break;
diff --git a/regcomp.h b/regcomp.h
index 066e31f01d..ee9be39f71 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -285,10 +285,11 @@ struct regnode_charclass_class {
 
 #define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode))
 
-#define REG_SEEN_ZERO_LEN	1
-#define REG_SEEN_LOOKBEHIND	2
-#define REG_SEEN_GPOS		4
-#define REG_SEEN_EVAL		8
+#define REG_SEEN_ZERO_LEN	 1
+#define REG_SEEN_LOOKBEHIND	 2
+#define REG_SEEN_GPOS		 4
+#define REG_SEEN_EVAL		 8
+#define REG_SEEN_SANY		16
 
 START_EXTERN_C
 
diff --git a/regexec.c b/regexec.c
index c70d1b1655..5d9e8ac8fa 100644
--- a/regexec.c
+++ b/regexec.c
@@ -398,7 +398,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
 	      DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 	      goto fail;
 	  }
-	  if (prog->check_offset_min == prog->check_offset_max) {
+	  if (prog->check_offset_min == prog->check_offset_max &&
+	      !(prog->reganch & ROPT_SANY_SEEN)) {
 	    /* Substring at constant offset from beg-of-str... */
 	    I32 slen;
 
@@ -474,6 +475,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
 	if (data)
 	    *data->scream_olds = s;
     }
+    else if (prog->reganch & ROPT_SANY_SEEN)
+	s = fbm_instr((U8*)(s + start_shift),
+		      (U8*)(strend - end_shift),
+		      check, PL_multiline ? FBMrf_MULTILINE : 0);
     else
 	s = fbm_instr(HOP3(s, start_shift, strend),
 		      HOP3(strend, -end_shift, strbeg),
@@ -1407,7 +1412,8 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
 
     minlen = prog->minlen;
     if (do_utf8) {
-      if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
+      if (!(prog->reganch & ROPT_SANY_SEEN))
+        if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
     }
     else {
       if (strend - startpos < minlen) goto phooey;
@@ -2075,13 +2081,6 @@ S_regmatch(pTHX_ regnode *prog)
 		sayNO;
 	    break;
 	case SANY:
-	    if (do_utf8) {
-		locinput += PL_utf8skip[nextchr];
-		if (locinput > PL_regeol)
-		    sayNO;
-		nextchr = UCHARAT(locinput);
-		break;
-	    }
 	    if (!nextchr && locinput >= PL_regeol)
 		sayNO;
 	    nextchr = UCHARAT(++locinput);
@@ -3563,15 +3562,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
 	}
 	break;
     case SANY:
-	if (do_utf8) {
-	    loceol = PL_regeol;
-	    while (hardcount < max && scan < loceol) {
-		scan += UTF8SKIP(scan);
-		hardcount++;
-	    }
-	} else {
-	    scan = loceol;
-	}
+	scan = loceol;
 	break;
     case EXACT:		/* length of string is 1 */
 	c = (U8)*STRING(p);
diff --git a/regexp.h b/regexp.h
index 3c71060a40..33ace4066e 100644
--- a/regexp.h
+++ b/regexp.h
@@ -54,6 +54,7 @@ typedef struct regexp {
 #define ROPT_CHECK_ALL		0x00100
 #define ROPT_LOOKBEHIND_SEEN	0x00200
 #define ROPT_EVAL_SEEN		0x00400
+#define ROPT_SANY_SEEN		0x00800
 
 /* 0xf800 of reganch is used by PMf_COMPILETIME */
 
diff --git a/t/op/pat.t b/t/op/pat.t
index 17df867fd9..d7eb9f8fd1 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4,7 +4,7 @@
 # the format supported by op/regexp.t.  If you want to add a test
 # that does fit that format, add it to op/re_tests, not here.
 
-print "1..231\n";
+print "1..240\n";
 
 BEGIN {
     chdir 't' if -d 't';
@@ -1129,3 +1129,56 @@ print "not " unless "A \x{263a} B z C" =~ /A . B (??{ "z" }) C/;
 print "ok $test\n";
 $test++;
 
+$_ = "a\x{100}b";
+if (/(.)(\C)(\C)(.)/) {
+  print "ok 232\n";
+  if ($1 eq "a") {
+    print "ok 233\n";
+  } else {
+    print "not ok 233\n";
+  }
+  if ($2 eq "\xC4") {
+    print "ok 234\n";
+  } else {
+    print "not ok 234\n";
+  }
+  if ($3 eq "\x80") {
+    print "ok 235\n";
+  } else {
+    print "not ok 235\n";
+  }
+  if ($4 eq "b") {
+    print "ok 236\n";
+  } else {
+    print "not ok 236\n";
+  }
+} else {
+  for (232..236) {
+    print "not ok $_\n";
+  }
+}
+$_ = "\x{100}";
+if (/(\C)/g) {
+  print "ok 237\n";
+  if ($1 eq "\xC4") {
+    print "ok 238\n";
+  } else {
+    print "not ok 238\n";
+  }
+} else {
+  for (237..238) {
+    print "not ok $_\n";
+  }
+}
+if (/(\C)/g) {
+  print "ok 239\n";
+  if ($1 eq "\x80") {
+    print "ok 240\n";
+  } else {
+    print "not ok 240\n";
+  }
+} else {
+  for (239..240) {
+    print "not ok $_\n";
+  }
+}
diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t
index 577e6b4e2b..60e6c6e102 100755
--- a/t/pragma/utf8.t
+++ b/t/pragma/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
     }
 }
 
-print "1..107\n";
+print "1..109\n";
 
 my $test = 1;
 
@@ -577,3 +577,16 @@ sub nok_bytes {
     $test++;					# 107
 }
 
+{
+    # bug id 20001230.002
+
+    use utf8;
+
+    print "not " unless "École" =~ /^\C\C(.)/ && $1 eq 'c';
+    print "ok $test\n";
+    $test++;					# 108
+
+    print "not " unless "École" =~ /^\C\C(c)/;
+    print "ok $test\n";
+    $test++;					# 109
+}
author	Jarkko Hietaniemi <jhi@iki.fi>	2001-02-18 22:11:20 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	2001-02-18 22:11:20 +0000
commit	3baa4c62cda542368be1e7e1f7af8bd8257c2ff4 (patch)
tree	36c534fdd1b4be5046f1c67dcdf0be0722f4dc97
parent	5da9da9e9f46681684e0c487fd55df8db6f9de67 (diff)
download	perl-3baa4c62cda542368be1e7e1f7af8bd8257c2ff4.tar.gz