summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2001-02-18 22:11:20 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2001-02-18 22:11:20 +0000
commit3baa4c62cda542368be1e7e1f7af8bd8257c2ff4 (patch)
tree36c534fdd1b4be5046f1c67dcdf0be0722f4dc97
parent5da9da9e9f46681684e0c487fd55df8db6f9de67 (diff)
downloadperl-3baa4c62cda542368be1e7e1f7af8bd8257c2ff4.tar.gz
Misapplied regex optimizations when \C is present.
Fixes 20001230.002. What still remains broken is that the submatches that have \C in them get their UTF8 flag on because their parent SV has it on. This will result in malformed UTF8 if a \C happened to match a non-ASCII byte. p4raw-id: //depot/perl@8836
-rw-r--r--regcomp.c3
-rw-r--r--regcomp.h9
-rw-r--r--regexec.c27
-rw-r--r--regexp.h1
-rwxr-xr-xt/op/pat.t55
-rwxr-xr-xt/pragma/utf8.t15
6 files changed, 86 insertions, 24 deletions
diff --git a/regcomp.c b/regcomp.c
index 547f756a30..69d114e9e9 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1930,6 +1930,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
r->reganch |= ROPT_LOOKBEHIND_SEEN;
if (RExC_seen & REG_SEEN_EVAL)
r->reganch |= ROPT_EVAL_SEEN;
+ if (RExC_seen & REG_SEEN_SANY)
+ r->reganch |= ROPT_SANY_SEEN;
Newz(1002, r->startp, RExC_npar, I32);
Newz(1002, r->endp, RExC_npar, I32);
PL_regdata = r->data; /* for regprop() */
@@ -2638,6 +2640,7 @@ tryagain:
break;
case 'C':
ret = reg_node(pRExC_state, SANY);
+ RExC_seen |= REG_SEEN_SANY;
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
break;
diff --git a/regcomp.h b/regcomp.h
index 066e31f01d..ee9be39f71 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -285,10 +285,11 @@ struct regnode_charclass_class {
#define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode))
-#define REG_SEEN_ZERO_LEN 1
-#define REG_SEEN_LOOKBEHIND 2
-#define REG_SEEN_GPOS 4
-#define REG_SEEN_EVAL 8
+#define REG_SEEN_ZERO_LEN 1
+#define REG_SEEN_LOOKBEHIND 2
+#define REG_SEEN_GPOS 4
+#define REG_SEEN_EVAL 8
+#define REG_SEEN_SANY 16
START_EXTERN_C
diff --git a/regexec.c b/regexec.c
index c70d1b1655..5d9e8ac8fa 100644
--- a/regexec.c
+++ b/regexec.c
@@ -398,7 +398,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
goto fail;
}
- if (prog->check_offset_min == prog->check_offset_max) {
+ if (prog->check_offset_min == prog->check_offset_max &&
+ !(prog->reganch & ROPT_SANY_SEEN)) {
/* Substring at constant offset from beg-of-str... */
I32 slen;
@@ -474,6 +475,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
if (data)
*data->scream_olds = s;
}
+ else if (prog->reganch & ROPT_SANY_SEEN)
+ s = fbm_instr((U8*)(s + start_shift),
+ (U8*)(strend - end_shift),
+ check, PL_multiline ? FBMrf_MULTILINE : 0);
else
s = fbm_instr(HOP3(s, start_shift, strend),
HOP3(strend, -end_shift, strbeg),
@@ -1407,7 +1412,8 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
minlen = prog->minlen;
if (do_utf8) {
- if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
+ if (!(prog->reganch & ROPT_SANY_SEEN))
+ if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
}
else {
if (strend - startpos < minlen) goto phooey;
@@ -2075,13 +2081,6 @@ S_regmatch(pTHX_ regnode *prog)
sayNO;
break;
case SANY:
- if (do_utf8) {
- locinput += PL_utf8skip[nextchr];
- if (locinput > PL_regeol)
- sayNO;
- nextchr = UCHARAT(locinput);
- break;
- }
if (!nextchr && locinput >= PL_regeol)
sayNO;
nextchr = UCHARAT(++locinput);
@@ -3563,15 +3562,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
}
break;
case SANY:
- if (do_utf8) {
- loceol = PL_regeol;
- while (hardcount < max && scan < loceol) {
- scan += UTF8SKIP(scan);
- hardcount++;
- }
- } else {
- scan = loceol;
- }
+ scan = loceol;
break;
case EXACT: /* length of string is 1 */
c = (U8)*STRING(p);
diff --git a/regexp.h b/regexp.h
index 3c71060a40..33ace4066e 100644
--- a/regexp.h
+++ b/regexp.h
@@ -54,6 +54,7 @@ typedef struct regexp {
#define ROPT_CHECK_ALL 0x00100
#define ROPT_LOOKBEHIND_SEEN 0x00200
#define ROPT_EVAL_SEEN 0x00400
+#define ROPT_SANY_SEEN 0x00800
/* 0xf800 of reganch is used by PMf_COMPILETIME */
diff --git a/t/op/pat.t b/t/op/pat.t
index 17df867fd9..d7eb9f8fd1 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4,7 +4,7 @@
# the format supported by op/regexp.t. If you want to add a test
# that does fit that format, add it to op/re_tests, not here.
-print "1..231\n";
+print "1..240\n";
BEGIN {
chdir 't' if -d 't';
@@ -1129,3 +1129,56 @@ print "not " unless "A \x{263a} B z C" =~ /A . B (??{ "z" }) C/;
print "ok $test\n";
$test++;
+$_ = "a\x{100}b";
+if (/(.)(\C)(\C)(.)/) {
+ print "ok 232\n";
+ if ($1 eq "a") {
+ print "ok 233\n";
+ } else {
+ print "not ok 233\n";
+ }
+ if ($2 eq "\xC4") {
+ print "ok 234\n";
+ } else {
+ print "not ok 234\n";
+ }
+ if ($3 eq "\x80") {
+ print "ok 235\n";
+ } else {
+ print "not ok 235\n";
+ }
+ if ($4 eq "b") {
+ print "ok 236\n";
+ } else {
+ print "not ok 236\n";
+ }
+} else {
+ for (232..236) {
+ print "not ok $_\n";
+ }
+}
+$_ = "\x{100}";
+if (/(\C)/g) {
+ print "ok 237\n";
+ if ($1 eq "\xC4") {
+ print "ok 238\n";
+ } else {
+ print "not ok 238\n";
+ }
+} else {
+ for (237..238) {
+ print "not ok $_\n";
+ }
+}
+if (/(\C)/g) {
+ print "ok 239\n";
+ if ($1 eq "\x80") {
+ print "ok 240\n";
+ } else {
+ print "not ok 240\n";
+ }
+} else {
+ for (239..240) {
+ print "not ok $_\n";
+ }
+}
diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t
index 577e6b4e2b..60e6c6e102 100755
--- a/t/pragma/utf8.t
+++ b/t/pragma/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
}
}
-print "1..107\n";
+print "1..109\n";
my $test = 1;
@@ -577,3 +577,16 @@ sub nok_bytes {
$test++; # 107
}
+{
+ # bug id 20001230.002
+
+ use utf8;
+
+ print "not " unless "École" =~ /^\C\C(.)/ && $1 eq 'c';
+ print "ok $test\n";
+ $test++; # 108
+
+ print "not " unless "École" =~ /^\C\C(c)/;
+ print "ok $test\n";
+ $test++; # 109
+}