diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2001-02-18 22:11:20 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2001-02-18 22:11:20 +0000 |
commit | 3baa4c62cda542368be1e7e1f7af8bd8257c2ff4 (patch) | |
tree | 36c534fdd1b4be5046f1c67dcdf0be0722f4dc97 | |
parent | 5da9da9e9f46681684e0c487fd55df8db6f9de67 (diff) | |
download | perl-3baa4c62cda542368be1e7e1f7af8bd8257c2ff4.tar.gz |
Misapplied regex optimizations when \C is present.
Fixes 20001230.002.
What still remains broken is that the submatches that
have \C in them get their UTF8 flag on because their
parent SV has it on. This will result in malformed
UTF8 if a \C happened to match a non-ASCII byte.
p4raw-id: //depot/perl@8836
-rw-r--r-- | regcomp.c | 3 | ||||
-rw-r--r-- | regcomp.h | 9 | ||||
-rw-r--r-- | regexec.c | 27 | ||||
-rw-r--r-- | regexp.h | 1 | ||||
-rwxr-xr-x | t/op/pat.t | 55 | ||||
-rwxr-xr-x | t/pragma/utf8.t | 15 |
6 files changed, 86 insertions, 24 deletions
@@ -1930,6 +1930,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->reganch |= ROPT_LOOKBEHIND_SEEN; if (RExC_seen & REG_SEEN_EVAL) r->reganch |= ROPT_EVAL_SEEN; + if (RExC_seen & REG_SEEN_SANY) + r->reganch |= ROPT_SANY_SEEN; Newz(1002, r->startp, RExC_npar, I32); Newz(1002, r->endp, RExC_npar, I32); PL_regdata = r->data; /* for regprop() */ @@ -2638,6 +2640,7 @@ tryagain: break; case 'C': ret = reg_node(pRExC_state, SANY); + RExC_seen |= REG_SEEN_SANY; *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); break; @@ -285,10 +285,11 @@ struct regnode_charclass_class { #define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode)) -#define REG_SEEN_ZERO_LEN 1 -#define REG_SEEN_LOOKBEHIND 2 -#define REG_SEEN_GPOS 4 -#define REG_SEEN_EVAL 8 +#define REG_SEEN_ZERO_LEN 1 +#define REG_SEEN_LOOKBEHIND 2 +#define REG_SEEN_GPOS 4 +#define REG_SEEN_EVAL 8 +#define REG_SEEN_SANY 16 START_EXTERN_C @@ -398,7 +398,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); goto fail; } - if (prog->check_offset_min == prog->check_offset_max) { + if (prog->check_offset_min == prog->check_offset_max && + !(prog->reganch & ROPT_SANY_SEEN)) { /* Substring at constant offset from beg-of-str... */ I32 slen; @@ -474,6 +475,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, if (data) *data->scream_olds = s; } + else if (prog->reganch & ROPT_SANY_SEEN) + s = fbm_instr((U8*)(s + start_shift), + (U8*)(strend - end_shift), + check, PL_multiline ? FBMrf_MULTILINE : 0); else s = fbm_instr(HOP3(s, start_shift, strend), HOP3(strend, -end_shift, strbeg), @@ -1407,7 +1412,8 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * minlen = prog->minlen; if (do_utf8) { - if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey; + if (!(prog->reganch & ROPT_SANY_SEEN)) + if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey; } else { if (strend - startpos < minlen) goto phooey; @@ -2075,13 +2081,6 @@ S_regmatch(pTHX_ regnode *prog) sayNO; break; case SANY: - if (do_utf8) { - locinput += PL_utf8skip[nextchr]; - if (locinput > PL_regeol) - sayNO; - nextchr = UCHARAT(locinput); - break; - } if (!nextchr && locinput >= PL_regeol) sayNO; nextchr = UCHARAT(++locinput); @@ -3563,15 +3562,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } break; case SANY: - if (do_utf8) { - loceol = PL_regeol; - while (hardcount < max && scan < loceol) { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - scan = loceol; - } + scan = loceol; break; case EXACT: /* length of string is 1 */ c = (U8)*STRING(p); @@ -54,6 +54,7 @@ typedef struct regexp { #define ROPT_CHECK_ALL 0x00100 #define ROPT_LOOKBEHIND_SEEN 0x00200 #define ROPT_EVAL_SEEN 0x00400 +#define ROPT_SANY_SEEN 0x00800 /* 0xf800 of reganch is used by PMf_COMPILETIME */ diff --git a/t/op/pat.t b/t/op/pat.t index 17df867fd9..d7eb9f8fd1 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4,7 +4,7 @@ # the format supported by op/regexp.t. If you want to add a test # that does fit that format, add it to op/re_tests, not here. -print "1..231\n"; +print "1..240\n"; BEGIN { chdir 't' if -d 't'; @@ -1129,3 +1129,56 @@ print "not " unless "A \x{263a} B z C" =~ /A . B (??{ "z" }) C/; print "ok $test\n"; $test++; +$_ = "a\x{100}b"; +if (/(.)(\C)(\C)(.)/) { + print "ok 232\n"; + if ($1 eq "a") { + print "ok 233\n"; + } else { + print "not ok 233\n"; + } + if ($2 eq "\xC4") { + print "ok 234\n"; + } else { + print "not ok 234\n"; + } + if ($3 eq "\x80") { + print "ok 235\n"; + } else { + print "not ok 235\n"; + } + if ($4 eq "b") { + print "ok 236\n"; + } else { + print "not ok 236\n"; + } +} else { + for (232..236) { + print "not ok $_\n"; + } +} +$_ = "\x{100}"; +if (/(\C)/g) { + print "ok 237\n"; + if ($1 eq "\xC4") { + print "ok 238\n"; + } else { + print "not ok 238\n"; + } +} else { + for (237..238) { + print "not ok $_\n"; + } +} +if (/(\C)/g) { + print "ok 239\n"; + if ($1 eq "\x80") { + print "ok 240\n"; + } else { + print "not ok 240\n"; + } +} else { + for (239..240) { + print "not ok $_\n"; + } +} diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t index 577e6b4e2b..60e6c6e102 100755 --- a/t/pragma/utf8.t +++ b/t/pragma/utf8.t @@ -10,7 +10,7 @@ BEGIN { } } -print "1..107\n"; +print "1..109\n"; my $test = 1; @@ -577,3 +577,16 @@ sub nok_bytes { $test++; # 107 } +{ + # bug id 20001230.002 + + use utf8; + + print "not " unless "École" =~ /^\C\C(.)/ && $1 eq 'c'; + print "ok $test\n"; + $test++; # 108 + + print "not " unless "École" =~ /^\C\C(c)/; + print "ok $test\n"; + $test++; # 109 +} |