diff options
-rw-r--r-- | regcomp.c | 3 |
-rw-r--r-- | regcomp.h | 9 |
-rw-r--r-- | regexec.c | 27 |
-rw-r--r-- | regexp.h | 1 |
-rwxr-xr-x | t/op/pat.t | 55 |
-rwxr-xr-x | t/pragma/utf8.t | 15 |
6 files changed, 86 insertions, 24 deletions
@@ -1930,6 +1930,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->reganch |= ROPT_LOOKBEHIND_SEEN; if (RExC_seen & REG_SEEN_EVAL) r->reganch |= ROPT_EVAL_SEEN; + if (RExC_seen & REG_SEEN_SANY) + r->reganch |= ROPT_SANY_SEEN; Newz(1002, r->startp, RExC_npar, I32); Newz(1002, r->endp, RExC_npar, I32); PL_regdata = r->data; /* for regprop() */ @@ -2638,6 +2640,7 @@ tryagain: break; case 'C': ret = reg_node(pRExC_state, SANY); + RExC_seen |= REG_SEEN_SANY; *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); break; @@ -285,10 +285,11 @@ struct regnode_charclass_class { #define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode)) -#define REG_SEEN_ZERO_LEN 1 -#define REG_SEEN_LOOKBEHIND 2 -#define REG_SEEN_GPOS 4 -#define REG_SEEN_EVAL 8 +#define REG_SEEN_ZERO_LEN 1 +#define REG_SEEN_LOOKBEHIND 2 +#define REG_SEEN_GPOS 4 +#define REG_SEEN_EVAL 8 +#define REG_SEEN_SANY 16 START_EXTERN_C @@ -398,7 +398,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); goto fail; } - if (prog->check_offset_min == prog->check_offset_max) { + if (prog->check_offset_min == prog->check_offset_max && + !(prog->reganch & ROPT_SANY_SEEN)) { /* Substring at constant offset from beg-of-str... */ I32 slen; @@ -474,6 +475,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, if (data) *data->scream_olds = s; } + else if (prog->reganch & ROPT_SANY_SEEN) + s = fbm_instr((U8*)(s + start_shift), + (U8*)(strend - end_shift), + check, PL_multiline ? 
FBMrf_MULTILINE : 0); else s = fbm_instr(HOP3(s, start_shift, strend), HOP3(strend, -end_shift, strbeg), @@ -1407,7 +1412,8 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * minlen = prog->minlen; if (do_utf8) { - if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey; + if (!(prog->reganch & ROPT_SANY_SEEN)) + if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey; } else { if (strend - startpos < minlen) goto phooey; @@ -2075,13 +2081,6 @@ S_regmatch(pTHX_ regnode *prog) sayNO; break; case SANY: - if (do_utf8) { - locinput += PL_utf8skip[nextchr]; - if (locinput > PL_regeol) - sayNO; - nextchr = UCHARAT(locinput); - break; - } if (!nextchr && locinput >= PL_regeol) sayNO; nextchr = UCHARAT(++locinput); @@ -3563,15 +3562,7 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } break; case SANY: - if (do_utf8) { - loceol = PL_regeol; - while (hardcount < max && scan < loceol) { - scan += UTF8SKIP(scan); - hardcount++; - } - } else { - scan = loceol; - } + scan = loceol; break; case EXACT: /* length of string is 1 */ c = (U8)*STRING(p); @@ -54,6 +54,7 @@ typedef struct regexp { #define ROPT_CHECK_ALL 0x00100 #define ROPT_LOOKBEHIND_SEEN 0x00200 #define ROPT_EVAL_SEEN 0x00400 +#define ROPT_SANY_SEEN 0x00800 /* 0xf800 of reganch is used by PMf_COMPILETIME */ diff --git a/t/op/pat.t b/t/op/pat.t index 17df867fd9..d7eb9f8fd1 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -4,7 +4,7 @@ # the format supported by op/regexp.t. If you want to add a test # that does fit that format, add it to op/re_tests, not here. -print "1..231\n"; +print "1..240\n"; BEGIN { chdir 't' if -d 't'; @@ -1129,3 +1129,56 @@ print "not " unless "A \x{263a} B z C" =~ /A . 
B (??{ "z" }) C/; print "ok $test\n"; $test++; +$_ = "a\x{100}b"; +if (/(.)(\C)(\C)(.)/) { + print "ok 232\n"; + if ($1 eq "a") { + print "ok 233\n"; + } else { + print "not ok 233\n"; + } + if ($2 eq "\xC4") { + print "ok 234\n"; + } else { + print "not ok 234\n"; + } + if ($3 eq "\x80") { + print "ok 235\n"; + } else { + print "not ok 235\n"; + } + if ($4 eq "b") { + print "ok 236\n"; + } else { + print "not ok 236\n"; + } +} else { + for (232..236) { + print "not ok $_\n"; + } +} +$_ = "\x{100}"; +if (/(\C)/g) { + print "ok 237\n"; + if ($1 eq "\xC4") { + print "ok 238\n"; + } else { + print "not ok 238\n"; + } +} else { + for (237..238) { + print "not ok $_\n"; + } +} +if (/(\C)/g) { + print "ok 239\n"; + if ($1 eq "\x80") { + print "ok 240\n"; + } else { + print "not ok 240\n"; + } +} else { + for (239..240) { + print "not ok $_\n"; + } +} diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t index 577e6b4e2b..60e6c6e102 100755 --- a/t/pragma/utf8.t +++ b/t/pragma/utf8.t @@ -10,7 +10,7 @@ BEGIN { } } -print "1..107\n"; +print "1..109\n"; my $test = 1; @@ -577,3 +577,16 @@ sub nok_bytes { $test++; # 107 } +{ + # bug id 20001230.002 + + use utf8; + + print "not " unless "École" =~ /^\C\C(.)/ && $1 eq 'c'; + print "ok $test\n"; + $test++; # 108 + + print "not " unless "École" =~ /^\C\C(c)/; + print "ok $test\n"; + $test++; # 109 +} |