summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Mitchell <davem@iabyn.com>2011-11-18 14:48:49 +0000
committerDavid Mitchell <davem@iabyn.com>2012-06-13 13:32:45 +0100
commit6ae44cd27f5f18bfa72c8d5ea352083cea44b3bc (patch)
tree54f7f19314f30c3483fb313cff81934902643ac3
parent2bd8e0da284e556e0ebae220a2fa52570cd96ca3 (diff)
downloadperl-6ae44cd27f5f18bfa72c8d5ea352083cea44b3bc.tar.gz
"don't recompile pattern" check: account for UTF8
When recompiling a pattern (e.g. for $x (x,y) { /$x/ }), it tests whether the new pattern string matches the old one, and if so skips recompiling it. However, it doesn't take account of the UTF8ness of the old and new patterns, so can falsely skip recompiling. Now fixed. Also, there is a feature in re_op_compile() that may abort a pattern compilation, upgrade the pattern to UTF8, then begin the compile again. I've added a second check for whether the pattern matches the old pattern, against the upgraded string. I can't see a way to test this, since its just an optimisation. Arguably I could add a BEGIN in an embedded code block to see if it gets compiled twice, but soon I'm going to make it so that embedded code blocks always get recompiled anyway.
-rw-r--r--embed.fnc3
-rw-r--r--proto.h2
-rw-r--r--regcomp.c23
-rw-r--r--t/re/pat.t13
4 files changed, 35 insertions, 6 deletions
diff --git a/embed.fnc b/embed.fnc
index 0fe55c6a5c..3446a1e434 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1061,7 +1061,8 @@ Ap |REGEXP*|pregcomp |NN SV * const pattern|const U32 flags
p |REGEXP*|re_op_compile |NULLOK SV ** const patternp \
|int pat_count|NULLOK OP *expr \
|NULLOK const regexp_engine* eng \
- |NULLOK REGEXP *old_re|NULLOK int *is_bare_re \
+ |NULLOK REGEXP *VOL old_re \
+ |NULLOK int *is_bare_re \
|U32 flags
Ap |REGEXP*|re_compile |NN SV * const pattern|U32 flags
Ap |char* |re_intuit_start|NN REGEXP * const rx|NULLOK SV* sv|NN char* strpos \
diff --git a/proto.h b/proto.h
index ccab428804..3db372433d 100644
--- a/proto.h
+++ b/proto.h
@@ -3150,7 +3150,7 @@ PERL_CALLCONV SV* Perl_re_intuit_string(pTHX_ REGEXP *const r)
#define PERL_ARGS_ASSERT_RE_INTUIT_STRING \
assert(r)
-PERL_CALLCONV REGEXP* Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, OP *expr, const regexp_engine* eng, REGEXP *old_re, int *is_bare_re, U32 flags);
+PERL_CALLCONV REGEXP* Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, OP *expr, const regexp_engine* eng, REGEXP *VOL old_re, int *is_bare_re, U32 flags);
PERL_CALLCONV Malloc_t Perl_realloc(Malloc_t where, MEM_SIZE nbytes)
__attribute__malloc__
__attribute__warn_unused_result__;
diff --git a/regcomp.c b/regcomp.c
index b03d959285..77a9401a23 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -5044,7 +5044,7 @@ S_get_pat_and_code_indices(pTHX_ RExC_state_t *pRExC_state, OP* expr, SV* pat) {
REGEXP *
Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
- OP *expr, const regexp_engine* eng, REGEXP *old_re,
+ OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
int *is_bare_re, U32 orig_pm_flags)
{
dVAR;
@@ -5282,8 +5282,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
return CALLREGCOMP_ENG(eng, pat, orig_pm_flags);
}
- if (old_re && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen
- && memEQ(RX_PRECOMP(old_re), exp, plen))
+ if ( old_re
+ && !!RX_UTF8(old_re) == !!SvUTF8(pat)
+ && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen
+ && memEQ(RX_PRECOMP(old_re), exp, plen))
{
ReREFCNT_inc(old_re);
return old_re;
@@ -5375,6 +5377,21 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
xend = exp + plen;
SAVEFREEPV(exp);
RExC_orig_utf8 = RExC_utf8 = 1;
+
+ /* we've changed the string; check again whether it matches
+ * the old pattern, to avoid recompilation */
+ if ( old_re
+ && RX_UTF8(old_re)
+ && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen
+ && memEQ(RX_PRECOMP(old_re), exp, plen))
+ {
+ ReREFCNT_inc(old_re);
+ if (used_setjump) {
+ JMPENV_POP;
+ }
+ return old_re;
+ }
+
}
#ifdef TRIE_STUDY_OPT
diff --git a/t/re/pat.t b/t/re/pat.t
index e03f5ca9af..fa9a5467e2 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -19,7 +19,7 @@ BEGIN {
require './test.pl';
}
-plan tests => 436; # Update this when adding/deleting tests.
+plan tests => 438; # Update this when adding/deleting tests.
run_tests() unless caller;
@@ -1216,6 +1216,17 @@ EOP
use re '/aa';
unlike 'k', qr/(?i:\N{KELVIN SIGN})/, "(?i: shouldn't lose the passed in /aa";
}
+
+ {
+ # the test for whether the pattern should be re-compiled should
+ # consider the UTF8ness of the previous and current pattern
+ # string, as well as the physical bytes of the pattern string
+
+ for my $s ("\xc4\x80", "\x{100}") {
+ ok($s =~ /^$s$/, "re-compile check is UTF8-aware");
+ }
+ }
+
} # End of sub run_tests
1;