diff options
author | David Mitchell <davem@iabyn.com> | 2011-11-18 14:48:49 +0000 |
---|---|---|
committer | David Mitchell <davem@iabyn.com> | 2012-06-13 13:32:45 +0100 |
commit | 6ae44cd27f5f18bfa72c8d5ea352083cea44b3bc (patch) | |
tree | 54f7f19314f30c3483fb313cff81934902643ac3 | |
parent | 2bd8e0da284e556e0ebae220a2fa52570cd96ca3 (diff) | |
download | perl-6ae44cd27f5f18bfa72c8d5ea352083cea44b3bc.tar.gz |
"don't recompile pattern" check: account for UTF8
When recompiling a pattern (e.g. for $x (x,y) { /$x/ }),
it tests whether the new pattern string matches the old one, and if so
skips recompiling it. However, it doesn't take account of the UTF8ness of
the old and new patterns, so can falsely skip recompiling. Now fixed.
Also, there is a feature in re_op_compile() that may abort a pattern
compilation, upgrade the pattern to UTF8, then begin the compile again.
I've added a second check for whether the pattern matches the old pattern,
against the upgraded string. I can't see a way to test this, since its
just an optimisation. Arguably I could add a BEGIN in an embedded code
block to see if it gets compiled twice, but soon I'm going to make it so
that embedded code blocks always get recompiled anyway.
-rw-r--r-- | embed.fnc | 3 | ||||
-rw-r--r-- | proto.h | 2 | ||||
-rw-r--r-- | regcomp.c | 23 | ||||
-rw-r--r-- | t/re/pat.t | 13 |
4 files changed, 35 insertions, 6 deletions
@@ -1061,7 +1061,8 @@ Ap |REGEXP*|pregcomp |NN SV * const pattern|const U32 flags p |REGEXP*|re_op_compile |NULLOK SV ** const patternp \ |int pat_count|NULLOK OP *expr \ |NULLOK const regexp_engine* eng \ - |NULLOK REGEXP *old_re|NULLOK int *is_bare_re \ + |NULLOK REGEXP *VOL old_re \ + |NULLOK int *is_bare_re \ |U32 flags Ap |REGEXP*|re_compile |NN SV * const pattern|U32 flags Ap |char* |re_intuit_start|NN REGEXP * const rx|NULLOK SV* sv|NN char* strpos \ @@ -3150,7 +3150,7 @@ PERL_CALLCONV SV* Perl_re_intuit_string(pTHX_ REGEXP *const r) #define PERL_ARGS_ASSERT_RE_INTUIT_STRING \ assert(r) -PERL_CALLCONV REGEXP* Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, OP *expr, const regexp_engine* eng, REGEXP *old_re, int *is_bare_re, U32 flags); +PERL_CALLCONV REGEXP* Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, OP *expr, const regexp_engine* eng, REGEXP *VOL old_re, int *is_bare_re, U32 flags); PERL_CALLCONV Malloc_t Perl_realloc(Malloc_t where, MEM_SIZE nbytes) __attribute__malloc__ __attribute__warn_unused_result__; @@ -5044,7 +5044,7 @@ S_get_pat_and_code_indices(pTHX_ RExC_state_t *pRExC_state, OP* expr, SV* pat) { REGEXP * Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, - OP *expr, const regexp_engine* eng, REGEXP *old_re, + OP *expr, const regexp_engine* eng, REGEXP *VOL old_re, int *is_bare_re, U32 orig_pm_flags) { dVAR; @@ -5282,8 +5282,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, return CALLREGCOMP_ENG(eng, pat, orig_pm_flags); } - if (old_re && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen - && memEQ(RX_PRECOMP(old_re), exp, plen)) + if ( old_re + && !!RX_UTF8(old_re) == !!SvUTF8(pat) + && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen + && memEQ(RX_PRECOMP(old_re), exp, plen)) { ReREFCNT_inc(old_re); return old_re; @@ -5375,6 +5377,21 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, xend = exp + plen; SAVEFREEPV(exp); RExC_orig_utf8 = RExC_utf8 = 1; + + /* we've changed the string; check again whether it matches + * the old pattern, to avoid recompilation */ + if ( old_re + && RX_UTF8(old_re) + && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen + && memEQ(RX_PRECOMP(old_re), exp, plen)) + { + ReREFCNT_inc(old_re); + if (used_setjump) { + JMPENV_POP; + } + return old_re; + } + } #ifdef TRIE_STUDY_OPT diff --git a/t/re/pat.t b/t/re/pat.t index e03f5ca9af..fa9a5467e2 100644 --- a/t/re/pat.t +++ b/t/re/pat.t @@ -19,7 +19,7 @@ BEGIN { require './test.pl'; } -plan tests => 436; # Update this when adding/deleting tests. +plan tests => 438; # Update this when adding/deleting tests. run_tests() unless caller; @@ -1216,6 +1216,17 @@ EOP use re '/aa'; unlike 'k', qr/(?i:\N{KELVIN SIGN})/, "(?i: shouldn't lose the passed in /aa"; } + + { + # the test for whether the pattern should be re-compiled should + # consider the UTF8ness of the previous and current pattern + # string, as well as the physical bytes of the pattern string + + for my $s ("\xc4\x80", "\x{100}") { + ok($s =~ /^$s$/, "re-compile check is UTF8-aware"); + } + } + } # End of sub run_tests 1; |