diff options
author | David Mitchell <davem@iabyn.com> | 2011-11-18 12:37:59 +0000 |
---|---|---|
committer | David Mitchell <davem@iabyn.com> | 2012-06-13 13:25:53 +0100 |
commit | 2bd8e0da284e556e0ebae220a2fa52570cd96ca3 (patch) | |
tree | b64c881eebed9be5199023ce8aecb91215d76ca5 | |
parent | f5cf2abdb19e03f24bd768767fd145f15f076d40 (diff) | |
download | perl-2bd8e0da284e556e0ebae220a2fa52570cd96ca3.tar.gz |
re_op_compile: recalc code indexes on utf8 upgrade
As part of the compilation, we calculate the start and end positions
of the text of each literal code block within the pattern string.
The 'if pattern gets unexpected upgraded to UTF8, longjmp and restart
the compilation' mechanism, means that these indices can become invalid,
so if this happens, recalculate them. We do this by unrolling a call
to Perl_bytes_to_utf8(), which updates the indices at the same time that
it uopdtes the string.
Note that some of the new TODO test are actually passing, but this is for
the wrong reason. They're supposed to test for forced recompilation of
non-literal code blocks, even if the pattern string hasn't changed (which I
haven't implemented yet), but instead they're passing because the "don't
recomile if strings match" check isn't UTF8-aware. I'll fix this
(pre-existing) bug in the next commit.
-rw-r--r-- | regcomp.c | 54 | ||||
-rw-r--r-- | t/re/pat_re_eval.t | 30 |
2 files changed, 63 insertions, 21 deletions
@@ -5313,7 +5313,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, }); } else { /* longjumped back */ - STRLEN len = plen; + U8 *src, *dst; + int n=0; + STRLEN s = 0, d = 0; + bool do_end = 0; /* If the cause for the longjmp was other than changing to utf8, pop * our own setjmp, and longjmp to the correct handler */ @@ -5335,21 +5338,42 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); - if (!pat_count) { - assert(expr && expr->op_type == OP_LIST); - sv_setpvn(pat, "", 0); - SvUTF8_on(pat); - S_get_pat_and_code_indices(aTHX_ pRExC_state, expr, pat); - exp = SvPV(pat, plen); - xend = exp + plen; - } - else { - exp = (char*)Perl_bytes_to_utf8(aTHX_ - (U8*)SvPV_nomg(pat, plen), - &len); - xend = exp + len; - SAVEFREEPV(exp); + /* upgrade pattern to UTF8, and if there are code blocks, + * recalculate the indices. + * This is essentially an unrolled Perl_bytes_to_utf8() */ + + src = (U8*)SvPV_nomg(pat, plen); + Newx(dst, plen * 2 + 1, U8); + + while (s < plen) { + const UV uv = NATIVE_TO_ASCII(src[s]); + if (UNI_IS_INVARIANT(uv)) + dst[d] = (U8)UTF_TO_NATIVE(uv); + else { + dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv); + dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv); + } + if (n < pRExC_state->num_code_blocks) { + if (!do_end && pRExC_state->code_blocks[n].start == s) { + pRExC_state->code_blocks[n].start = d; + assert(dst[d] == '('); + do_end = 1; + } + else if (do_end && pRExC_state->code_blocks[n].end == s) { + pRExC_state->code_blocks[n].end = d; + assert(dst[d] == ')'); + do_end = 0; + n++; + } + } + s++; + d++; } + dst[d] = '\0'; + plen = d; + exp = (char*) dst; + xend = exp + plen; + SAVEFREEPV(exp); RExC_orig_utf8 = RExC_utf8 = 1; } diff --git a/t/re/pat_re_eval.t b/t/re/pat_re_eval.t index e9bb50ea09..843d91a83a 100644 --- a/t/re/pat_re_eval.t +++ b/t/re/pat_re_eval.t @@ -22,7 +22,7 @@ BEGIN { } -plan tests => 220; # Update this when adding/deleting tests. +plan tests => 241; # Update this when adding/deleting tests. run_tests() unless caller; @@ -374,23 +374,41 @@ sub run_tests { use re 'eval'; for my $x (qw(a b c)) { my $bc = ($x ne 'a'); + my $c80 = chr(0x80); # the most basic: literal code should be in same scope # as the parent - ok("A$x" =~ /^A(??{$x})$/, "[$x] literal code"); + ok("A$x" =~ /^A(??{$x})$/, "[$x] literal code"); + ok("\x{100}$x" =~ /^\x{100}(??{$x})$/, "[$x] literal code UTF8"); # the "don't recompile if pattern unchanged" mechanism # shouldn't apply to code blocks - recompile every time # to pick up new instances of variables - my $code1 = 'B(??{$x})'; - tok($bc, "AB$x" =~ /^A$code1$/, "[$x] unvarying runtime code"); + my $code1 = 'B(??{$x})'; + my $code1u = $c80 . "\x{100}" . '(??{$x})'; + tok($bc, "AB$x" =~ /^A$code1$/, "[$x] unvarying runtime code AA"); + tok($bc, "A$c80\x{100}$x" =~ /^A$code1u$/, + "[$x] unvarying runtime code AU"); + tok($bc, "$c80\x{100}B$x" =~ /^$c80\x{100}$code1$/, + "[$x] unvarying runtime code UA"); + tok($bc, "$c80\x{101}$c80\x{100}$x" =~ /^$c80\x{101}$code1u$/, + "[$x] unvarying runtime code UU"); # mixed literal and run-time code blocks - my $code2 = 'B(??{$x})'; - tok($bc, "A$x-B$x" =~ /^A(??{$x})-$code2$/, "[$x] literal+runtime"); + my $code2 = 'B(??{$x})'; + my $code2u = $c80 . "\x{100}" . '(??{$x})'; + tok($bc, "A$x-B$x" =~ /^A(??{$x})-$code2$/, + "[$x] literal+runtime AA"); + tok($bc, "A$x-$c80\x{100}$x" =~ /^A(??{$x})-$code2u$/, + "[$x] literal+runtime AU"); + tok($bc, "$c80\x{100}$x-B$x" =~ /^$c80\x{100}(??{$x})-$code2$/, + "[$x] literal+runtime UA"); + tok($bc, "$c80\x{101}$x-$c80\x{100}$x" + =~ /^$c80\x{101}(??{$x})-$code2u$/, + "[$x] literal+runtime UU"); # literal qr code only created once, naked |