re_op_compile: recalc code indexes on utf8 upgrade

As part of the compilation, we calculate the start and end positions of the text of each literal code block within the pattern string. The 'if pattern gets unexpected upgraded to UTF8, longjmp and restart the compilation' mechanism, means that these indices can become invalid, so if this happens, recalculate them. We do this by unrolling a call to Perl_bytes_to_utf8(), which updates the indices at the same time that it uopdtes the string. Note that some of the new TODO test are actually passing, but this is for the wrong reason. They're supposed to test for forced recompilation of non-literal code blocks, even if the pattern string hasn't changed (which I haven't implemented yet), but instead they're passing because the "don't recomile if strings match" check isn't UTF8-aware. I'll fix this (pre-existing) bug in the next commit.
author: David Mitchell <davem@iabyn.com> 2011-11-18 12:37:59 +0000
committer: David Mitchell <davem@iabyn.com> 2012-06-13 13:25:53 +0100
commit: 2bd8e0da284e556e0ebae220a2fa52570cd96ca3 (patch)
tree: b64c881eebed9be5199023ce8aecb91215d76ca5
parent: f5cf2abdb19e03f24bd768767fd145f15f076d40 (diff)
download: perl-2bd8e0da284e556e0ebae220a2fa52570cd96ca3.tar.gz
2 files changed, 63 insertions, 21 deletions
diff --git a/regcomp.c b/regcomp.c
index f369f768d6..b03d959285 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -5313,7 +5313,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         });
     }
     else {  /* longjumped back */
-        STRLEN len = plen;
+	U8 *src, *dst;
+	int n=0;
+	STRLEN s = 0, d = 0;
+	bool do_end = 0;
 
         /* If the cause for the longjmp was other than changing to utf8, pop
          * our own setjmp, and longjmp to the correct handler */
@@ -5335,21 +5338,42 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
 	    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
 
-	if (!pat_count) {
-	    assert(expr && expr->op_type == OP_LIST);
-	    sv_setpvn(pat, "", 0);
-	    SvUTF8_on(pat);
-	    S_get_pat_and_code_indices(aTHX_ pRExC_state, expr, pat);
-	    exp = SvPV(pat, plen);
-	    xend = exp + plen;
-	}
-	else {
-	    exp = (char*)Perl_bytes_to_utf8(aTHX_
-					    (U8*)SvPV_nomg(pat, plen),
-					    &len);
-	    xend = exp + len;
-	    SAVEFREEPV(exp);
+	/* upgrade pattern to UTF8, and if there are code blocks,
+	 * recalculate the indices.
+	 * This is essentially an unrolled Perl_bytes_to_utf8() */
+
+	src = (U8*)SvPV_nomg(pat, plen);
+	Newx(dst, plen * 2 + 1, U8);
+
+	while (s < plen) {
+	    const UV uv = NATIVE_TO_ASCII(src[s]);
+	    if (UNI_IS_INVARIANT(uv))
+		dst[d]   = (U8)UTF_TO_NATIVE(uv);
+	    else {
+		dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
+		dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
+	    }
+	    if (n < pRExC_state->num_code_blocks) {
+		if (!do_end && pRExC_state->code_blocks[n].start == s) {
+		    pRExC_state->code_blocks[n].start = d;
+		    assert(dst[d] == '(');
+		    do_end = 1;
+		}
+		else if (do_end && pRExC_state->code_blocks[n].end == s) {
+		    pRExC_state->code_blocks[n].end = d;
+		    assert(dst[d] == ')');
+		    do_end = 0;
+		    n++;
+		}
+	    }
+	    s++;
+	    d++;
 	}
+	dst[d] = '\0';
+	plen = d;
+	exp = (char*) dst;
+	xend = exp + plen;
+	SAVEFREEPV(exp);
 	RExC_orig_utf8 = RExC_utf8 = 1;
     }
 
diff --git a/t/re/pat_re_eval.t b/t/re/pat_re_eval.t
index e9bb50ea09..843d91a83a 100644
--- a/t/re/pat_re_eval.t
+++ b/t/re/pat_re_eval.t
@@ -22,7 +22,7 @@ BEGIN {
 }
 
 
-plan tests => 220;  # Update this when adding/deleting tests.
+plan tests => 241;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -374,23 +374,41 @@ sub run_tests {
 	use re 'eval';
 	for my $x (qw(a b c)) {
 	    my $bc = ($x ne 'a');
+	    my $c80 = chr(0x80);
 
 	    # the most basic: literal code should be in same scope
 	    # as the parent
 
-	    ok("A$x" =~ /^A(??{$x})$/, "[$x] literal code");
+	    ok("A$x"       =~ /^A(??{$x})$/,       "[$x] literal code");
+	    ok("\x{100}$x" =~ /^\x{100}(??{$x})$/, "[$x] literal code UTF8");
 
 	    # the "don't recompile if pattern unchanged" mechanism
 	    # shouldn't apply to code blocks - recompile every time
 	    # to pick up new instances of variables
 
-	    my $code1 = 'B(??{$x})';
-	    tok($bc, "AB$x" =~ /^A$code1$/, "[$x] unvarying runtime code");
+	    my $code1  = 'B(??{$x})';
+	    my $code1u = $c80 . "\x{100}" . '(??{$x})';
+	    tok($bc, "AB$x" =~ /^A$code1$/, "[$x] unvarying runtime code AA");
+	    tok($bc, "A$c80\x{100}$x" =~ /^A$code1u$/,
+					"[$x] unvarying runtime code AU");
+	    tok($bc, "$c80\x{100}B$x" =~ /^$c80\x{100}$code1$/,
+					"[$x] unvarying runtime code UA");
+	    tok($bc, "$c80\x{101}$c80\x{100}$x" =~ /^$c80\x{101}$code1u$/,
+					"[$x] unvarying runtime code UU");
 
 	    # mixed literal and run-time code blocks
 
-	    my $code2 = 'B(??{$x})';
-	    tok($bc, "A$x-B$x" =~ /^A(??{$x})-$code2$/, "[$x] literal+runtime");
+	    my $code2  = 'B(??{$x})';
+	    my $code2u = $c80 . "\x{100}" . '(??{$x})';
+	    tok($bc, "A$x-B$x" =~ /^A(??{$x})-$code2$/,
+					"[$x] literal+runtime AA");
+	    tok($bc, "A$x-$c80\x{100}$x" =~ /^A(??{$x})-$code2u$/,
+					"[$x] literal+runtime AU");
+	    tok($bc, "$c80\x{100}$x-B$x" =~ /^$c80\x{100}(??{$x})-$code2$/,
+					"[$x] literal+runtime UA");
+	    tok($bc, "$c80\x{101}$x-$c80\x{100}$x"
+					=~ /^$c80\x{101}(??{$x})-$code2u$/,
+					"[$x] literal+runtime UU");
 
 	    # literal qr code only created once, naked
author	David Mitchell <davem@iabyn.com>	2011-11-18 12:37:59 +0000
committer	David Mitchell <davem@iabyn.com>	2012-06-13 13:25:53 +0100
commit	2bd8e0da284e556e0ebae220a2fa52570cd96ca3 (patch)
tree	b64c881eebed9be5199023ce8aecb91215d76ca5
parent	f5cf2abdb19e03f24bd768767fd145f15f076d40 (diff)
download	perl-2bd8e0da284e556e0ebae220a2fa52570cd96ca3.tar.gz