re_op_compile: recalc code indexes on utf8 upgrade

As part of the compilation, we calculate the start and end positions of the text of each literal code block within the pattern string. The 'if pattern gets unexpected upgraded to UTF8, longjmp and restart the compilation' mechanism, means that these indices can become invalid, so if this happens, recalculate them. We do this by unrolling a call to Perl_bytes_to_utf8(), which updates the indices at the same time that it uopdtes the string. Note that some of the new TODO test are actually passing, but this is for the wrong reason. They're supposed to test for forced recompilation of non-literal code blocks, even if the pattern string hasn't changed (which I haven't implemented yet), but instead they're passing because the "don't recomile if strings match" check isn't UTF8-aware. I'll fix this (pre-existing) bug in the next commit.
author: David Mitchell <davem@iabyn.com> 2011-11-18 12:37:59 +0000
committer: David Mitchell <davem@iabyn.com> 2011-12-19 15:06:04 +0000
commit: 676c8b25b5f8bb63a6a4b62a86a25d4f4987779d (patch)
tree: 2b30f85a033b996459fcc22d975f0159b5076f80
parent: 7cdb2cba61a9adb1b14c53f046754c86f456fa10 (diff)
download: perl-676c8b25b5f8bb63a6a4b62a86a25d4f4987779d.tar.gz
2 files changed, 63 insertions, 21 deletions
diff --git a/regcomp.c b/regcomp.c
index 1c8849895a..29dfe3201f 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -4832,7 +4832,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         });
     }
     else {  /* longjumped back */
-        STRLEN len = plen;
+	U8 *src, *dst;
+	int n=0;
+	STRLEN s = 0, d = 0;
+	bool do_end = 0;
 
         /* If the cause for the longjmp was other than changing to utf8, pop
          * our own setjmp, and longjmp to the correct handler */
@@ -4854,21 +4857,42 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
 	    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
 
-	if (!pat_count) {
-	    assert(expr && expr->op_type == OP_LIST);
-	    sv_setpvn(pat, "", 0);
-	    SvUTF8_on(pat);
-	    S_get_pat_and_code_indices(aTHX_ pRExC_state, expr, pat);
-	    exp = SvPV(pat, plen);
-	    xend = exp + plen;
-	}
-	else {
-	    exp = (char*)Perl_bytes_to_utf8(aTHX_
-					    (U8*)SvPV_nomg(pat, plen),
-					    &len);
-	    xend = exp + len;
-	    SAVEFREEPV(exp);
+	/* upgrade pattern to UTF8, and if there are code blocks,
+	 * recalculate the indices.
+	 * This is essentially an unrolled Perl_bytes_to_utf8() */
+
+	src = (U8*)SvPV_nomg(pat, plen);
+	Newx(dst, plen * 2 + 1, U8);
+
+	while (s < plen) {
+	    const UV uv = NATIVE_TO_ASCII(src[s]);
+	    if (UNI_IS_INVARIANT(uv))
+		dst[d]   = (U8)UTF_TO_NATIVE(uv);
+	    else {
+		dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
+		dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
+	    }
+	    if (n < pRExC_state->num_code_blocks) {
+		if (!do_end && pRExC_state->code_blocks[n].start == s) {
+		    pRExC_state->code_blocks[n].start = d;
+		    assert(dst[d] == '(');
+		    do_end = 1;
+		}
+		else if (do_end && pRExC_state->code_blocks[n].end == s) {
+		    pRExC_state->code_blocks[n].end = d;
+		    assert(dst[d] == ')');
+		    do_end = 0;
+		    n++;
+		}
+	    }
+	    s++;
+	    d++;
 	}
+	dst[d] = '\0';
+	plen = d;
+	exp = (char*) dst;
+	xend = exp + plen;
+	SAVEFREEPV(exp);
 	RExC_orig_utf8 = RExC_utf8 = 1;
     }
 
diff --git a/t/re/pat_re_eval.t b/t/re/pat_re_eval.t
index 4eef9d166f..28328ae751 100644
--- a/t/re/pat_re_eval.t
+++ b/t/re/pat_re_eval.t
@@ -22,7 +22,7 @@ BEGIN {
 }
 
 
-plan tests => 220;  # Update this when adding/deleting tests.
+plan tests => 241;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -374,23 +374,41 @@ sub run_tests {
 	use re 'eval';
 	for my $x (qw(a b c)) {
 	    my $bc = ($x ne 'a');
+	    my $c80 = chr(0x80);
 
 	    # the most basic: literal code should be in same scope
 	    # as the parent
 
-	    ok("A$x" =~ /^A(??{$x})$/, "[$x] literal code");
+	    ok("A$x"       =~ /^A(??{$x})$/,       "[$x] literal code");
+	    ok("\x{100}$x" =~ /^\x{100}(??{$x})$/, "[$x] literal code UTF8");
 
 	    # the "don't recompile if pattern unchanged" mechanism
 	    # shouldn't apply to code blocks - recompile every time
 	    # to pick up new instances of variables
 
-	    my $code1 = 'B(??{$x})';
-	    tok($bc, "AB$x" =~ /^A$code1$/, "[$x] unvarying runtime code");
+	    my $code1  = 'B(??{$x})';
+	    my $code1u = $c80 . "\x{100}" . '(??{$x})';
+	    tok($bc, "AB$x" =~ /^A$code1$/, "[$x] unvarying runtime code AA");
+	    tok($bc, "A$c80\x{100}$x" =~ /^A$code1u$/,
+					"[$x] unvarying runtime code AU");
+	    tok($bc, "$c80\x{100}B$x" =~ /^$c80\x{100}$code1$/,
+					"[$x] unvarying runtime code UA");
+	    tok($bc, "$c80\x{101}$c80\x{100}$x" =~ /^$c80\x{101}$code1u$/,
+					"[$x] unvarying runtime code UU");
 
 	    # mixed literal and run-time code blocks
 
-	    my $code2 = 'B(??{$x})';
-	    tok($bc, "A$x-B$x" =~ /^A(??{$x})-$code2$/, "[$x] literal+runtime");
+	    my $code2  = 'B(??{$x})';
+	    my $code2u = $c80 . "\x{100}" . '(??{$x})';
+	    tok($bc, "A$x-B$x" =~ /^A(??{$x})-$code2$/,
+					"[$x] literal+runtime AA");
+	    tok($bc, "A$x-$c80\x{100}$x" =~ /^A(??{$x})-$code2u$/,
+					"[$x] literal+runtime AU");
+	    tok($bc, "$c80\x{100}$x-B$x" =~ /^$c80\x{100}(??{$x})-$code2$/,
+					"[$x] literal+runtime UA");
+	    tok($bc, "$c80\x{101}$x-$c80\x{100}$x"
+					=~ /^$c80\x{101}(??{$x})-$code2u$/,
+					"[$x] literal+runtime UU");
 
 	    # literal qr code only created once, naked
author	David Mitchell <davem@iabyn.com>	2011-11-18 12:37:59 +0000
committer	David Mitchell <davem@iabyn.com>	2011-12-19 15:06:04 +0000
commit	676c8b25b5f8bb63a6a4b62a86a25d4f4987779d (patch)
tree	2b30f85a033b996459fcc22d975f0159b5076f80
parent	7cdb2cba61a9adb1b14c53f046754c86f456fa10 (diff)
download	perl-676c8b25b5f8bb63a6a4b62a86a25d4f4987779d.tar.gz