From 634c83a2672252257e360eb1939b7ec762ef6308 Mon Sep 17 00:00:00 2001
From: Karl Williamson
Date: Wed, 20 Oct 2010 10:20:29 -0600
Subject: regexec.c: utf8 doesn't match non-utf8 self

Some regex patterns don't match a character with itself when the target
string is in utf8 and the pattern isn't, and the character is variant
under utf8. (This means only Latin1-range characters in the pattern are
affected.) The solution is to test for this case and use the utf8
representation of the pattern character for the comparison.
---
 t/re/pat.t | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)
 (limited to 't')

diff --git a/t/re/pat.t b/t/re/pat.t
index c007880b8c..46681040e5 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -23,7 +23,7 @@ BEGIN {
 }
 
 
-plan tests => 398; # Update this when adding/deleting tests.
+plan tests => 402; # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1072,6 +1072,18 @@ sub run_tests {
     }
 
 
+    { # Some constructs with Latin1 characters cause a utf8 string not to
+      # match itself in non-utf8
+        my $c = "\xc0";
+        my $pattern = my $utf8_pattern = qr/((\xc0)+,?)/;
+        utf8::upgrade($utf8_pattern);
+        ok $c =~ $pattern, "\\xc0 =~ $pattern; Neither pattern nor target utf8";
+        ok $c =~ $utf8_pattern, "\\xc0 =~ $pattern; pattern utf8, target not";
+        utf8::upgrade($c);
+        ok $c =~ $pattern, "\\xc0 =~ $pattern; target utf8, pattern not";
+        ok $c =~ $utf8_pattern, "\\xc0 =~ $pattern; Both target and pattern utf8";
+    }
+
     {
         # Test that a regex followed by an operator and/or a statement modifier work
         # These tests use string-eval so that it reports a clean error when it fails
-- 
cgit v1.2.1