regexec.c: utf8 doesn't match non-utf8 self

Some regex patterns fail to match a character against itself when the target string is in utf8, the pattern isn't, and the character is variant under utf8 (that is, its utf8 representation differs from its single-byte representation). This means only Latin1-range characters in the pattern are affected. The solution is to test for this case and use the utf8 representation of the pattern character for the comparison.
author: Karl Williamson <public@khwilliamson.com> 2010-10-20 10:20:29 -0600
committer: Father Chrysostomos <sprout@cpan.org> 2010-10-21 05:56:30 -0700
commit: 634c83a2672252257e360eb1939b7ec762ef6308 (patch)
tree: cb4ff7fdfa156a475b0cd9b5b9ace912cbc84318 /regexec.c
parent: d53d27f973b3f4329ad8aa1e1a11554c8e19c3e3 (diff)
download: perl-634c83a2672252257e360eb1939b7ec762ef6308.tar.gz
1 files changed, 37 insertions, 3 deletions
diff --git a/regexec.c b/regexec.c
index 901703fd3f..f87c2fa93c 100644
--- a/regexec.c
+++ b/regexec.c
@@ -5750,10 +5750,44 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
     case CANY:
 	scan = loceol;
 	break;
-    case EXACT:		/* length of string is 1 */
+    case EXACT:
+	/* To get here, EXACT nodes must have *byte* length == 1.  That means
+	 * they match only characters in the string that can be expressed as a
+	 * single byte.  For non-utf8 strings, that means a simple match.  For
+	 * utf8 strings, the character matched must be an invariant, or
+	 * downgradable to a single byte.  The pattern's utf8ness is
+	 * irrelevant, as it must be a single byte, so either it isn't utf8, or
+	 * if it is it's an invariant */
+
 	c = (U8)*STRING(p);
-	while (scan < loceol && UCHARAT(scan) == c)
-	    scan++;
+	assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+	if ((! utf8_target) || UNI_IS_INVARIANT(c)) {
+
+	    /* Here, the string isn't utf8, or the character in the EXACT
+	     * node is the same in utf8 as not, so can just do equality.
+	     * Each matching char must be 1 byte long */
+	    while (scan < loceol && UCHARAT(scan) == c) {
+		scan++;
+	    }
+	}
+	else {
+
+	    /* Here, the string is utf8, and the char to match is different
+	     * in utf8 than not.  Fastest to find the two utf8 bytes that
+	     * represent c, and then look for those in sequence in the utf8
+	     * string */
+	    U8 high = UTF8_TWO_BYTE_HI(c);
+	    U8 low = UTF8_TWO_BYTE_LO(c);
+	    loceol = PL_regeol;
+	    while (hardcount < max
+		   && scan + 1 < loceol
+		   && UCHARAT(scan) == high
+		   && UCHARAT(scan + 1) == low)
+	    {
+		scan += 2;
+		hardcount++;
+	    }
+	}
 	break;
     case EXACTF:	/* length of string is 1 */
 	c = (U8)*STRING(p);
author	Karl Williamson <public@khwilliamson.com>	2010-10-20 10:20:29 -0600
committer	Father Chrysostomos <sprout@cpan.org>	2010-10-21 05:56:30 -0700
commit	634c83a2672252257e360eb1939b7ec762ef6308 (patch)
tree	cb4ff7fdfa156a475b0cd9b5b9ace912cbc84318 /regexec.c
parent	d53d27f973b3f4329ad8aa1e1a11554c8e19c3e3 (diff)
download	perl-634c83a2672252257e360eb1939b7ec762ef6308.tar.gz