regexec.c: Fix EXACT node handling in regrepeat()

Commit b40a2c17551b484a78122be98db5dc06bb4614d5 introduced a bug in handling EXACT nodes when the pattern is in UTF-8. This cleans that up.
author: Karl Williamson <public@khwilliamson.com> 2012-10-11 14:56:27 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-10-11 20:37:50 -0600
commit: 5e4a1da18f8fd71f2e5f0b98b0d41e3da257281a (patch)
tree: 8384a55d211e51dffe7b0c2da0ada570f826df52 /regexec.c
parent: def6ed225c88257d6c50813a8212783f0d267e0e (diff)
download: perl-5e4a1da18f8fd71f2e5f0b98b0d41e3da257281a.tar.gz
1 files changed, 35 insertions, 17 deletions
diff --git a/regexec.c b/regexec.c
index bad11f2f67..febc222fb4 100644
--- a/regexec.c
+++ b/regexec.c
@@ -6488,31 +6488,48 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case EXACT:
 	c = (U8)*STRING(p);
 
-	if (! utf8_target || UNI_IS_INVARIANT(c)) {
+        /* Can use a simple loop if the pattern char to match on is invariant
+         * under UTF-8, or neither target nor pattern is UTF-8.  Note that we
+         * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
+         * true iff it doesn't matter if the argument is in UTF-8 or not */
+        if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
 	    while (scan < loceol && UCHARAT(scan) == c) {
 		scan++;
 	    }
 	}
 	else if (UTF_PATTERN) {
-            STRLEN scan_char_len;
-
-	    loceol = PL_regeol;
+            if (utf8_target) {
+                STRLEN scan_char_len;
+                loceol = PL_regeol;
+
+                /* When both target and pattern are UTF-8, we have to do a
+                 * string EQ */
+                while (hardcount < max
+                       && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
+                       && scan_char_len <= STR_LEN(p)
+                       && memEQ(scan, STRING(p), scan_char_len))
+                {
+                    scan += scan_char_len;
+                    hardcount++;
+                }
+            }
+            else if (! UTF8_IS_ABOVE_LATIN1(c)) {
 
-	    while (hardcount < max
-                   && scan + (scan_char_len = UTF8SKIP(scan)) < loceol
-                   && scan_char_len <= STR_LEN(p)
-                   && memEQ(scan, STRING(p), scan_char_len))
-            {
-		scan += scan_char_len;
-		hardcount++;
-	    }
+                /* Target isn't utf8; convert the character in the UTF-8
+                 * pattern to non-UTF8, and do a simple loop */
+                c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
+                while (scan < loceol && UCHARAT(scan) == c) {
+                    scan++;
+                }
+            } /* else pattern char is above Latin1, can't possibly match the
+                 non-UTF-8 target */
         }
-	else {
+        else {
 
-	    /* Here, the string is utf8, the pattern isn't, but <c> is different
-	     * in utf8 than not, so can't compare them directly.  Outside the
-	     * loop, find the two utf8 bytes that represent c, and then
-	     * look for those in sequence in the utf8 string */
+            /* Here, the string must be utf8; pattern isn't, and <c> is
+             * different in utf8 than not, so can't compare them directly.
+             * Outside the loop, find the two utf8 bytes that represent c, and
+             * then look for those in sequence in the utf8 string */
 	    U8 high = UTF8_TWO_BYTE_HI(c);
 	    U8 low = UTF8_TWO_BYTE_LO(c);
 	    loceol = PL_regeol;
@@ -6527,6 +6544,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
 	    }
 	}
 	break;
+
     case EXACTFA:
 	utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
 	goto do_exactf;
author	Karl Williamson <public@khwilliamson.com>	2012-10-11 14:56:27 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-10-11 20:37:50 -0600
commit	5e4a1da18f8fd71f2e5f0b98b0d41e3da257281a (patch)
tree	8384a55d211e51dffe7b0c2da0ada570f826df52 /regexec.c
parent	def6ed225c88257d6c50813a8212783f0d267e0e (diff)
download	perl-5e4a1da18f8fd71f2e5f0b98b0d41e3da257281a.tar.gz