regcomp.c: tell regexec more about multi-char folds

When a multi-char fold contains characters in the Latin1 range, that fact needs to be communicated to regexec, because such a fold can then match a non-UTF-8 target string.
author: Karl Williamson <public@khwilliamson.com> 2011-02-07 11:11:16 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-02-14 08:41:36 -0700
commit: 8e3094e5043730c7392a234c5ba58db1c535fa59 (patch)
tree: d2fe28c4db0a5e1885683dabb2c1c628e7a3b8ec /regcomp.c
parent: d18bf9dc360d179168ae3b6311b5d69480eef4f2 (diff)
download: perl-8e3094e5043730c7392a234c5ba58db1c535fa59.tar.gz
1 files changed, 24 insertions, 2 deletions
diff --git a/regcomp.c b/regcomp.c
index 50b8877e4d..49db5ea366 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -9790,8 +9790,31 @@ parseit:
 			 * these multicharacter foldings, to be later saved as
 			 * part of the additional "s" data. */
 			if (! RExC_in_lookbehind) {
-			    /* XXX Discard this fold if any are latin1 and LOC */
 			    SV *sv;
+			    U8* loc = foldbuf;
+			    U8* e = foldbuf + foldlen;
+
+			    /* If any character of this fold is in the Latin1
+			     * range, tell the regex engine that this node
+			     * can match a non-utf8 target string.  The
+			     * multi-char fold whose source is itself in the
+			     * Latin1 range (U+00DF) applies only when the
+			     * target string is utf8, or under unicode rules */
+			    if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
+				while (loc < e) {
+				    /* XXX Discard this fold if any are latin1
+				     * and LOC */
+				    if (UTF8_IS_INVARIANT(*loc)
+					|| UTF8_IS_DOWNGRADEABLE_START(*loc))
+				    {
+					ANYOF_FLAGS(ret)
+						|= ANYOF_NONBITMAP_NON_UTF8;
+					break;
+				    }
+				    loc += UTF8SKIP(loc);
+				}
+			    }
+			    ANYOF_FLAGS(ret) |= ANYOF_UTF8;
 
 			    if (!unicode_alternate) {
 				unicode_alternate = newAV();
@@ -9801,7 +9824,6 @@ parseit:
 
 			    /* This node is variable length */
 			    OP(ret) = ANYOFV;
-			    ANYOF_FLAGS(ret) |= ANYOF_UTF8;
 			}
 		    }
 		    else { /* Single character fold */
author	Karl Williamson <public@khwilliamson.com>	2011-02-07 11:11:16 -0700
committer	Karl Williamson <public@khwilliamson.com>	2011-02-14 08:41:36 -0700
commit	8e3094e5043730c7392a234c5ba58db1c535fa59 (patch)
tree	d2fe28c4db0a5e1885683dabb2c1c628e7a3b8ec /regcomp.c
parent	d18bf9dc360d179168ae3b6311b5d69480eef4f2 (diff)
download	perl-8e3094e5043730c7392a234c5ba58db1c535fa59.tar.gz