summary | refs | log | tree | commit | diff
path: root/regcomp.c
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2011-02-07 11:11:16 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-02-14 08:41:36 -0700
commit: 8e3094e5043730c7392a234c5ba58db1c535fa59 (patch)
tree: d2fe28c4db0a5e1885683dabb2c1c628e7a3b8ec /regcomp.c
parent: d18bf9dc360d179168ae3b6311b5d69480eef4f2 (diff)
download: perl-8e3094e5043730c7392a234c5ba58db1c535fa59.tar.gz
regcomp.c: tell regexec more about multi-char folds
A multi-char fold that matches in the Latin1 range needs to have that fact communicated to regexec.
Diffstat (limited to 'regcomp.c')
-rw-r--r-- regcomp.c | 26
1 files changed, 24 insertions, 2 deletions
diff --git a/regcomp.c b/regcomp.c
index 50b8877e4d..49db5ea366 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -9790,8 +9790,31 @@ parseit:
* these multicharacter foldings, to be later saved as
* part of the additional "s" data. */
if (! RExC_in_lookbehind) {
- /* XXX Discard this fold if any are latin1 and LOC */
SV *sv;
+ U8* loc = foldbuf;
+ U8* e = foldbuf + foldlen;
+
+ /* If any of the folded characters of this are in
+ * the Latin1 range, tell the regex engine that
+ * this can match a non-utf8 target string. The
+ * multi-byte fold whose source is in the
+ * Latin1 range (U+00DF) applies only when the
+ * target string is utf8, or under unicode rules */
+ if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
+ while (loc < e) {
+ /* XXX Discard this fold if any are latin1
+ * and LOC */
+ if (UTF8_IS_INVARIANT(*loc)
+ || UTF8_IS_DOWNGRADEABLE_START(*loc))
+ {
+ ANYOF_FLAGS(ret)
+ |= ANYOF_NONBITMAP_NON_UTF8;
+ break;
+ }
+ loc += UTF8SKIP(loc);
+ }
+ }
+ ANYOF_FLAGS(ret) |= ANYOF_UTF8;
if (!unicode_alternate) {
unicode_alternate = newAV();
@@ -9801,7 +9824,6 @@ parseit:
/* This node is variable length */
OP(ret) = ANYOFV;
- ANYOF_FLAGS(ret) |= ANYOF_UTF8;
}
}
else { /* Single character fold */