diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-02-07 11:11:16 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-02-14 08:41:36 -0700 |
commit | 8e3094e5043730c7392a234c5ba58db1c535fa59 (patch) | |
tree | d2fe28c4db0a5e1885683dabb2c1c628e7a3b8ec /regcomp.c | |
parent | d18bf9dc360d179168ae3b6311b5d69480eef4f2 (diff) | |
download | perl-8e3094e5043730c7392a234c5ba58db1c535fa59.tar.gz |
regcomp.c: tell regexec more about multi-char folds
A multi-char fold that matches in the Latin1 range needs to have that
fact communicated to regexec.
Diffstat (limited to 'regcomp.c')
-rw-r--r-- | regcomp.c | 26 |
1 file changed, 24 insertions, 2 deletions
@@ -9790,8 +9790,31 @@ parseit:
                  * these multicharacter foldings, to be later saved as
                  * part of the additional "s" data. */
                 if (! RExC_in_lookbehind) {
-                    /* XXX Discard this fold if any are latin1 and LOC */
                     SV *sv;
+                    U8* loc = foldbuf;
+                    U8* e = foldbuf + foldlen;
+
+                    /* If any of the folded characters of this are in
+                     * the Latin1 range, tell the regex engine that
+                     * this can match a non-utf8 target string. The
+                     * multi-byte fold whose source is in the
+                     * Latin1 range (U+00DF) applies only when the
+                     * target string is utf8, or under unicode rules */
+                    if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
+                        while (loc < e) {
+                            /* XXX Discard this fold if any are latin1
+                             * and LOC */
+                            if (UTF8_IS_INVARIANT(*loc)
+                                || UTF8_IS_DOWNGRADEABLE_START(*loc))
+                            {
+                                ANYOF_FLAGS(ret)
+                                    |= ANYOF_NONBITMAP_NON_UTF8;
+                                break;
+                            }
+                            loc += UTF8SKIP(loc);
+                        }
+                    }
+                    ANYOF_FLAGS(ret) |= ANYOF_UTF8;
 
                     if (!unicode_alternate) {
                         unicode_alternate = newAV();
@@ -9801,7 +9824,6 @@ parseit:
 
                     /* This node is variable length */
                     OP(ret) = ANYOFV;
-                    ANYOF_FLAGS(ret) |= ANYOF_UTF8;
                 }
             }
             else { /* Single character fold */