summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-02-19 14:27:18 -0700
committerKarl Williamson <public@khwilliamson.com>2011-02-19 17:18:05 -0700
commitde87c4fec898d44ec7ff4bdaba989015b8ec0089 (patch)
tree174d0d663a8f1145aaafa052b62f163a25fa4cfb
parent236d7867779a27dce9f2a12d974d7fba484394de (diff)
downloadperl-de87c4fec898d44ec7ff4bdaba989015b8ec0089.tar.gz
regexec.c: Fix utf8 e.g. [\s] under locale
Locale rules were handled improperly for utf8-encoded target strings matched against bracketed character classes (e.g. [\s]) under locale. This commit fixes that.
-rw-r--r--regexec.c9
-rw-r--r--t/re/charset.t2
2 files changed, 8 insertions, 3 deletions
diff --git a/regexec.c b/regexec.c
index 6bcfee0b3c..a1ab8f8ce0 100644
--- a/regexec.c
+++ b/regexec.c
@@ -6618,13 +6618,18 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
}
/* If the bitmap didn't (or couldn't) match, and something outside the
- * bitmap could match, try that */
+ * bitmap could match, try that. Locale nodes specify completely the
+ * behavior of code points in the bit map (otherwise, a utf8 target would
+ * cause them to be treated as Unicode and not locale), except XXX in
+ * the very unlikely event when this node is a synthetic start class, which
+ * could be a combination of locale and non-locale nodes */
if (!match) {
if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
match = TRUE; /* Everything above 255 matches */
}
else if ((flags & ANYOF_NONBITMAP_NON_UTF8
- || (utf8_target && flags & ANYOF_UTF8)))
+ || (utf8_target && flags & ANYOF_UTF8
+ && (c >=256 || ! (flags & ANYOF_LOCALE)))))
{
AV *av;
SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
diff --git a/t/re/charset.t b/t/re/charset.t
index f34cec80b2..e27f078a36 100644
--- a/t/re/charset.t
+++ b/t/re/charset.t
@@ -35,7 +35,7 @@ $testcases{'[:space:]'} = $testcases{'\s'};
$testcases{'[:word:]'} = $testcases{'\w'};
# For each possible character set...
-foreach my $charset ("a", "d", "u") {
+foreach my $charset ("a", "d", "l", "u") {
# And in utf8 or not
foreach my $upgrade ("", 'utf8::upgrade($a); ') {