diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-01-15 13:42:58 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-01-16 08:18:54 -0700 |
commit | 11454c594f22abc5945e69a46fc965363dbf326e (patch) | |
tree | 8e51baaf062d5e28410294b9cac63f791c63ced2 | |
parent | f424400810b6af341e96230836690da51c37b812 (diff) | |
download | perl-11454c594f22abc5945e69a46fc965363dbf326e.tar.gz |
Fix \xa0 matching both [\s] and [\S], et al.
This bug stemmed from Latin1 characters not matching any (non-complemented)
character class in /d semantics when the target string is not utf8, but having
Unicode semantics when it is. The solution here is to add a special flag.
There were several tests that relied on the broken behavior, specifically they
tested that \xff isn't a printable word character even in utf8. I changed the
deparse test to instead use a non-printable code point, and I changed the ones
in re_tests to be TODOs, and will change them back using /a when that is
shortly added.
-rw-r--r-- | dist/B-Deparse/t/deparse.t | 2 | ||||
-rw-r--r-- | regcomp.c | 20 | ||||
-rw-r--r-- | regcomp.h | 4 | ||||
-rw-r--r-- | regexec.c | 6 | ||||
-rw-r--r-- | t/re/re_tests | 8 |
5 files changed, 30 insertions, 10 deletions
diff --git a/dist/B-Deparse/t/deparse.t b/dist/B-Deparse/t/deparse.t index 89a449383f..50baa90ee5 100644 --- a/dist/B-Deparse/t/deparse.t +++ b/dist/B-Deparse/t/deparse.t @@ -247,7 +247,7 @@ my $foo; $_ .= <ARGV> . <$foo>; #### # \x{} -my $foo = "Ab\x{100}\200\x{200}\377Cd\000Ef\x{1000}\cA\x{2000}\cZ"; +my $foo = "Ab\x{100}\200\x{200}\237Cd\000Ef\x{1000}\cA\x{2000}\cZ"; #### # s///e s/x/'y';/e; @@ -707,10 +707,9 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c ANYOF_CLASS_ZERO(cl); ANYOF_BITMAP_SETALL(cl); - cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; + cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL; if (LOC) cl->flags |= ANYOF_LOCALE; - cl->flags |= ANYOF_LOC_NONBITMAP_FOLD; } /* Can match anything (initialization) */ @@ -783,6 +782,8 @@ S_cl_and(struct regnode_charclass_class *cl, if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)) cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD; + if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL)) + cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP && !(and_with->flags & ANYOF_INVERT)) { @@ -850,6 +851,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con } if (or_with->flags & ANYOF_EOS) cl->flags |= ANYOF_EOS; + if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL)) + cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL; if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) cl->flags |= ANYOF_LOC_NONBITMAP_FOLD; @@ -8232,9 +8235,12 @@ case ANYOF_N##NAME: \ if (! TEST_7) stored += \ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \ } \ - for (value = 128; value < 256; value++) { \ - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \ - } \ + /* For a non-ut8 target string with DEPENDS semantics, all above ASCII \ + * Latin1 code points match the complement of any of the classes. 
But \ + * in utf8, they have their Unicode semantics, so can't just set them \ + * in the bitmap, or else regexec.c will think they matched when they \ + * shouldn't. */ \ + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \ } \ yesno = '!'; \ what = WORD; \ @@ -9824,6 +9830,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags); + if (flags & ANYOF_NON_UTF8_LATIN1_ALL) { + sv_catpvs(sv, "{non-utf8-latin1-all}"); + } + /* output information about the unicode matching */ if (flags & ANYOF_UNICODE_ALL) sv_catpvs(sv, "{unicode_all}"); @@ -362,6 +362,10 @@ struct regnode_charclass_class { /* Matches every code point 0x100 and above*/ #define ANYOF_UNICODE_ALL 0x40 +/* Match all Latin1 characters that aren't ASCII when the target string is not + * in utf8. */ +#define ANYOF_NON_UTF8_LATIN1_ALL 0x80 + #define ANYOF_FLAGS_ALL 0xff /* Character classes for node->classflags of ANYOF */ @@ -6336,6 +6336,12 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, if (c < 256) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; + else if (flags & ANYOF_NON_UTF8_LATIN1_ALL + && ! utf8_target + && ! 
isASCII(c)) + { + match = TRUE; + } else if (flags & ANYOF_LOCALE) { PL_reg_flags |= RF_tainted; diff --git a/t/re/re_tests b/t/re/re_tests index 02da1e1bdc..041296aa54 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -628,15 +628,15 @@ $(?<=^(a)) a y $1 a ([[:upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB ([[:xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01 ([[:^alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01 -([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __-- ${nulnul}${ffff} +([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} yT $1 __-- ${nulnul}${ffff} ([[:^ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${ffff} ([[:^cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- ([[:^digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd ([[:^lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB -([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul}${ffff} +([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} yT $1 ${nulnul}${ffff} ([[:^punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy ([[:^space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- -([[:^word:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 -- ${nulnul}${ffff} +([[:^word:]]+) ABcd01Xy__-- ${nulnul}${ffff} yT $1 -- ${nulnul}${ffff} ([[:^upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd01 ([[:^xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 Xy__-- ${nulnul}${ffff} [[:foo:]] - c - POSIX class [:foo:] unknown @@ -1406,7 +1406,7 @@ foo(\h)bar foo\tbar y $1 \t /\N{U+41}\x{c1}/i a\x{e1} y $& a\x{e1} /[\N{U+41}\x{c1}]/i \x{e1} y $& \x{e1} -[\s][\S] \x{a0}\x{a0} nT - - # Unicode complements should not match same character +[\s][\S] \x{a0}\x{a0} n - - # Unicode complements should not match same character # was generating malformed utf8 '[\x{100}\xff]'i \x{ff} y $& \x{ff} |