diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-03-08 17:06:47 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-03-08 23:22:17 -0700 |
commit | c613755a4b4fc8e64a77639d47d7e208fee68edc (patch) | |
tree | 79d619f3808d2f33e5d8613e59e16ebf74c3fc03 | |
parent | f0c16e54b3b5efbb4380952c7ba5e8d7626d7cae (diff) | |
download | perl-c613755a4b4fc8e64a77639d47d7e208fee68edc.tar.gz |
regex: /l in combo with others in syn start class
Now that regexes can be combinations of different charset modifiers,
a synthetic start class can match both locale and non-locale. Locale
should generally match only things in the bitmap for code points < 256.
But a synthetic start class with a non-locale component can match such
code points outside the bitmap. This patch makes an exception for the
synthetic start class: any false positive it gives is resolved when the
match is done again for real, not as part of the synthetic start class.
-rw-r--r-- | regcomp.c | 6 | ||||
-rw-r--r-- | regcomp.h | 22 | ||||
-rw-r--r-- | regexec.c | 11 |
3 files changed, 20 insertions, 19 deletions
@@ -5014,14 +5014,13 @@ reStudy: && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) ri->regstclass = NULL; - /* If the synthetic start class were to ever be used when EOS is set, - * that bit would have to be cleared, as it is shared with another */ if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag && !(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { const U32 n = add_data(pRExC_state, 1, "f"); + data.start_class->flags |= ANYOF_IS_SYNTHETIC; Newx(RExC_rxi->data->data[n], 1, struct regnode_charclass_class); @@ -5089,12 +5088,11 @@ reStudy: r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - /* If the synthetic start class were to ever be used when EOS is set, - * that bit would have to be cleared, as it is shared with another */ if (!(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { const U32 n = add_data(pRExC_state, 1, "f"); + data.start_class->flags |= ANYOF_IS_SYNTHETIC; Newx(RExC_rxi->data->data[n], 1, struct regnode_charclass_class); @@ -337,20 +337,18 @@ struct regnode_charclass_class { #define ANYOF_LARGE ANYOF_CLASS /* Same; name retained for back compat */ /* EOS, meaning that it can match an empty string too, is used for the - * synthetic start class (ssc) only. It looks like it could share the INVERT - * bit, as the ssc is never inverted. But doing that caused this reges to - * not match: - * 'foo/file.fob' =~ m,^(?=[^\.])[^/]* /(?=[^\.])[^/]*\.fo[^/]$,; - * (except the space between the * and the / above shouldn't be there; it was - * inserted to make this comment continue on.) - * Rather than try to figure out what was going on in the optimizer, I (khw) - * found a way to save a different bit. 
But my original line of reasoning was - * "The bit just needs to be turned off before regexec.c gets a hold of it so - * that regexec.c doesn't think it's inverted, but this happens automatically, - * as if the ssc can match an EOS, the ssc is discarded, and never passed to - * regexec.c" */ + * synthetic start class only. */ #define ANYOF_EOS 0x10 +/* ? Is this node the synthetic start class (ssc). This bit is shared with + * ANYOF_EOS, as the latter is used only for the ssc, and then not used by + * regexec.c. And, the code is structured so that if it is set, the ssc is + * not used, so it is guaranteed to be 0 for the ssc by the time regexec.c + * gets executed, and 0 for a non-ssc ANYOF node, as it only ever gets set for + * a potential ssc candidate. Thus setting it to 1 after it has been + * determined that the ssc will be used is not ambiguous */ +#define ANYOF_IS_SYNTHETIC ANYOF_EOS + /* Can match something outside the bitmap that isn't in utf8 */ #define ANYOF_NONBITMAP_NON_UTF8 0x20 @@ -6587,16 +6587,21 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, /* If the bitmap didn't (or couldn't) match, and something outside the * bitmap could match, try that. Locale nodes specifiy completely the * behavior of code points in the bit map (otherwise, a utf8 target would - * cause them to be treated as Unicode and not locale), except XXX in + * cause them to be treated as Unicode and not locale), except in * the very unlikely event when this node is a synthetic start class, which - * could be a combination of locale and non-locale nodes */ + * could be a combination of locale and non-locale nodes. 
So allow locale + * to match for the synthetic start class, which will give a false + * positive that will be resolved when the match is done again as not part + * of the synthetic start class */ if (!match) { if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { match = TRUE; /* Everything above 255 matches */ } else if ((flags & ANYOF_NONBITMAP_NON_UTF8 || (utf8_target && ANYOF_NONBITMAP(n) - && (c >=256 || ! (flags & ANYOF_LOCALE))))) + && (c >=256 + || (! (flags & ANYOF_LOCALE)) + || (flags & ANYOF_IS_SYNTHETIC))))) { AV *av; SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); |