summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-03-08 17:06:47 -0700
committerKarl Williamson <public@khwilliamson.com>2011-03-08 23:22:17 -0700
commitc613755a4b4fc8e64a77639d47d7e208fee68edc (patch)
tree79d619f3808d2f33e5d8613e59e16ebf74c3fc03
parentf0c16e54b3b5efbb4380952c7ba5e8d7626d7cae (diff)
downloadperl-c613755a4b4fc8e64a77639d47d7e208fee68edc.tar.gz
regex: /l in combo with others in syn start class
Now that a regex can combine different charset modifiers, a synthetic start class can match both locale and non-locale nodes. A locale node should generally match only what is in its bitmap for code points < 256, but a synthetic start class with a non-locale component can match such code points even when they are not in the bitmap. This patch makes an exception for synthetic start class nodes: they are allowed to match in this case, and any false positive that results is resolved when the match is attempted again for real, outside the synthetic start class.
-rw-r--r--regcomp.c6
-rw-r--r--regcomp.h22
-rw-r--r--regexec.c11
3 files changed, 20 insertions, 19 deletions
diff --git a/regcomp.c b/regcomp.c
index be90fca0ff..15dcf3a15e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -5014,14 +5014,13 @@ reStudy:
&& (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
ri->regstclass = NULL;
- /* If the synthetic start class were to ever be used when EOS is set,
- * that bit would have to be cleared, as it is shared with another */
if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
&& stclass_flag
&& !(data.start_class->flags & ANYOF_EOS)
&& !cl_is_anything(data.start_class))
{
const U32 n = add_data(pRExC_state, 1, "f");
+ data.start_class->flags |= ANYOF_IS_SYNTHETIC;
Newx(RExC_rxi->data->data[n], 1,
struct regnode_charclass_class);
@@ -5089,12 +5088,11 @@ reStudy:
r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
= r->float_substr = r->float_utf8 = NULL;
- /* If the synthetic start class were to ever be used when EOS is set,
- * that bit would have to be cleared, as it is shared with another */
if (!(data.start_class->flags & ANYOF_EOS)
&& !cl_is_anything(data.start_class))
{
const U32 n = add_data(pRExC_state, 1, "f");
+ data.start_class->flags |= ANYOF_IS_SYNTHETIC;
Newx(RExC_rxi->data->data[n], 1,
struct regnode_charclass_class);
diff --git a/regcomp.h b/regcomp.h
index 18c8f6f745..9ffca0e969 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -337,20 +337,18 @@ struct regnode_charclass_class {
#define ANYOF_LARGE ANYOF_CLASS /* Same; name retained for back compat */
/* EOS, meaning that it can match an empty string too, is used for the
- * synthetic start class (ssc) only. It looks like it could share the INVERT
- * bit, as the ssc is never inverted. But doing that caused this reges to
- * not match:
- * 'foo/file.fob' =~ m,^(?=[^\.])[^/]* /(?=[^\.])[^/]*\.fo[^/]$,;
- * (except the space between the * and the / above shouldn't be there; it was
- * inserted to make this comment continue on.)
- * Rather than try to figure out what was going on in the optimizer, I (khw)
- * found a way to save a different bit. But my original line of reasoning was
- * "The bit just needs to be turned off before regexec.c gets a hold of it so
- * that regexec.c doesn't think it's inverted, but this happens automatically,
- * as if the ssc can match an EOS, the ssc is discarded, and never passed to
- * regexec.c" */
+ * synthetic start class only. */
#define ANYOF_EOS 0x10
+/* ? Is this node the synthetic start class (ssc). This bit is shared with
+ * ANYOF_EOS, as the latter is used only for the ssc, and then not used by
+ * regexec.c. And, the code is structured so that if it is set, the ssc is
+ * not used, so it is guaranteed to be 0 for the ssc by the time regexec.c
+ * gets executed, and 0 for a non-ssc ANYOF node, as it only ever gets set for
+ * a potential ssc candidate. Thus setting it to 1 after it has been
+ * determined that the ssc will be used is not ambiguous */
+#define ANYOF_IS_SYNTHETIC ANYOF_EOS
+
/* Can match something outside the bitmap that isn't in utf8 */
#define ANYOF_NONBITMAP_NON_UTF8 0x20
diff --git a/regexec.c b/regexec.c
index 739eba6d8d..76784ee97f 100644
--- a/regexec.c
+++ b/regexec.c
@@ -6587,16 +6587,21 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
/* If the bitmap didn't (or couldn't) match, and something outside the
* bitmap could match, try that. Locale nodes specifiy completely the
* behavior of code points in the bit map (otherwise, a utf8 target would
- * cause them to be treated as Unicode and not locale), except XXX in
+ * cause them to be treated as Unicode and not locale), except in
* the very unlikely event when this node is a synthetic start class, which
- * could be a combination of locale and non-locale nodes */
+ * could be a combination of locale and non-locale nodes. So allow locale
+ * to match for the synthetic start class, which will give a false
+ * positive that will be resolved when the match is done again as not part
+ * of the synthetic start class */
if (!match) {
if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
match = TRUE; /* Everything above 255 matches */
}
else if ((flags & ANYOF_NONBITMAP_NON_UTF8
|| (utf8_target && ANYOF_NONBITMAP(n)
- && (c >=256 || ! (flags & ANYOF_LOCALE)))))
+ && (c >=256
+ || (! (flags & ANYOF_LOCALE))
+ || (flags & ANYOF_IS_SYNTHETIC)))))
{
AV *av;
SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);