diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-03-17 10:24:28 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-03-17 11:00:50 -0600 |
commit | 3fffb88a6cabe9bc42e2a62f0ab7442ba6850dd0 (patch) | |
tree | 66e6ac2fdec3b07d7e52cbe5df8c8f9c3574ff35 /regcomp.c | |
parent | 77dc54c8ed3f77947d97b9085c718de3fc7fb81f (diff) | |
download | perl-3fffb88a6cabe9bc42e2a62f0ab7442ba6850dd0.tar.gz |
regcomp.c: Avoid locale in optimizer unless necessary
This is further work along the lines in RT #85964 and commit
af302e7fa58415c2d8454c8cbef7bccd8b504257. It reverts, for the the most
part, commits aa19b56b2f07e9eabf57540f00d312d8093e9d28 (Remove unused
parameter) and c613755a4b4fc8e64a77639d47d7e208fee68edc (/l in synthetic
start class).
Those commits caused the synthetic start class to often be marked as
matching under locale rules, even if there was no part of the regular
expression that used locale. This led to RT #85964, which made apparent
that there were a number of assumptions in the optimizer about locale
that were no longer necessarily true. This new commit changes things so
that locale has to be somewhere in the regex in order to get the
synthetic start class to include /l. In other words, this reverts the
effect of those commits to regular expression which have /l -- we go
back to the old way of doing things for non-locale regexes. This limits
any bugs that may have been introduced by the addition of /l (and being
able to match only sub-parts of a regex under locale) to the relatively
uncommon regexes which actually use it. There are a number of bugs
that have surfaced for the locale rules regexes that have gone
unreported; and some say locale rules regexes should be deprecated.
Diffstat (limited to 'regcomp.c')
-rw-r--r-- | regcomp.c | 52 |
1 files changed, 30 insertions, 22 deletions
@@ -722,16 +722,24 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min /* Can match anything (initialization) */ STATIC void -S_cl_anything(struct regnode_charclass_class *cl) +S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { PERL_ARGS_ASSERT_CL_ANYTHING; ANYOF_BITMAP_SETALL(cl); ANYOF_CLASS_ZERO(cl); /* all bits set, so class is irrelevant */ - cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_LOCALE; - /* The above line set locale which given the current logic may not get - * cleared even if no locale is in the regex, which may lead to false - * positives; see the commit message */ + cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL; + + /* If any portion of the regex is to operate under locale rules, + * initialization includes it. The reason this isn't done for all regexes + * is that the optimizer was written under the assumption that locale was + * all-or-nothing. Given the complexity and lack of documentation in the + * optimizer, and that there are inadequate test cases for locale, so many + * parts of it may not work properly, it is safest to avoid locale unless + * necessary. */ + if (RExC_contains_locale) { + cl->flags |= ANYOF_LOCALE; + } } /* Can match anything (initialization) */ @@ -760,7 +768,7 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; - cl_anything(cl); + cl_anything(pRExC_state, cl); ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); } @@ -861,7 +869,7 @@ S_cl_and(struct regnode_charclass_class *cl, /* 'OR' a given class with another one. Can create false positives */ /* cl should not be inverted */ STATIC void -S_cl_or(struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) +S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) { PERL_ARGS_ASSERT_CL_OR; @@ -871,7 +879,7 @@ S_cl_or(struct regnode_charclass_class *cl, const struct regnode_charclass_class * complement of everything not in the bitmap, but currently we don't * know what that is, so give up and match anything */ if (ANYOF_NONBITMAP(or_with)) { - cl_anything(cl); + cl_anything(pRExC_state, cl); } /* We do not use * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) @@ -891,7 +899,7 @@ S_cl_or(struct regnode_charclass_class *cl, const struct regnode_charclass_class cl->bitmap[i] |= ~or_with->bitmap[i]; } /* XXXX: logic is complicated otherwise */ else { - cl_anything(cl); + cl_anything(pRExC_state, cl); } /* And, we can just take the union of the flags that aren't affected @@ -924,7 +932,7 @@ S_cl_or(struct regnode_charclass_class *cl, const struct regnode_charclass_class } } else { /* XXXX: logic is complicated, leave it along for a moment. */ - cl_anything(cl); + cl_anything(pRExC_state, cl); } /* Take the union */ @@ -2808,7 +2816,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(&accum, &this_class); + cl_or(pRExC_state, &accum, &this_class); } if (code == IFTHEN && num < 2) /* Empty ELSE branch */ min1 = 0; @@ -2821,7 +2829,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(data->start_class, &accum); + cl_or(pRExC_state, data->start_class, &accum); if (min1) { cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; @@ -3096,7 +3104,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(data->start_class); + cl_anything(pRExC_state, data->start_class); flags &= ~SCF_DO_STCLASS; } } else { @@ -3388,7 +3396,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->start_class = oclass; if (mincount == 0 || minnext == 0) { if (flags & SCF_DO_STCLASS_OR) { - cl_or(data->start_class, &this_class); + cl_or(pRExC_state, data->start_class, &this_class); } else if (flags & SCF_DO_STCLASS_AND) { /* Switch to OR mode: cache the old value of @@ -3404,7 +3412,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { - cl_or(data->start_class, &this_class); + cl_or(pRExC_state, data->start_class, &this_class); cl_and(data->start_class, and_withp); } else if (flags & SCF_DO_STCLASS_AND) @@ -3654,7 +3662,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) - cl_anything(data->start_class); + cl_anything(pRExC_state, data->start_class); flags &= ~SCF_DO_STCLASS; break; } @@ -3717,7 +3725,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, do_default: /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(data->start_class); + cl_anything(pRExC_state, data->start_class); break; case REG_ANY: if (OP(scan) == SANY) @@ -3725,7 +3733,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ value = (ANYOF_BITMAP_TEST(data->start_class,'\n') || ANYOF_CLASS_TEST_ANY_SET(data->start_class)); - cl_anything(data->start_class); + cl_anything(pRExC_state, data->start_class); } if (flags & SCF_DO_STCLASS_AND || !value) ANYOF_BITMAP_CLEAR(data->start_class,'\n'); @@ -3735,7 +3743,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, cl_and(data->start_class, (struct regnode_charclass_class*)scan); else - cl_or(data->start_class, + cl_or(pRExC_state, data->start_class, (struct regnode_charclass_class*)scan); break; case ALNUM: @@ -4156,7 +4164,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(data->start_class); + cl_anything(pRExC_state, data->start_class); flags &= ~SCF_DO_STCLASS; } else if (OP(scan) == GPOS) { @@ -4254,7 +4262,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->whilem_c = data_fake.whilem_c; } if (flags & SCF_DO_STCLASS) - cl_or(&accum, &this_class); + cl_or(pRExC_state, &accum, &this_class); } } if (flags & SCF_DO_SUBSTR) { @@ -4266,7 +4274,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(data->start_class, &accum); + cl_or(pRExC_state, data->start_class, &accum); if (min1) { cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; |