diff options
author | Karl Williamson <public@khwilliamson.com> | 2011-03-16 12:19:42 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2011-03-16 20:56:28 -0600 |
commit | af302e7fa58415c2d8454c8cbef7bccd8b504257 (patch) | |
tree | cbed6bcf6015e29d45c11473c167cd3462463930 | |
parent | bf3c5c060b294724da178b40885fa63ad4708233 (diff) | |
download | perl-af302e7fa58415c2d8454c8cbef7bccd8b504257.tar.gz |
RT #85964: bleadperl breaks CGI-FormBuilder
The introduction of the l regex modifier introduces the possibility that
a regular expression can have subportions that match under locale and
other portions that don't. I (khw) failed to see all the implications
of that in the optimizer. Unfortunately, things didn't start surfacing
until late in the development cycle.
The optimizer is structured so that a new blank node is initialized to
match anything, and the state is set to AND, so that the first real node
that comes along is supposed to be ANDed together; with the result being
that node. (Like an AND of all 1's with some bit pattern yields that
bit pattern.) Then the mode is switched to OR, so subsequent nodes that
could be the start ones are or'd in. *(see footnote below).
This design leads to some issues, like at the XXX line added by this
commit, which looks to be a work-around for the deficiencies of the
design.
Commit cf34198ebe3dd876d67c10caa9acf491ad2a0c51 that led to this ticket
changed things to include LOCALE as part of the initialization, so that
the l could be on and off in various parts of the regex. I tried to
just revert that (plus associated parameter changes), and found that the
changes made to the AND and OR logic that fixed other problems really
depended on that commit. Perhaps those could be worked around, but it
is not the forward direction.
This commit works around things in a different way. What happened in
the earlier commit was that the synthetic start class (SSC) is, under
some circumstances, getting generated as matching locale even if there
is no locale matching in the regex. (This could not happen if the
design were as described in the footnote.) This shouldn't matter except
for potentially performance issues, as this would just be false
positives. However, it turns out there is code in the optimizer that
assumes that locale and non-locale are never mixed; and so does not do
the right thing.
This patch is aimed at safety. If the SSC is marked as locale, it sets
the bits for things like \w as if the SSC could also end up being for
non-locale. This can generate false positives for true locale matches
but shouldn't introduce actual optimizer errors, since it only adds to
what the SSC can match and doesn't make any restrictions.
* I don't see why this design; it seems to me easier to start with the
initial state set to all 0's, and then the first node gets OR'd in,
yielding exactly that first node; then you don't have to switch; you
still have to deal with AND cases, as for example in 0 length
lookaheads, but things are made easier.
-rw-r--r-- | regcomp.c | 27 | ||||
-rw-r--r-- | t/re/re_tests | 12 |
2 files changed, 24 insertions, 15 deletions
@@ -727,6 +727,9 @@ S_cl_anything(struct regnode_charclass_class *cl) ANYOF_BITMAP_SETALL(cl); ANYOF_CLASS_ZERO(cl); /* all bits set, so class is irrelevant */ cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_LOCALE; + /* The above set locale which given the current logic may not get cleared + * even if no locale is in the regex, which may lead to false positives; + * see the commit message */ } /* Can match anything (initialization) */ @@ -3240,6 +3243,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->start_class->flags &= ~ANYOF_EOS; data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD; if (OP(scan) == EXACTFL) { + /* XXX This set is probably no longer necessary, and + * probably wrong as LOCALE now is on in the initial + * state */ data->start_class->flags |= ANYOF_LOCALE; } else { @@ -3752,7 +3758,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); - else if (OP(scan) == ALNUMU) { + + /* Even if under locale, set the bits for non-locale + * in case it isn't a true locale-node. This will + * create false positives if it truly is locale */ + if (OP(scan) == ALNUMU) { for (value = 0; value < 256; value++) { if (isWORDCHAR_L1(value)) { ANYOF_BITMAP_SET(data->start_class, value); @@ -3789,7 +3799,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); - else { + + /* Even if under locale, set the bits for + * non-locale in case it isn't a true locale-node. + * This will create false positives if it truly is + * locale */ if (OP(scan) == NALNUMU) { for (value = 0; value < 256; value++) { if (! isWORDCHAR_L1(value)) { @@ -3803,7 +3817,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } } - } } break; case SPACE: @@ -3829,7 +3842,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data->start_class->flags & ANYOF_LOCALE) { ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); } - else if (OP(scan) == SPACEU) { + if (OP(scan) == SPACEU) { for (value = 0; value < 256; value++) { if (isSPACE_L1(value)) { ANYOF_BITMAP_SET(data->start_class, value); @@ -3866,7 +3879,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); - else if (OP(scan) == NSPACEU) { + if (OP(scan) == NSPACEU) { for (value = 0; value < 256; value++) { if (!isSPACE_L1(value)) { ANYOF_BITMAP_SET(data->start_class, value); @@ -3894,11 +3907,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); - else { for (value = 0; value < 256; value++) if (isDIGIT(value)) ANYOF_BITMAP_SET(data->start_class, value); - } } break; case NDIGIT: @@ -3912,11 +3923,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); - else { for (value = 0; value < 256; value++) if (!isDIGIT(value)) ANYOF_BITMAP_SET(data->start_class, value); - } } break; CASE_SYNST_FNC(VERTWS); diff --git a/t/re/re_tests b/t/re/re_tests index 0f19ae21d1..b3815298bb 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1498,16 +1498,16 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer (?{})[\x{100}] \x{100} y $& \x{100} # RT #85964 -^m?(\S)(.*)\1$ aba Ty $1 a +^m?(\S)(.*)\1$ aba y $1 a ^m?(\S)(.*)\1$ \tb\t n - - -^m?(\s)(.*)\1$ \tb\t Ty $1 \t +^m?(\s)(.*)\1$ \tb\t y $1 \t ^m?(\s)(.*)\1$ aba n - - -^m?(\W)(.*)\1$ :b: Ty $1 : +^m?(\W)(.*)\1$ :b: y $1 : ^m?(\W)(.*)\1$ aba n - - -^m?(\w)(.*)\1$ aba Ty $1 a +^m?(\w)(.*)\1$ aba y $1 a ^m?(\w)(.*)\1$ :b: n - - -^m?(\D)(.*)\1$ aba Ty $1 a +^m?(\D)(.*)\1$ aba y $1 a ^m?(\D)(.*)\1$ 5b5 n - - -^m?(\d)(.*)\1$ 5b5 Ty $1 5 +^m?(\d)(.*)\1$ 5b5 y $1 5 ^m?(\d)(.*)\1$ aba n - - # vim: softtabstop=0 noexpandtab |