diff options
author | Yves Orton <demerphq@gmail.com> | 2013-03-25 23:23:40 +0100 |
---|---|---|
committer | Yves Orton <demerphq@gmail.com> | 2013-03-27 08:38:00 +0100 |
commit | dbc200c5a1d3ae1d9360435a384c19883bf5f4f6 (patch) | |
tree | 2312197f897140b952835ad4d7864c03d9fcd791 /regexp.h | |
parent | c9d98c4e542a0779fb34f107a15def6ed7ff3f98 (diff) | |
download | perl-dbc200c5a1d3ae1d9360435a384c19883bf5f4f6.tar.gz |
rework split() special case interaction with regex engine
This patch resolves several issues at once. The parts are
sufficiently interconnected that it is hard to break it down
into smaller commits. The tickets open for these issues are:
RT #94490 - split and constant folding
RT #116086 - split "\x20" doesn't work as documented
It additionally corrects some issues with cached regexes that
were exposed by the split changes (and applied to them).
It effectively reverts 5255171e6cd0accee6f76ea2980e32b3b5b8e171
and cccd1425414e6518c1fc8b7bcaccfb119320c513.
Prior to this patch the special RXf_SKIPWHITE behavior of
split(" ", $thing)
was only available if Perl could resolve the first argument to
split at compile time, meaning under various arcane situations.
This manifested as oddities like
my $delim = $cond ? " " : qr/\s+/;
split $delim, $string;
and
split $cond ? " " : qr/\s+/, $string
not behaving the same as:
($cond ? split(" ", $string) : split(/\s+/, $string))
which isn't very convenient.
This patch changes this by adding a new flag to the op_pmflags,
PMf_SPLIT which enables pp_regcomp() to know whether it was called
as part of split, which allows the RXf_SPLIT to be passed into run
time regex compilation. We also preserve the original flags so
pattern caching works properly, by adding a new property to the
regexp structure, "compflags", and related macros for accessing it.
We preserve the original flags passed into the compilation process,
so we can compare when we are trying to decide if we need to
recompile.
Note that this is essentially the opposite fix from the one applied
originally to fix #94490 in 5255171e6cd0accee6f76ea2980e32b3b5b8e171.
The reverted patch was meant to make:
split( 0 || " ", $thing ) #1
consistent with
my $x=0; split( $x || " ", $thing ) #2
and not with
split( " ", $thing ) #3
This was reverted because it broke C<split("\x{20}", $thing)>, and
because one might argue that it is not that #1 does the wrong thing,
but rather that the behavior of #2 is wrong. In other words
we might expect that all three should behave the same as #3, and
that instead of "fixing" the behavior of #1 to be like #2, we should
really fix the behavior of #2 to behave like #3. (Which is what we did.)
Also, it doesn't make sense to move the special case detection logic
further from the regex engine. We really want the regex engine to decide
this stuff itself, otherwise split " ", ... wouldn't work properly with
an alternate engine. (Imagine we add a special regexp meta pattern that behaves
the same as " " does in a split /.../. For instance we might make
split /(*SPLITWHITE)/ trigger the same behavior as split " ".)
The other major change as a result of this patch is that it effectively
reverts commit cccd1425414e6518c1fc8b7bcaccfb119320c513, which
was intended to get rid of RXf_SPLIT and RXf_SKIPWHITE
and free up bits in the regex flags structure.
But we don't want to get rid of these vars, and it turns out that
RXf_LOOKBEHIND_SEEN is used only in the same situation as the new
RXf_MODIFIES_VARS. So I have renamed RXf_LOOKBEHIND_SEEN to
RXf_NO_INPLACE_SUBST, and then instead of using two vars we use
only the one. Which in turn allows RXf_SPLIT and RXf_SKIPWHITE to
have their bits back.
Diffstat (limited to 'regexp.h')
-rw-r--r-- | regexp.h | 35 |
1 files changed, 18 insertions, 17 deletions
@@ -130,6 +130,9 @@ struct reg_code_block { /* Information about the match that isn't often used */ \ /* offset from wrapped to the start of precomp */ \ PERL_BITFIELD32 pre_prefix:4; \ + /* original flags used to compile the pattern, may differ */ \ + /* from extflags in various ways */ \ + PERL_BITFIELD32 compflags:9; \ CV *qr_anoncv /* the anon sub wrapped round qr/(?{..})/ */ typedef struct regexp { @@ -333,7 +336,17 @@ and check for NULL. /* Leave some space, so future bit allocations can go either in the shared or * unshared area without affecting binary compatibility */ -#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT+1) +#define RXf_BASE_SHIFT (_RXf_PMf_SHIFT_NEXT) + +/* + Set in Perl_pmruntime if op_flags & OPf_SPECIAL, i.e. split. Will + be used by regex engines to check whether they should set + RXf_SKIPWHITE +*/ +#define RXf_SPLIT (1<<(RXf_BASE_SHIFT-1)) +#if RXf_SPLIT != RXf_PMf_SPLIT +# error "RXf_SPLIT does not match RXf_PMf_SPLIT" +#endif /* Manually decorate this function with gcc-style attributes just to * avoid having to restructure the header files and their called order, @@ -366,19 +379,6 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp) } } -/* - Two flags no longer used. - RXf_SPLIT used to be set in Perl_pmruntime if op_flags & OPf_SPECIAL, - i.e., split. It was used by the regex engine to check whether it should - set RXf_SKIPWHITE. Regexp plugins on CPAN also have done the same thing - historically, so we leave these flags defined. 
-*/ -#ifndef PERL_CORE -# define RXf_SPLIT 0 -# define RXf_SKIPWHITE 0 -#endif - - /* Anchor and GPOS related stuff */ #define RXf_ANCH_BOL (1<<(RXf_BASE_SHIFT+0)) #define RXf_ANCH_MBOL (1<<(RXf_BASE_SHIFT+1)) @@ -392,7 +392,7 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp) #define RXf_ANCH_SINGLE (RXf_ANCH_SBOL|RXf_ANCH_GPOS) /* What we have seen */ -#define RXf_LOOKBEHIND_SEEN (1<<(RXf_BASE_SHIFT+6)) +#define RXf_NO_INPLACE_SUBST (1<<(RXf_BASE_SHIFT+6)) #define RXf_EVAL_SEEN (1<<(RXf_BASE_SHIFT+7)) #define RXf_CANY_SEEN (1<<(RXf_BASE_SHIFT+8)) @@ -409,8 +409,6 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp) #define RXf_INTUIT_TAIL (1<<(RXf_BASE_SHIFT+14)) #define RXf_USE_INTUIT (RXf_USE_INTUIT_NOML|RXf_USE_INTUIT_ML) -#define RXf_MODIFIES_VARS (1<<(RXf_BASE_SHIFT+15)) - /* Copy and tainted info */ #define RXf_COPY_DONE (1<<(RXf_BASE_SHIFT+16)) @@ -422,6 +420,7 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp) /* Flags indicating special patterns */ #define RXf_START_ONLY (1<<(RXf_BASE_SHIFT+19)) /* Pattern is /^/ */ +#define RXf_SKIPWHITE (1<<(RXf_BASE_SHIFT+20)) /* Pattern is for a split " " */ #define RXf_WHITE (1<<(RXf_BASE_SHIFT+21)) /* Pattern is /\s+/ */ #define RXf_NULL (1U<<(RXf_BASE_SHIFT+22)) /* Pattern is // */ #if RXf_BASE_SHIFT+22 > 31 @@ -468,6 +467,7 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp) : RX_MATCH_COPIED_off(prog)) #define RXp_EXTFLAGS(rx) ((rx)->extflags) +#define RXp_COMPFLAGS(rx) ((rx)->compflags) /* For source compatibility. We used to store these explicitly. 
*/ #define RX_PRECOMP(prog) (RX_WRAPPED(prog) + ReANY(prog)->pre_prefix) @@ -482,6 +482,7 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp) #define RX_CHECK_SUBSTR(prog) (ReANY(prog)->check_substr) #define RX_REFCNT(prog) SvREFCNT(prog) #define RX_EXTFLAGS(prog) RXp_EXTFLAGS(ReANY(prog)) +#define RX_COMPFLAGS(prog) RXp_COMPFLAGS(ReANY(prog)) #define RX_ENGINE(prog) (ReANY(prog)->engine) #define RX_SUBBEG(prog) (ReANY(prog)->subbeg) #define RX_SUBOFFSET(prog) (ReANY(prog)->suboffset) |