diff options
author | Yves Orton <yves.orton@booking.com> | 2014-02-02 23:37:37 +0800 |
---|---|---|
committer | Yves Orton <yves.orton@booking.com> | 2014-02-03 22:44:29 +0800 |
commit | ee273784a82417ff7a4ec06716556fb7fb705427 (patch) | |
tree | 8f471a22d18b3b440cc87d95c8910ec01fe07b28 | |
parent | f25474b0531878b277e2bd50a823ced1a3df6000 (diff) | |
download | perl-ee273784a82417ff7a4ec06716556fb7fb705427.tar.gz |
Add RXf_UNBOUNDED_QUANTIFIER and regexp->maxlen
The flag tells us that a pattern may match an infinitely long string.
The new member in the regexp struct tells us how long the string might
be.
With these two items we can implement regexp based $/
-rw-r--r-- | regcomp.c | 39 | ||||
-rw-r--r-- | regcomp.h | 2 | ||||
-rw-r--r-- | regexp.h | 3 | ||||
-rw-r--r-- | regnodes.h | 2 |
4 files changed, 37 insertions, 9 deletions
@@ -159,6 +159,7 @@ struct RExC_state_t { within pattern */ int num_code_blocks; /* size of code_blocks[] */ int code_index; /* next code_blocks[] slot */ + SSize_t maxlen; /* mininum possible number of chars in string to match */ #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -196,6 +197,7 @@ struct RExC_state_t { #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) #define RExC_npar (pRExC_state->npar) #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) @@ -774,6 +776,9 @@ static const scan_data_t zero_scan_data = if (RExC_seen & REG_GOSTART_SEEN) \ PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ PerlIO_printf(Perl_debug_log,"\n"); \ }); @@ -3671,7 +3676,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ - if (flags & SCF_DO_STCLASS) + if (flags & SCF_DO_STCLASS) ssc_init_zero(pRExC_state, &accum); while (OP(scan) == code) { @@ -4494,12 +4499,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, is_inf_internal |= deltanext == SSize_t_MAX || (maxcount == REG_INFTY && minnext + deltanext > 0); is_inf |= is_inf_internal; - if (is_inf) + if (is_inf) { delta = SSize_t_MAX; - else + } else { delta += (minnext + deltanext) * maxcount - minnext * mincount; - + } /* Try powerful optimization CURLYX => CURLYN. */ if ( OP(oscan) == CURLYX && data && data->flags & SF_IN_PAR @@ -5366,6 +5371,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", *scanp = scan; *deltap = is_inf_internal ? SSize_t_MAX : delta; + if (flags & SCF_DO_SUBSTR && is_inf) data->pos_delta = SSize_t_MAX - data->pos_min; if (is_par > (I32)U8_MAX) @@ -5385,7 +5391,16 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", DEBUG_STUDYDATA("post-fin:",data,depth); - return min < stopmin ? min : stopmin; + { + SSize_t final_minlen= min < stopmin ? min : stopmin; + + if (RExC_maxlen < final_minlen + delta) { + RExC_maxlen = final_minlen + delta; + } + + return final_minlen; + } + /* not-reached */ } STATIC U32 @@ -6376,6 +6391,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_sawback = 0; RExC_seen = 0; + RExC_maxlen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; @@ -7017,13 +7033,15 @@ reStudy: /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n", - (IV)minlen, (IV)r->minlen); + PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", + (IV)minlen, (IV)r->minlen, RExC_maxlen); }); r->minlenret = minlen; if (r->minlen < minlen) r->minlen = minlen; + + if (RExC_seen & REG_GPOS_SEEN) r->intflags |= PREGf_GPOS_SEEN; if (RExC_seen & REG_LOOKBEHIND_SEEN) @@ -7047,6 +7065,9 @@ reStudy: else RXp_PAREN_NAMES(r) = NULL; + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) + r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; + /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED * so it can be used in pp.c */ if (r->intflags & PREGf_ANCH) @@ -10466,6 +10487,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ARG1_SET(ret, (U16)min); ARG2_SET(ret, (U16)max); } + if (max == REG_INFTY) + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; goto nest_check; } @@ -10503,6 +10526,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, STAR, ret, depth+1); ret->flags = 0; RExC_naughty += 4; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '*') { min = 0; @@ -10512,6 +10536,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) reginsert(pRExC_state, PLUS, ret, depth+1); ret->flags = 0; RExC_naughty += 3; + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; } else if (op == '+') { min = 1; @@ -582,6 +582,8 @@ struct regnode_ssc { #define REG_RUN_ON_COMMENT_SEEN 0x00000200 #define REG_UNFOLDED_MULTI_SEEN 0x00000400 #define REG_GOSTART_SEEN 0x00000800 +#define REG_UNBOUNDED_QUANTIFIER_SEEN 0x00001000 + START_EXTERN_C @@ -128,6 +128,7 @@ struct reg_code_block { SSize_t suboffset; /* byte offset of subbeg from logical start of str */ \ SSize_t subcoffset; /* suboffset equiv, but in chars (for @-/@+) */ \ /* Information about the match that isn't often used */ \ + SSize_t maxlen; /* mininum possible number of chars in string to match */\ /* offset from wrapped to the start of precomp */ \ PERL_BITFIELD32 pre_prefix:4; \ /* original flags used to compile the pattern, may differ */ \ @@ -398,7 +399,7 @@ get_regex_charset_name(const U32 flags, STRLEN* const lenp) #define RXf_UNUSED8 (1<<(RXf_BASE_SHIFT+8)) /* Special */ -#define RXf_UNUSED9 (1<<(RXf_BASE_SHIFT+9)) +#define RXf_UNBOUNDED_QUANTIFIER_SEEN (1<<(RXf_BASE_SHIFT+9)) #define RXf_CHECK_ALL (1<<(RXf_BASE_SHIFT+10)) /* UTF8 related */ diff --git a/regnodes.h b/regnodes.h index 088e5a6bee..f9d4fc05dd 100644 --- a/regnodes.h +++ b/regnodes.h @@ -659,7 +659,7 @@ EXTCONST char * const PL_reg_extflags_name[] = { "NO_INPLACE_SUBST", /* 0x00008000 */ "EVAL_SEEN", /* 0x00010000 */ "UNUSED8", /* 0x00020000 */ - "UNUSED9", /* 0x00040000 */ + "UNBOUNDED_QUANTIFIER_SEEN",/* 0x00040000 */ "CHECK_ALL", /* 0x00080000 */ "MATCH_UTF8", /* 0x00100000 */ "USE_INTUIT_NOML", /* 0x00200000 */ |