summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2006-09-10 00:00:05 +0200
committerRafael Garcia-Suarez <rgarciasuarez@gmail.com>2006-09-11 10:04:19 +0000
commit1de063289cf096bd67e3d9d1b4a6dca2498966fa (patch)
tree624db769c58230025a7b28c45d4cbd107daa75be
parente2e369dbafb5e29919b75b3f221700a38e4a5d42 (diff)
downloadperl-1de063289cf096bd67e3d9d1b4a6dca2498966fa.tar.gz
Teach regex optimiser how to handle (?=) and (?<=) properly.
Message-ID: <9b18b3110609091300x1fd0b15dt32932902a0a80674@mail.gmail.com> p4raw-id: //depot/perl@28816
-rw-r--r--embed.fnc6
-rw-r--r--embed.h6
-rw-r--r--proto.h16
-rw-r--r--regcomp.c564
-rw-r--r--regcomp.h11
-rw-r--r--regcomp.sym62
-rw-r--r--regexec.c292
-rw-r--r--regnodes.h36
-rw-r--r--sv.c1
-rw-r--r--t/op/re_tests14
-rwxr-xr-xt/op/regexp.t46
11 files changed, 767 insertions, 287 deletions
diff --git a/embed.fnc b/embed.fnc
index a34cb00a74..082a7c5288 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1313,7 +1313,7 @@ Es |void |regtail |NN struct RExC_state_t *state|NN regnode *p|NN const regnode
Es |U32 |join_exact |NN struct RExC_state_t *state|NN regnode *scan|NN I32 *min|U32 flags|NULLOK regnode *val|U32 depth
EsRn |char* |regwhite |NN char *p|NN const char *e
Es |char* |nextchar |NN struct RExC_state_t *state
-Es |void |scan_commit |NN const struct RExC_state_t* state|NN struct scan_data_t *data
+Es |void |scan_commit |NN const struct RExC_state_t* state|NN struct scan_data_t *data|NN I32 *minlenp
Esn |void |cl_anything |NN const struct RExC_state_t* state|NN struct regnode_charclass_class *cl
EsRn |int |cl_is_anything |NN const struct regnode_charclass_class *cl
Esn |void |cl_init |NN const struct RExC_state_t* state|NN struct regnode_charclass_class *cl
@@ -1323,7 +1323,8 @@ Esn |void |cl_and |NN struct regnode_charclass_class *cl \
Esn |void |cl_or |NN const struct RExC_state_t* state|NN struct regnode_charclass_class *cl \
|NN const struct regnode_charclass_class *or_with
Es |I32 |study_chunk |NN struct RExC_state_t* state|NN regnode **scanp \
- |NN I32 *deltap|NN regnode *last|NULLOK struct scan_data_t *data \
+ |NN I32 *minlenp|NN I32 *deltap \
+ |NN regnode *last|NULLOK struct scan_data_t *data \
|U32 flags|U32 depth
EsRn |I32 |add_data |NN struct RExC_state_t* state|I32 n|NN const char *s
rs |void |re_croak2 |NN const char* pat1|NN const char* pat2|...
@@ -1357,6 +1358,7 @@ ERs |bool |reginclass |NULLOK const regexp *prog|NN const regnode *n|NN const U8
Es |CHECKPOINT|regcppush |I32 parenfloor
Es |char* |regcppop |NN const regexp *rex
ERsn |U8* |reghop3 |NN U8 *pos|I32 off|NN const U8 *lim
+ERsn |U8* |reghop4 |NN U8 *pos|I32 off|NN const U8 *llim|NN const U8 *rlim
ERsn |U8* |reghopmaybe3 |NN U8 *pos|I32 off|NN const U8 *lim
ERs |char* |find_byclass |NN regexp * prog|NN const regnode *c|NN char *s|NN const char *strend|NULLOK const regmatch_info *reginfo
Es |void |to_utf8_substr |NN regexp * prog
diff --git a/embed.h b/embed.h
index 6959cb9414..dbb6ca34cc 100644
--- a/embed.h
+++ b/embed.h
@@ -1356,6 +1356,7 @@
#define regcppush S_regcppush
#define regcppop S_regcppop
#define reghop3 S_reghop3
+#define reghop4 S_reghop4
#define reghopmaybe3 S_reghopmaybe3
#define find_byclass S_find_byclass
#define to_utf8_substr S_to_utf8_substr
@@ -3504,14 +3505,14 @@
#define join_exact(a,b,c,d,e,f) S_join_exact(aTHX_ a,b,c,d,e,f)
#define regwhite S_regwhite
#define nextchar(a) S_nextchar(aTHX_ a)
-#define scan_commit(a,b) S_scan_commit(aTHX_ a,b)
+#define scan_commit(a,b,c) S_scan_commit(aTHX_ a,b,c)
#define cl_anything S_cl_anything
#define cl_is_anything S_cl_is_anything
#define cl_init S_cl_init
#define cl_init_zero S_cl_init_zero
#define cl_and S_cl_and
#define cl_or S_cl_or
-#define study_chunk(a,b,c,d,e,f,g) S_study_chunk(aTHX_ a,b,c,d,e,f,g)
+#define study_chunk(a,b,c,d,e,f,g,h) S_study_chunk(aTHX_ a,b,c,d,e,f,g,h)
#define add_data S_add_data
#endif
#ifdef PERL_CORE
@@ -3542,6 +3543,7 @@
#define regcppush(a) S_regcppush(aTHX_ a)
#define regcppop(a) S_regcppop(aTHX_ a)
#define reghop3 S_reghop3
+#define reghop4 S_reghop4
#define reghopmaybe3 S_reghopmaybe3
#define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e)
#define to_utf8_substr(a) S_to_utf8_substr(aTHX_ a)
diff --git a/proto.h b/proto.h
index ae714fb964..48a66bb920 100644
--- a/proto.h
+++ b/proto.h
@@ -3587,9 +3587,10 @@ STATIC char* S_regwhite(char *p, const char *e)
STATIC char* S_nextchar(pTHX_ struct RExC_state_t *state)
__attribute__nonnull__(pTHX_1);
-STATIC void S_scan_commit(pTHX_ const struct RExC_state_t* state, struct scan_data_t *data)
+STATIC void S_scan_commit(pTHX_ const struct RExC_state_t* state, struct scan_data_t *data, I32 *minlenp)
__attribute__nonnull__(pTHX_1)
- __attribute__nonnull__(pTHX_2);
+ __attribute__nonnull__(pTHX_2)
+ __attribute__nonnull__(pTHX_3);
STATIC void S_cl_anything(const struct RExC_state_t* state, struct regnode_charclass_class *cl)
__attribute__nonnull__(1)
@@ -3616,11 +3617,12 @@ STATIC void S_cl_or(const struct RExC_state_t* state, struct regnode_charclass_c
__attribute__nonnull__(2)
__attribute__nonnull__(3);
-STATIC I32 S_study_chunk(pTHX_ struct RExC_state_t* state, regnode **scanp, I32 *deltap, regnode *last, struct scan_data_t *data, U32 flags, U32 depth)
+STATIC I32 S_study_chunk(pTHX_ struct RExC_state_t* state, regnode **scanp, I32 *minlenp, I32 *deltap, regnode *last, struct scan_data_t *data, U32 flags, U32 depth)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2)
__attribute__nonnull__(pTHX_3)
- __attribute__nonnull__(pTHX_4);
+ __attribute__nonnull__(pTHX_4)
+ __attribute__nonnull__(pTHX_5);
STATIC I32 S_add_data(struct RExC_state_t* state, I32 n, const char *s)
__attribute__warn_unused_result__
@@ -3707,6 +3709,12 @@ STATIC U8* S_reghop3(U8 *pos, I32 off, const U8 *lim)
__attribute__nonnull__(1)
__attribute__nonnull__(3);
+STATIC U8* S_reghop4(U8 *pos, I32 off, const U8 *llim, const U8 *rlim)
+ __attribute__warn_unused_result__
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(3)
+ __attribute__nonnull__(4);
+
STATIC U8* S_reghopmaybe3(U8 *pos, I32 off, const U8 *lim)
__attribute__warn_unused_result__
__attribute__nonnull__(1)
diff --git a/regcomp.c b/regcomp.c
index 1c64fd96de..48d6aa2544 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -174,23 +174,102 @@ typedef struct RExC_state_t {
#define FULL_TRIE_STUDY
#define TRIE_STCLASS
#endif
-/* Length of a variant. */
+
+
+/* About scan_data_t.
+
+ During optimisation we recurse through the regexp program performing
+ various inplace (keyhole style) optimisations. In addition study_chunk
+ and scan_commit populate this data structure with information about
+ what strings MUST appear in the pattern. We look for the longest
+ string that must appear for at a fixed location, and we look for the
+ longest string that may appear at a floating location. So for instance
+ in the pattern:
+
+ /FOO[xX]A.*B[xX]BAR/
+
+ Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
+ strings (because they follow a .* construct). study_chunk will identify
+ both FOO and BAR as being the longest fixed and floating strings respectively.
+
+ The strings can be composites, for instance
+
+ /(f)(o)(o)/
+
+ will result in a composite fixed substring 'foo'.
+
+ For each string some basic information is maintained:
+
+ - offset or min_offset
+ This is the position the string must appear at, or not before.
+ It also implicitly (when combined with minlenp) tells us how many
+ character must match before the string we are searching.
+ Likewise when combined with minlenp and the length of the string
+ tells us how many characters must appear after the string we have
+ found.
+
+ - max_offset
+ Only used for floating strings. This is the rightmost point that
+ the string can appear at. Ifset to I32 max it indicates that the
+ string can occur infinitely far to the right.
+
+ - minlenp
+ A pointer to the minimum length of the pattern that the string
+ was found inside. This is important as in the case of positive
+ lookahead or positive lookbehind we can have multiple patterns
+ involved. Consider
+
+ /(?=FOO).*F/
+
+ The minimum length of the pattern overall is 3, the minimum length
+ of the lookahead part is 3, but the minimum length of the part that
+ will actually match is 1. So 'FOO's minimum length is 3, but the
+ minimum length for the F is 1. This is important as the minimum length
+ is used to determine offsets in front of and behind the string being
+ looked for. Since strings can be composites this is the length of the
+ pattern at the time it was commited with a scan_commit. Note that
+ the length is calculated by study_chunk, so that the minimum lengths
+ are not known until the full pattern has been compiled, thus the
+ pointer to the value.
+
+ - lookbehind
+
+ In the case of lookbehind the string being searched for can be
+ offset past the start point of the final matching string.
+ If this value was just blithely removed from the min_offset it would
+ invalidate some of the calculations for how many chars must match
+ before or after (as they are derived from min_offset and minlen and
+ the length of the string being searched for).
+ When the final pattern is compiled and the data is moved from the
+ scan_data_t structure into the regexp structure the information
+ about lookbehind is factored in, with the information that would
+ have been lost precalculated in the end_shift field for the
+ associated string.
+
+ The fields pos_min and pos_delta are used to store the minimum offset
+ and the delta to the maximum offset at the current point in the pattern.
+
+*/
typedef struct scan_data_t {
- I32 len_min;
- I32 len_delta;
+ /*I32 len_min; unused */
+ /*I32 len_delta; unused */
I32 pos_min;
I32 pos_delta;
SV *last_found;
- I32 last_end; /* min value, <0 unless valid. */
+ I32 last_end; /* min value, <0 unless valid. */
I32 last_start_min;
I32 last_start_max;
- SV **longest; /* Either &l_fixed, or &l_float. */
- SV *longest_fixed;
- I32 offset_fixed;
- SV *longest_float;
- I32 offset_float_min;
- I32 offset_float_max;
+ SV **longest; /* Either &l_fixed, or &l_float. */
+ SV *longest_fixed; /* longest fixed string found in pattern */
+ I32 offset_fixed; /* offset where it starts */
+ I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
+ I32 lookbehind_fixed; /* is the position of the string modfied by LB */
+ SV *longest_float; /* longest floating string found in pattern */
+ I32 offset_float_min; /* earliest point in string it can appear */
+ I32 offset_float_max; /* latest point in string it can appear */
+ I32 *minlen_float; /* pointer to the minlen relevent to the string */
+ I32 lookbehind_float; /* is the position of the string modified by LB */
I32 flags;
I32 whilem_c;
I32 *last_closep;
@@ -202,7 +281,7 @@ typedef struct scan_data_t {
*/
static const scan_data_t zero_scan_data =
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
#define SF_BEFORE_SEOL 0x0001
@@ -443,6 +522,39 @@ static const scan_data_t zero_scan_data =
#define EXPERIMENTAL_INPLACESCAN
#endif
+#define DEBUG_STUDYDATA(data,depth) \
+DEBUG_OPTIMISE_r(if(data){ \
+ PerlIO_printf(Perl_debug_log, \
+ "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
+ " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
+ (int)(depth)*2, "", \
+ (IV)((data)->pos_min), \
+ (IV)((data)->pos_delta), \
+ (IV)((data)->flags), \
+ (IV)((data)->whilem_c), \
+ (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
+ ); \
+ if ((data)->last_found) \
+ PerlIO_printf(Perl_debug_log, \
+ "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
+ " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
+ SvPVX_const((data)->last_found), \
+ (IV)((data)->last_end), \
+ (IV)((data)->last_start_min), \
+ (IV)((data)->last_start_max), \
+ ((data)->longest && \
+ (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
+ SvPVX_const((data)->longest_fixed), \
+ (IV)((data)->offset_fixed), \
+ ((data)->longest && \
+ (data)->longest==&((data)->longest_float)) ? "*" : "", \
+ SvPVX_const((data)->longest_float), \
+ (IV)((data)->offset_float_min), \
+ (IV)((data)->offset_float_max) \
+ ); \
+ PerlIO_printf(Perl_debug_log,"\n"); \
+});
+
static void clear_re(pTHX_ void *r);
/* Mark that we cannot extend a found fixed substring at this point.
@@ -450,10 +562,11 @@ static void clear_re(pTHX_ void *r);
floating substrings if needed. */
STATIC void
-S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
+S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
{
const STRLEN l = CHR_SVLEN(data->last_found);
const STRLEN old_l = CHR_SVLEN(*data->longest);
+ GET_RE_DEBUG_FLAGS_DECL;
if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
SvSetMagicSV(*data->longest, data->last_found);
@@ -464,6 +577,8 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
else
data->flags &= ~SF_FIX_BEFORE_EOL;
+ data->minlen_fixed=minlenp;
+ data->lookbehind_fixed=0;
}
else {
data->offset_float_min = l ? data->last_start_min : data->pos_min;
@@ -477,6 +592,8 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
else
data->flags &= ~SF_FL_BEFORE_EOL;
+ data->minlen_float=minlenp;
+ data->lookbehind_float=0;
}
}
SvCUR_set(data->last_found, 0);
@@ -490,6 +607,7 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
}
data->last_end = -1;
data->flags &= ~SF_BEFORE_EOL;
+ DEBUG_STUDYDATA(data,0);
}
/* Can match anything (initialization) */
@@ -1784,7 +1902,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
jumper = last;
/* XXXX */
if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
- ((char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
+ ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
{
OP( convert ) = TRIEC;
Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
@@ -1954,6 +2072,10 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode
Next ? (REG_NODE_NUM(Next)) : 0 ); \
});
+
+
+
+
#define JOIN_EXACT(scan,min,flags) \
if (PL_regkind[OP(scan)] == EXACT) \
join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
@@ -2109,7 +2231,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
STATIC I32
-S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
+S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
+ I32 *minlenp, I32 *deltap,
regnode *last, scan_data_t *data, U32 flags, U32 depth)
/* scanp: Start here (read-write). */
/* deltap: Write maxlen-minlen here. */
@@ -2140,8 +2263,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
while (scan && OP(scan) != END && scan < last) {
/* Peephole optimizer: */
+ DEBUG_STUDYDATA(data,depth);
DEBUG_PEEP("Peep",scan,depth);
-
JOIN_EXACT(scan,&min,0);
/* Follow the next-chain of the current node and optimize
@@ -2186,7 +2309,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
regnode * const startbranch=scan;
if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
- scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
+ scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
if (flags & SCF_DO_STCLASS)
cl_init_zero(pRExC_state, &accum);
@@ -2215,7 +2338,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
f |= SCF_WHILEM_VISITED_POS;
/* we suppose the run is continuous, last=next...*/
- minnext = study_chunk(pRExC_state, &scan, &deltanext,
+ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
next, &data_fake, f,depth+1);
if (min1 > minnext)
min1 = minnext;
@@ -2536,7 +2659,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
/* Search for fixed substrings supports EXACT only. */
if (flags & SCF_DO_SUBSTR) {
assert(data);
- scan_commit(pRExC_state, data);
+ scan_commit(pRExC_state, data, minlenp);
}
if (UTF) {
const U8 * const s = (U8 *)STRING(scan);
@@ -2615,7 +2738,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
is_inf = is_inf_internal = 1;
scan = regnext(scan);
if (flags & SCF_DO_SUBSTR) {
- scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
+ scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
data->longest = &(data->longest_float);
}
goto optimize_curly_tail;
@@ -2631,7 +2754,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
next_is_eval = (OP(scan) == EVAL);
do_curly:
if (flags & SCF_DO_SUBSTR) {
- if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
+ if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
pos_before = data->pos_min;
}
if (data) {
@@ -2657,7 +2780,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
f &= ~SCF_WHILEM_VISITED_POS;
/* This will finish on WHILEM, setting scan, or on NULL: */
- minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
+ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
(mincount == 0
? (f & ~SCF_DO_SUBSTR) : f),depth+1);
@@ -2795,7 +2918,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
}
#endif
/* Optimize again: */
- study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
+ study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
NULL, 0,depth+1);
}
else
@@ -2890,7 +3013,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
if (mincount != maxcount) {
/* Cannot extend fixed substrings found inside
the group. */
- scan_commit(pRExC_state,data);
+ scan_commit(pRExC_state,data,minlenp);
if (mincount && last_str) {
SV * const sv = data->last_found;
MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
@@ -2922,7 +3045,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
continue;
default: /* REF and CLUMP only? */
if (flags & SCF_DO_SUBSTR) {
- scan_commit(pRExC_state,data); /* Cannot expect anything... */
+ scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
data->longest = &(data->longest_float);
}
is_inf = is_inf_internal = 1;
@@ -2936,7 +3059,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
int value = 0;
if (flags & SCF_DO_SUBSTR) {
- scan_commit(pRExC_state,data);
+ scan_commit(pRExC_state,data,minlenp);
data->pos_min++;
}
min++;
@@ -3141,53 +3264,160 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
/* Lookbehind, or need to calculate parens/evals/stclass: */
&& (scan->flags || data || (flags & SCF_DO_STCLASS))
&& (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
- /* Lookahead/lookbehind */
- I32 deltanext, minnext, fake = 0;
- regnode *nscan;
- struct regnode_charclass_class intrnl;
- int f = 0;
-
- data_fake.flags = 0;
- if (data) {
- data_fake.whilem_c = data->whilem_c;
- data_fake.last_closep = data->last_closep;
- }
- else
- data_fake.last_closep = &fake;
- if ( flags & SCF_DO_STCLASS && !scan->flags
- && OP(scan) == IFMATCH ) { /* Lookahead */
- cl_init(pRExC_state, &intrnl);
- data_fake.start_class = &intrnl;
- f |= SCF_DO_STCLASS_AND;
- }
- if (flags & SCF_WHILEM_VISITED_POS)
- f |= SCF_WHILEM_VISITED_POS;
- next = regnext(scan);
- nscan = NEXTOPER(NEXTOPER(scan));
- minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
- if (scan->flags) {
- if (deltanext) {
- vFAIL("Variable length lookbehind not implemented");
+ if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
+ || OP(scan) == UNLESSM )
+ {
+ /* Negative Lookahead/lookbehind
+ In this case we can't do fixed string optimisation.
+ */
+
+ I32 deltanext, minnext, fake = 0;
+ regnode *nscan;
+ struct regnode_charclass_class intrnl;
+ int f = 0;
+
+ data_fake.flags = 0;
+ if (data) {
+ data_fake.whilem_c = data->whilem_c;
+ data_fake.last_closep = data->last_closep;
}
- else if (minnext > (I32)U8_MAX) {
- vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
+ else
+ data_fake.last_closep = &fake;
+ if ( flags & SCF_DO_STCLASS && !scan->flags
+ && OP(scan) == IFMATCH ) { /* Lookahead */
+ cl_init(pRExC_state, &intrnl);
+ data_fake.start_class = &intrnl;
+ f |= SCF_DO_STCLASS_AND;
}
- scan->flags = (U8)minnext;
- }
- if (data) {
- if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
- pars++;
- if (data_fake.flags & SF_HAS_EVAL)
- data->flags |= SF_HAS_EVAL;
- data->whilem_c = data_fake.whilem_c;
+ if (flags & SCF_WHILEM_VISITED_POS)
+ f |= SCF_WHILEM_VISITED_POS;
+ next = regnext(scan);
+ nscan = NEXTOPER(NEXTOPER(scan));
+ minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
+ if (scan->flags) {
+ if (deltanext) {
+ vFAIL("Variable length lookbehind not implemented");
+ }
+ else if (minnext > (I32)U8_MAX) {
+ vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
+ }
+ scan->flags = (U8)minnext;
+ }
+ if (data) {
+ if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
+ pars++;
+ if (data_fake.flags & SF_HAS_EVAL)
+ data->flags |= SF_HAS_EVAL;
+ data->whilem_c = data_fake.whilem_c;
+ }
+ if (f & SCF_DO_STCLASS_AND) {
+ const int was = (data->start_class->flags & ANYOF_EOS);
+
+ cl_and(data->start_class, &intrnl);
+ if (was)
+ data->start_class->flags |= ANYOF_EOS;
+ }
}
- if (f & SCF_DO_STCLASS_AND) {
- const int was = (data->start_class->flags & ANYOF_EOS);
+#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
+ else {
+ /* Positive Lookahead/lookbehind
+ In this case we can do fixed string optimisation,
+ but we must be careful about it. Note in the case of
+ lookbehind the positions will be offset by the minimum
+ length of the pattern, something we won't know about
+ until after the recurse.
+ */
+ I32 deltanext, fake = 0;
+ regnode *nscan;
+ struct regnode_charclass_class intrnl;
+ int f = 0;
+ /* We use SAVEFREEPV so that when the full compile
+ is finished perl will clean up the allocated
+ minlens when its all done. This was we don't
+ have to worry about freeing them when we know
+ they wont be used, which would be a pain.
+ */
+ I32 *minnextp;
+ Newx( minnextp, 1, I32 );
+ SAVEFREEPV(minnextp);
+
+ if (data) {
+ StructCopy(data, &data_fake, scan_data_t);
+ if ((flags & SCF_DO_SUBSTR) && data->last_found) {
+ f |= SCF_DO_SUBSTR;
+ if (scan->flags)
+ scan_commit(pRExC_state, &data_fake,minlenp);
+ data_fake.last_found=newSVsv(data->last_found);
+ }
+ }
+ else
+ data_fake.last_closep = &fake;
+ data_fake.flags = 0;
+ if (is_inf)
+ data_fake.flags |= SF_IS_INF;
+ if ( flags & SCF_DO_STCLASS && !scan->flags
+ && OP(scan) == IFMATCH ) { /* Lookahead */
+ cl_init(pRExC_state, &intrnl);
+ data_fake.start_class = &intrnl;
+ f |= SCF_DO_STCLASS_AND;
+ }
+ if (flags & SCF_WHILEM_VISITED_POS)
+ f |= SCF_WHILEM_VISITED_POS;
+ next = regnext(scan);
+ nscan = NEXTOPER(NEXTOPER(scan));
+
+ *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
+ if (scan->flags) {
+ if (deltanext) {
+ vFAIL("Variable length lookbehind not implemented");
+ }
+ else if (*minnextp > (I32)U8_MAX) {
+ vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
+ }
+ scan->flags = (U8)*minnextp;
+ }
+
+ *minnextp += min;
+
+
+ if (f & SCF_DO_STCLASS_AND) {
+ const int was = (data->start_class->flags & ANYOF_EOS);
+
+ cl_and(data->start_class, &intrnl);
+ if (was)
+ data->start_class->flags |= ANYOF_EOS;
+ }
+ if (data) {
+ if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
+ pars++;
+ if (data_fake.flags & SF_HAS_EVAL)
+ data->flags |= SF_HAS_EVAL;
+ data->whilem_c = data_fake.whilem_c;
+ if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
+ if (RExC_rx->minlen<*minnextp)
+ RExC_rx->minlen=*minnextp;
+ scan_commit(pRExC_state, &data_fake, minnextp);
+ SvREFCNT_dec(data_fake.last_found);
+
+ if ( data_fake.minlen_fixed != minlenp )
+ {
+ data->offset_fixed= data_fake.offset_fixed;
+ data->minlen_fixed= data_fake.minlen_fixed;
+ data->lookbehind_fixed+= scan->flags;
+ }
+ if ( data_fake.minlen_float != minlenp )
+ {
+ data->minlen_float= data_fake.minlen_float;
+ data->offset_float_min=data_fake.offset_float_min;
+ data->offset_float_max=data_fake.offset_float_max;
+ data->lookbehind_float+= scan->flags;
+ }
+ }
+ }
+
- cl_and(data->start_class, &intrnl);
- if (was)
- data->start_class->flags |= ANYOF_EOS;
}
+#endif
}
else if (OP(scan) == OPEN) {
pars++;
@@ -3208,7 +3438,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
}
else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
if (flags & SCF_DO_SUBSTR) {
- scan_commit(pRExC_state,data);
+ scan_commit(pRExC_state,data,minlenp);
data->longest = &(data->longest_float);
}
is_inf = is_inf_internal = 1;
@@ -3228,7 +3458,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
struct regnode_charclass_class accum;
if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
- scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
+ scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
if (flags & SCF_DO_STCLASS)
cl_init_zero(pRExC_state, &accum);
@@ -3268,7 +3498,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
it. Note this means we need the vestigal unused branches
even though they arent otherwise used.
*/
- minnext = study_chunk(pRExC_state, &scan, &deltanext,
+ minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
(regnode *)nextbranch, &data_fake, f,depth+1);
}
if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
@@ -3337,7 +3567,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
delta += (trie->maxlen - trie->minlen);
flags &= ~SCF_DO_STCLASS; /* xxx */
if (flags & SCF_DO_SUBSTR) {
- scan_commit(pRExC_state,data); /* Cannot expect anything... */
+ scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
data->pos_min += trie->minlen;
data->pos_delta += (trie->maxlen - trie->minlen);
if (trie->maxlen != trie->minlen)
@@ -3371,6 +3601,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
cl_and(data->start_class, &and_with);
if (flags & SCF_TRIE_RESTUDY)
data->flags |= SCF_TRIE_RESTUDY;
+
+ DEBUG_STUDYDATA(data,depth);
+
return min;
}
@@ -3581,7 +3814,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
Newx(r->substrs, 1, struct reg_substr_data);
reStudy:
- minlen=sawplus=sawopen=0;
+ r->minlen = minlen = sawplus = sawopen = 0;
Zero(r->substrs, 1, struct reg_substr_data);
StructCopy(&zero_scan_data, &data, scan_data_t);
@@ -3589,12 +3822,11 @@ reStudy:
if ( restudied ) {
DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
RExC_state=copyRExC_state;
- if (data.longest_fixed)
+ if (data.last_found) {
SvREFCNT_dec(data.longest_fixed);
- if (data.longest_float)
SvREFCNT_dec(data.longest_float);
- if (data.last_found)
SvREFCNT_dec(data.last_found);
+ }
} else {
copyRExC_state=RExC_state;
}
@@ -3609,7 +3841,8 @@ reStudy:
r->reganch |= ROPT_NAUGHTY;
scan = r->program + 1; /* First BRANCH. */
- /* XXXX Should not we check for something else? Usually it is OPEN1... */
+ /* testing for BRANCH here tells us whether there is "must appear"
+ data in the pattern. If there is then we can use it for optimisations */
if (OP(scan) != BRANCH) { /* Only one top-level choice. */
I32 fake;
STRLEN longest_float_length, longest_fixed_length;
@@ -3668,6 +3901,7 @@ reStudy:
StructCopy(first,trieop,struct regnode_charclass);
trie_op=(regnode *)trieop;
}
+ OP(trie_op)+=2;
make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
r->regstclass = trie_op;
}
@@ -3750,7 +3984,7 @@ reStudy:
stclass_flag = 0;
data.last_closep = &last_close;
- minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
+ minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
&data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
@@ -3762,21 +3996,28 @@ reStudy:
&& !RExC_seen_zerolen
&& (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
r->reganch |= ROPT_CHECK_ALL;
- scan_commit(pRExC_state, &data);
+ scan_commit(pRExC_state, &data,&minlen);
SvREFCNT_dec(data.last_found);
+ /* Note that code very similar to this but for anchored string
+ follows immediately below, changes may need to be made to both.
+ Be careful.
+ */
longest_float_length = CHR_SVLEN(data.longest_float);
if (longest_float_length
|| (data.flags & SF_FL_BEFORE_EOL
&& (!(data.flags & SF_FL_BEFORE_MEOL)
- || (RExC_flags & PMf_MULTILINE)))) {
- int t;
+ || (RExC_flags & PMf_MULTILINE))))
+ {
+ int t,ml;
- if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
+ if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
&& data.offset_fixed == data.offset_float_min
&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
goto remove_float; /* As in (a)+. */
+ /* copy the information about the longest float from the reg_scan_data
+ over to the program. */
if (SvUTF8(data.longest_float)) {
r->float_utf8 = data.longest_float;
r->float_substr = NULL;
@@ -3784,8 +4025,20 @@ reStudy:
r->float_substr = data.longest_float;
r->float_utf8 = NULL;
}
- r->float_min_offset = data.offset_float_min;
+ /* float_end_shift is how many chars that must be matched that
+ follow this item. We calculate it ahead of time as once the
+ lookbehind offset is added in we lose the ability to correctly
+ calculate it.*/
+ ml = data.minlen_float ? *(data.minlen_float)
+ : longest_float_length;
+ r->float_end_shift = ml - data.offset_float_min
+ - longest_float_length + (SvTAIL(data.longest_float) != 0)
+ + data.lookbehind_float;
+ r->float_min_offset = data.offset_float_min - data.lookbehind_float;
r->float_max_offset = data.offset_float_max;
+ if (data.offset_float_max < (U32)I32_MAX) /* Don't offset infinity */
+ r->float_max_offset -= data.lookbehind_float;
+
t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
&& (!(data.flags & SF_FL_BEFORE_MEOL)
|| (RExC_flags & PMf_MULTILINE)));
@@ -3798,13 +4051,20 @@ reStudy:
longest_float_length = 0;
}
+ /* Note that code very similar to this but for floating string
+ is immediately above, changes may need to be made to both.
+ Be careful.
+ */
longest_fixed_length = CHR_SVLEN(data.longest_fixed);
if (longest_fixed_length
|| (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
&& (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (RExC_flags & PMf_MULTILINE)))) {
- int t;
+ || (RExC_flags & PMf_MULTILINE))))
+ {
+ int t,ml;
+ /* copy the information about the longest fixed
+ from the reg_scan_data over to the program. */
if (SvUTF8(data.longest_fixed)) {
r->anchored_utf8 = data.longest_fixed;
r->anchored_substr = NULL;
@@ -3812,7 +4072,17 @@ reStudy:
r->anchored_substr = data.longest_fixed;
r->anchored_utf8 = NULL;
}
- r->anchored_offset = data.offset_fixed;
+ /* fixed_end_shift is how many chars that must be matched that
+ follow this item. We calculate it ahead of time as once the
+ lookbehind offset is added in we lose the ability to correctly
+ calculate it.*/
+ ml = data.minlen_fixed ? *(data.minlen_fixed)
+ : longest_fixed_length;
+ r->anchored_end_shift = ml - data.offset_fixed
+ - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
+ + data.lookbehind_fixed;
+ r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
+
t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
&& (!(data.flags & SF_FIX_BEFORE_MEOL)
|| (RExC_flags & PMf_MULTILINE)));
@@ -3849,6 +4119,7 @@ reStudy:
/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
if (longest_fixed_length > longest_float_length) {
+ r->check_end_shift = r->anchored_end_shift;
r->check_substr = r->anchored_substr;
r->check_utf8 = r->anchored_utf8;
r->check_offset_min = r->check_offset_max = r->anchored_offset;
@@ -3856,10 +4127,11 @@ reStudy:
r->reganch |= ROPT_NOSCAN;
}
else {
+ r->check_end_shift = r->float_end_shift;
r->check_substr = r->float_substr;
r->check_utf8 = r->float_utf8;
- r->check_offset_min = data.offset_float_min;
- r->check_offset_max = data.offset_float_max;
+ r->check_offset_min = r->float_min_offset;
+ r->check_offset_max = r->float_max_offset;
}
/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
This should be changed ASAP! */
@@ -3868,6 +4140,12 @@ reStudy:
if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
r->reganch |= RE_INTUIT_TAIL;
}
+ /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
+ if ( (STRLEN)minlen < longest_float_length )
+ minlen= longest_float_length;
+ if ( (STRLEN)minlen < longest_fixed_length )
+ minlen= longest_fixed_length;
+ */
}
else {
/* Several toplevels. Best we can is to set minlen. */
@@ -3882,7 +4160,7 @@ reStudy:
data.start_class = &ch_class;
data.last_closep = &last_close;
- minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
+ minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
&data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
CHECK_RESTUDY_GOTO;
@@ -3909,7 +4187,11 @@ reStudy:
}
}
- r->minlen = minlen;
+ /* Guard against an embedded (?=) or (?<=) with a longer minlen than
+ the "real" pattern. */
+ if (r->minlen < minlen)
+ r->minlen = minlen;
+
if (RExC_seen & REG_SEEN_GPOS)
r->reganch |= ROPT_GPOS_SEEN;
if (RExC_seen & REG_SEEN_LOOKBEHIND)
@@ -6633,7 +6915,7 @@ Perl_regdump(pTHX_ const regexp *r)
if (r->regstclass) {
regprop(r, sv, r->regstclass);
- PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
+ PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
}
if (r->reganch & ROPT_ANCH) {
PerlIO_printf(Perl_debug_log, "anchored");
@@ -6672,6 +6954,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
#ifdef DEBUGGING
dVAR;
register int k;
+ GET_RE_DEBUG_FLAGS_DECL;
sv_setpvn(sv, "", 0);
if (OP(o) >= reg_num) /* regnode.type is unsigned */
@@ -6697,9 +6980,54 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
);
Perl_sv_catpvf(aTHX_ sv, " %s", s );
} else if (k == TRIE) {
- Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
/* print the details of the trie in dumpuntil instead, as
* prog->data isn't available here */
+ const char op = OP(o);
+ const I32 n = ARG(o);
+ const reg_ac_data * const ac = IS_TRIE_AC(op) ?
+ (reg_ac_data *)prog->data->data[n] :
+ NULL;
+ const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
+ (reg_trie_data*)prog->data->data[n] :
+ ac->trie;
+
+ Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
+ DEBUG_TRIE_COMPILE_r(
+ Perl_sv_catpvf(aTHX_ sv,
+ "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
+ (UV)trie->startstate,
+ (IV)trie->laststate-1,
+ (UV)trie->wordcount,
+ (UV)trie->minlen,
+ (UV)trie->maxlen,
+ (UV)TRIE_CHARCOUNT(trie),
+ (UV)trie->uniquecharcount
+ )
+ );
+ if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
+ int i;
+ int rangestart = -1;
+ U8* bitmap = IS_ANYOF_TRIE(op) ? ANYOF_BITMAP(o) : TRIE_BITMAP(trie);
+ Perl_sv_catpvf(aTHX_ sv, "[");
+ for (i = 0; i <= 256; i++) {
+ if (i < 256 && BITMAP_TEST(bitmap,i)) {
+ if (rangestart == -1)
+ rangestart = i;
+ } else if (rangestart != -1) {
+ if (i <= rangestart + 3)
+ for (; rangestart < i; rangestart++)
+ put_byte(sv, rangestart);
+ else {
+ put_byte(sv, rangestart);
+ sv_catpvs(sv, "-");
+ put_byte(sv, i - 1);
+ }
+ rangestart = -1;
+ }
+ }
+ Perl_sv_catpvf(aTHX_ sv, "]");
+ }
+
} else if (k == CURLY) {
if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
@@ -7214,7 +7542,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
else
PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
- if (PL_regkind[(U8)op] != TRIE)
+ /*if (PL_regkind[(U8)op] != TRIE)*/
(void)PerlIO_putc(Perl_debug_log, '\n');
}
@@ -7235,51 +7563,17 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
DUMPUNTIL(NEXTOPER(node), next);
}
else if ( PL_regkind[(U8)op] == TRIE ) {
+ const char op = OP(node);
const I32 n = ARG(node);
- const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
+ const reg_ac_data * const ac = op>=AHOCORASICK ?
+ (reg_ac_data *)r->data->data[n] :
+ NULL;
+ const reg_trie_data * const trie = op<AHOCORASICK ?
+ (reg_trie_data*)r->data->data[n] :
+ ac->trie;
const regnode *nextbranch= NULL;
I32 word_idx;
-
- DEBUG_TRIE_COMPILE_r(
- PerlIO_printf(Perl_debug_log,
- " S:%"UVuf"/%"IVdf" W:%d L:%d/%d C:%d/%d ",
- (UV)trie->startstate,
- (IV)trie->laststate-1,
- (int)trie->wordcount,
- (int)trie->minlen,
- (int)trie->maxlen,
- (int)TRIE_CHARCOUNT(trie),
- trie->uniquecharcount
- );
- );
- if ( op==TRIEC || trie->bitmap ) {
- int i;
- int rangestart = -1;
- U8* bitmap = op==TRIEC ? (U8*)ANYOF_BITMAP(node) : (U8*)TRIE_BITMAP(trie);
-
- sv_setpvn(sv, "", 0);
- for (i = 0; i <= 256; i++) {
- if (i < 256 && BITMAP_TEST(bitmap,i)) {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++)
- put_byte(sv, rangestart);
- else {
- put_byte(sv, rangestart);
- sv_catpvs(sv, "-");
- put_byte(sv, i - 1);
- }
- rangestart = -1;
- }
- }
- PerlIO_printf(Perl_debug_log, "[%s]\n", SvPVX_const(sv));
- } else
- PerlIO_printf(Perl_debug_log, "\n");
-
-
-
+ sv_setpvn(sv, "", 0);
for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
@@ -7347,7 +7641,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
#ifdef DEBUG_DUMPUNTIL
PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
#endif
- return last ? last : node;
+ return node;
}
#endif /* DEBUGGING */
diff --git a/regcomp.h b/regcomp.h
index b6f3617ccf..b4f549f64c 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -13,6 +13,7 @@ typedef OP OP_4tree; /* Will be redefined later. */
#define PERL_ENABLE_TRIE_OPTIMISATION 1
#define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1
+#define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 1
#define PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 0
/*
@@ -396,6 +397,7 @@ struct reg_substr_datum {
I32 max_offset;
SV *substr; /* non-utf8 variant */
SV *utf8_substr; /* utf8 variant */
+ I32 end_shift;
};
struct reg_substr_data {
@@ -405,14 +407,19 @@ struct reg_substr_data {
#define anchored_substr substrs->data[0].substr
#define anchored_utf8 substrs->data[0].utf8_substr
#define anchored_offset substrs->data[0].min_offset
+#define anchored_end_shift substrs->data[0].end_shift
+
#define float_substr substrs->data[1].substr
#define float_utf8 substrs->data[1].utf8_substr
#define float_min_offset substrs->data[1].min_offset
#define float_max_offset substrs->data[1].max_offset
+#define float_end_shift substrs->data[1].end_shift
+
#define check_substr substrs->data[2].substr
#define check_utf8 substrs->data[2].utf8_substr
#define check_offset_min substrs->data[2].min_offset
#define check_offset_max substrs->data[2].max_offset
+#define check_end_shift substrs->data[2].end_shift
@@ -503,6 +510,10 @@ typedef struct _reg_ac_data reg_ac_data;
#define TRIE_BITMAP_CLEAR(p,c) (TRIE_BITMAP_BYTE(p, c) &= ~ANYOF_BIT((U8)c))
#define TRIE_BITMAP_TEST(p, c) (TRIE_BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c))
+#define IS_ANYOF_TRIE(op) ((op)==TRIEC || (op)==AHOCORASICKC)
+#define IS_TRIE_AC(op) ((op)>=AHOCORASICK)
+
+
#define BITMAP_BYTE(p, c) (((U8*)p)[(((U8)(c)) >> 3) & 31])
#define BITMAP_TEST(p, c) (BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c))
diff --git a/regcomp.sym b/regcomp.sym
index f62b7bf73a..25da9f7397 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -8,11 +8,13 @@
# Add new regops to the end, and do not re-order the existing ops.
#
-# Exit points
+#* Exit points (0,1)
+
END END, no End of program.
SUCCEED END, no Return from a subroutine, basically.
-# Anchors:
+#* Anchors: (2..13)
+
BOL BOL, no Match "" at beginning of line.
MBOL BOL, no Same, assuming multiline.
SBOL BOL, no Same, assuming singleline.
@@ -26,7 +28,8 @@ NBOUND NBOUND, no Match "" at any word non-boundary
NBOUNDL NBOUND, no Match "" at any word non-boundary
GPOS GPOS, no Matches where last m//g left off.
-# [Special] alternatives
+#* [Special] alternatives: (14..30)
+
REG_ANY REG_ANY, no Match any one character (except newline).
SANY REG_ANY, no Match any one character.
CANY REG_ANY, no Match any one byte.
@@ -45,7 +48,9 @@ NDIGIT NDIGIT, no Match any non-numeric character
NDIGITL NDIGIT, no Match any non-numeric character in locale
CLUMP CLUMP, no Match any combining character sequence
-# BRANCH The set of branches constituting a single choice are hooked
+#* Alternation (31)
+
+# BRANCH The set of branches constituting a single choice are hooked
# together with their "next" pointers, since precedence prevents
# anything being concatenated to any individual branch. The
# "next" pointer of the last BRANCH in a choice points to the
@@ -55,21 +60,27 @@ CLUMP CLUMP, no Match any combining character sequence
#
BRANCH BRANCH, node Match this alternative, or the next...
+#*Back pointer (32)
+
# BACK Normal "next" pointers all implicitly point forward; BACK
# exists to make loop structures possible.
# not used
BACK BACK, no Match "", "next" ptr points backward.
-# Literals
+#*Literals (33..35)
+
EXACT EXACT, sv Match this string (preceded by length).
EXACTF EXACT, sv Match this string, folded (prec. by length).
EXACTFL EXACT, sv Match this string, folded in locale (w/len).
-# Do nothing
+#*Do nothing types (36..37)
+
NOTHING NOTHING,no Match empty string.
# A variant of above which delimits a group, thus stops optimizations
TAIL NOTHING,no Match empty string. Can jump here from outside.
+#*Loops (38..44)
+
# STAR,PLUS '?', and complex '*' and '+', are implemented as circular
# BRANCH structures using BACK. Simple cases (one character
# per match) are implemented with STAR and PLUS for speed
@@ -87,6 +98,8 @@ CURLYX CURLY, sv 2 Match this complex thing {n,m} times.
# This terminator creates a loop structure for CURLYX
WHILEM WHILEM, no Do curly processing and see if rest matches.
+#*Buffer related (45..49)
+
# OPEN,CLOSE,GROUPP ...are numbered at compile time.
OPEN OPEN, num 1 Mark this point in input as start of #n.
CLOSE CLOSE, num 1 Analogous to OPEN.
@@ -95,40 +108,57 @@ REF REF, num 1 Match some already matched string
REFF REF, num 1 Match already matched string, folded
REFFL REF, num 1 Match already matched string, folded in loc.
-# grouping assertions
+#*Grouping assertions (50..54)
+
IFMATCH BRANCHJ,off 1 2 Succeeds if the following matches.
UNLESSM BRANCHJ,off 1 2 Fails if the following matches.
SUSPEND BRANCHJ,off 1 1 "Independent" sub-RE.
IFTHEN BRANCHJ,off 1 1 Switch, should be preceeded by switcher .
GROUPP GROUPP, num 1 Whether the group matched.
-# Support for long RE
+#*Support for long RE (55..56)
+
LONGJMP LONGJMP,off 1 1 Jump far away.
BRANCHJ BRANCHJ,off 1 1 BRANCH with long offset.
-# The heavy worker
+#*The heavy worker (57..58)
+
EVAL EVAL, evl 1 Execute some Perl code.
-# Modifiers
+#*Modifiers (59..60)
+
MINMOD MINMOD, no Next operator is not greedy.
LOGICAL LOGICAL,no Next opcode should set the flag only.
-# This is not used yet
+# This is not used yet (61)
RENUM BRANCHJ,off 1 1 Group with independently numbered parens.
-# This is not really a node, but an optimized away piece of a "long" node.
-# To simplify debugging output, we mark it as if it were a node
-OPTIMIZED NOTHING,off Placeholder for dump.
+#*Trie Related (62..64)
+
+# Behave the same as A|LIST|OF|WORDS would. The '..C' variants have
+# inline charclass data (ascii only), the 'C' store it in the structure.
+# NOTE: the relative order of the TRIE-like regops is signifigant
-# Trie Related (behave the same as A|LIST|OF|WORDS would)
TRIE TRIE, trie 1 Match many EXACT(FL?)? at once. flags==type
TRIEC TRIE, trie charclass Same as TRIE, but with embedded charclass data
-# NEW STUFF HERE
+# For start classes, contains an added fail table.
+AHOCORASICK TRIE, trie 1 Aho Corasick stclass. flags==type
+AHOCORASICKC TRIE, trie charclass Same as AHOCORASICK, but with embedded charclass data
+
+# NEW STUFF ABOVE THIS LINE -- Please update counts below.
+
+#*Special Nodes (65, 66)
+
+# This is not really a node, but an optimized away piece of a "long" node.
+# To simplify debugging output, we mark it as if it were a node
+OPTIMIZED NOTHING,off Placeholder for dump.
+
# Special opcode with the property that no opcode in a compiled program
# will ever be of this type. Thus it can be used as a flag value that
# no other opcode has been seen. END is used similarly, in that an END
# node cant be optimized. So END implies "unoptimizable" and PSEUDO mean
# "not seen anything to optimize yet".
PSEUDO PSEUDO,off Pseudo opcode for internal use.
+
# NOTHING BELOW HERE \ No newline at end of file
diff --git a/regexec.c b/regexec.c
index d82b135f57..49ea3ea85e 100644
--- a/regexec.c
+++ b/regexec.c
@@ -356,7 +356,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
register SV *check;
char *strbeg;
char *t;
- const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
+ const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
I32 ml_anch;
register char *other_last = NULL; /* other substr checked before this */
char *check_at = NULL; /* check substr found at this pos */
@@ -374,7 +374,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
}
DEBUG_EXECUTE_r(
debug_start_match(prog, do_utf8, strpos, strend,
- "Guessing start of match for");
+ sv ? "Guessing start of match in sv for"
+ : "Guessing start of match in string for");
);
/* CHR_DIST() would be more correct here but it makes things slow. */
@@ -383,6 +384,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
"String too short... [re_intuit_start]\n"));
goto fail;
}
+
strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
PL_regeol = strend;
if (do_utf8) {
@@ -394,9 +396,9 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
to_byte_substr(prog);
check = prog->check_substr;
}
- if (check == &PL_sv_undef) {
+ if (check == &PL_sv_undef) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "Non-utf string cannot match utf check string\n"));
+ "Non-utf8 string cannot match utf8 check string\n"));
goto fail;
}
if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
@@ -419,6 +421,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
I32 slen;
s = HOP3c(strpos, prog->check_offset_min, strend);
+
if (SvTAIL(check)) {
slen = SvCUR(check); /* >= 1 */
@@ -448,8 +451,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
/* Match is anchored, but substr is not anchored wrt beg-of-str. */
s = strpos;
start_shift = prog->check_offset_min; /* okay to underestimate on CC */
- end_shift = prog->minlen - start_shift -
- CHR_SVLEN(check) + (SvTAIL(check) != 0);
+ end_shift = prog->check_end_shift;
+
if (!ml_anch) {
const I32 end = prog->check_offset_max + CHR_SVLEN(check)
- (SvTAIL(check) != 0);
@@ -462,20 +465,37 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
else { /* Can match at random position */
ml_anch = 0;
s = strpos;
- start_shift = prog->check_offset_min; /* okay to underestimate on CC */
- /* Should be nonnegative! */
- end_shift = prog->minlen - start_shift -
- CHR_SVLEN(check) + (SvTAIL(check) != 0);
+ start_shift = prog->check_offset_min; /* okay to underestimate on CC */
+ end_shift = prog->check_end_shift;
+
+ /* end shift should be non negative here */
}
#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
if (end_shift < 0)
- Perl_croak(aTHX_ "panic: end_shift");
+ Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
+ end_shift,prog->precomp);
#endif
restart:
/* Find a possible match in the region s..strend by looking for
the "check" substring in the region corrected by start/end_shift. */
+
+ {
+ I32 srch_start_shift = start_shift;
+ I32 srch_end_shift = end_shift;
+ if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
+ srch_end_shift -= ((strbeg - s) - srch_start_shift);
+ srch_start_shift = strbeg - s;
+ }
+ DEBUG_OPTIMISE_r({
+ PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
+ (IV)prog->check_offset_min,
+ (IV)srch_start_shift,
+ (IV)srch_end_shift,
+ (IV)prog->check_end_shift);
+ });
+
if (flags & REXEC_SCREAM) {
I32 p = -1; /* Internal iterator of scream. */
I32 * const pp = data ? data->scream_pos : &p;
@@ -485,7 +505,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
&& (BmPREVIOUS(check) == SvCUR(check) - 1)
&& SvTAIL(check) ))
s = screaminstr(sv, check,
- start_shift + (s - strbeg), end_shift, pp, 0);
+ srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
else
goto fail_finish;
/* we may be pointing at the wrong string */
@@ -494,15 +514,27 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
if (data)
*data->scream_olds = s;
}
- else if (prog->reganch & ROPT_CANY_SEEN)
- s = fbm_instr((U8*)(s + start_shift),
- (U8*)(strend - end_shift),
- check, multiline ? FBMrf_MULTILINE : 0);
- else
- s = fbm_instr(HOP3(s, start_shift, strend),
- HOP3(strend, -end_shift, strbeg),
+ else {
+ U8* start_point;
+ U8* end_point;
+ if (prog->reganch & ROPT_CANY_SEEN) {
+ start_point= (U8*)(s + srch_start_shift);
+ end_point= (U8*)(strend - srch_end_shift);
+ } else {
+ start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
+ end_point= HOP3(strend, -srch_end_shift, strbeg);
+ }
+ DEBUG_OPTIMISE_r({
+ PerlIO_printf(Perl_debug_log, "fbm_instr len=%"IVdf" str=<%.*s>\n",
+ (int)(end_point - start_point),
+ (int)(end_point - start_point),
+ start_point);
+ });
+
+ s = fbm_instr( start_point, end_point,
check, multiline ? FBMrf_MULTILINE : 0);
-
+ }
+ }
/* Update the count-of-usability, remove useless subpatterns,
unshift s. */
@@ -520,12 +552,20 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
if (!s)
goto fail_finish;
-
- check_at = s;
-
/* Finish the diagnostic message */
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
+ /* XXX dmq: first branch is for positive lookbehind...
+ Our check string is offset from the beginning of the pattern.
+ So we need to do any stclass tests offset forward from that
+ point. I think. :-(
+ */
+
+
+
+ check_at=s;
+
+
/* Got a candidate. Check MBOL anchoring, and the *other* substr.
Start with the other substr.
XXXX no SCREAM optimization yet - and a very coarse implementation
@@ -534,7 +574,9 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
Probably it is right to do no SCREAM here...
*/
- if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
+ if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
+ : (prog->float_substr && prog->anchored_substr))
+ {
/* Take into account the "other" substring. */
/* XXXX May be hopelessly wrong for UTF... */
if (!other_last)
@@ -561,12 +603,17 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
if (last < last1)
last1 = last;
- /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
+ /* XXXX It is not documented what units *_offsets are in.
+ We assume bytes, but this is clearly wrong.
+ Meaning this code needs to be carefully reviewed for errors.
+ dmq.
+ */
+
/* On end-of-str: see comment below. */
must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
if (must == &PL_sv_undef) {
s = (char*)NULL;
- DEBUG_EXECUTE_r(must = prog->anchored_utf8); /* for debug */
+ DEBUG_r(must = prog->anchored_utf8); /* for debug */
}
else
s = fbm_instr(
@@ -630,7 +677,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
and end-of-str is not later than strend we are OK. */
if (must == &PL_sv_undef) {
s = (char*)NULL;
- DEBUG_EXECUTE_r(must = prog->float_utf8); /* for debug message */
+ DEBUG_r(must = prog->float_utf8); /* for debug message */
}
else
s = fbm_instr((unsigned char*)s,
@@ -669,11 +716,26 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
}
}
- t = s - prog->check_offset_max;
+
+ t= HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
+
+ DEBUG_OPTIMISE_r(
+ PerlIO_printf(Perl_debug_log,
+ "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
+ (IV)prog->check_offset_min,
+ (IV)prog->check_offset_max,
+ (IV)(s-strpos),
+ (IV)(t-strpos),
+ (IV)(t-s),
+ (IV)(strend-strpos)
+ )
+ );
+
if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
&& (!do_utf8
- || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
- && t > strpos))) {
+ || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos))
+ && t > strpos)))
+ {
/* Fixed substring is found far enough so that the match
cannot start at strpos. */
try_at_offset:
@@ -782,6 +844,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
/* Last resort... */
/* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
+ /* trie stclasses are too expensive to use here, we are better off to
+ leave it to regmatch itself */
if (prog->regstclass && PL_regkind[OP(prog->regstclass)]!=TRIE) {
/* minlen == 0 is possible if regstclass is \b or \B,
and the fixed substr is ''$.
@@ -795,14 +859,17 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
const int cl_l = (PL_regkind[OP(prog->regstclass)] == EXACT
? CHR_DIST(str+STR_LEN(prog->regstclass), str)
: 1);
- const char * endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
- ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
- : (prog->float_substr || prog->float_utf8
- ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
- cl_l, strend)
- : strend);
- /*if (OP(prog->regstclass) == TRIE)
- endpos++;*/
+ char * endpos;
+ if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
+ endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
+ else if (prog->float_substr || prog->float_utf8)
+ endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
+ else
+ endpos= strend;
+
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
+ start_shift,check_at-strbeg,s-strbeg,endpos-strbeg));
+
t = s;
s = find_byclass(prog, prog->regstclass, s, endpos, NULL);
if (!s) {
@@ -1329,8 +1396,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
!isDIGIT_LC_utf8((U8*)s),
!isDIGIT_LC(*s)
);
- case TRIEC:
- case TRIE:
+ case AHOCORASICKC:
+ case AHOCORASICK:
{
const enum { trie_plain, trie_utf8, trie_utf8_fold }
trie_type = do_utf8 ?
@@ -1369,7 +1436,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
SvPOK_on(sv_points);
sv_2mortal(sv_points);
points=(U8**)SvPV_nolen(sv_points );
- if ( trie_type != trie_utf8_fold && (trie->bitmap || OP(c)==TRIEC) ) {
+ if ( trie_type != trie_utf8_fold
+ && (trie->bitmap || OP(c)==AHOCORASICKC) )
+ {
if (trie->bitmap)
bitmap=(U8*)trie->bitmap;
else
@@ -1412,19 +1481,21 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
int failed=0;
U32 word = aho->states[ state ].wordnum;
- if( state==1 && bitmap ) {
- DEBUG_TRIE_EXECUTE_r(
- if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
- dump_exec_pos( (char *)uc, c, strend, real_start,
- (char*)uc, do_utf8 );
- PerlIO_printf( Perl_debug_log,
- " Scanning for legal start char...\n");
+ if( state==1 ) {
+ if ( bitmap ) {
+ DEBUG_TRIE_EXECUTE_r(
+ if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+ dump_exec_pos( (char *)uc, c, strend, real_start,
+ (char *)uc, do_utf8 );
+ PerlIO_printf( Perl_debug_log,
+ " Scanning for legal start char...\n");
+ }
+ );
+ while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+ uc++;
}
- );
- while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
- uc++;
+ s= (char *)uc;
}
- s= (char *)uc;
if (uc >(U8*)last_start) break;
}
@@ -1585,14 +1656,20 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
reginfo.prog = prog;
RX_MATCH_UTF8_set(prog, do_utf8);
+ DEBUG_EXECUTE_r(
+ debug_start_match(prog, do_utf8, startpos, strend,
+ "Matching");
+ );
minlen = prog->minlen;
- if (strend - startpos < minlen) {
+
+ if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
"String too short [regexec_flags]...\n"));
goto phooey;
}
+
/* Check validity of program. */
if (UCHARAT(prog->program) != REG_MAGIC) {
Perl_croak(aTHX_ "corrupted regexp program");
@@ -1651,10 +1728,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
}
}
- DEBUG_EXECUTE_r(
- debug_start_match(prog, do_utf8, startpos, strend,
- "Matching");
- );
+
/* Simplest case: anchored match need be tried only once. */
/* [unless only anchor is BOL and multiline is set] */
@@ -1768,14 +1842,19 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
back_max = prog->float_max_offset;
back_min = prog->float_min_offset;
}
+
+
if (must == &PL_sv_undef)
/* could not downgrade utf8 check substring, so must fail */
goto phooey;
- last = HOP3c(strend, /* Cannot start after this */
- -(I32)(CHR_SVLEN(must)
- - (SvTAIL(must) != 0) + back_min), strbeg);
-
+ if (back_min<0) {
+ last = strend;
+ } else {
+ last = HOP3c(strend, /* Cannot start after this */
+ -(I32)(CHR_SVLEN(must)
+ - (SvTAIL(must) != 0) + back_min), strbeg);
+ }
if (s > PL_bostr)
last1 = HOPc(s, -1);
else
@@ -1788,9 +1867,9 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
strend = HOPc(strend, -dontbother);
while ( (s <= last) &&
((flags & REXEC_SCREAM)
- ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
+ ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
end_shift, &scream_pos, 0))
- : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
+ : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
(unsigned char*)strend, must,
multiline ? FBMrf_MULTILINE : 0))) ) {
/* we may be pointing at the wrong string */
@@ -2352,9 +2431,10 @@ S_debug_start_match(pTHX_ const regexp *prog, const bool do_utf8,
PL_colors[4], blurb, PL_colors[5], s0, s1);
if (do_utf8||utf8_pat)
- PerlIO_printf(Perl_debug_log, "UTF-8 %s...\n",
- !do_utf8 ? "pattern" : !utf8_pat ? "string" :
- "pattern and string"
+ PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
+ utf8_pat ? "pattern" : "",
+ utf8_pat && do_utf8 ? " and " : "",
+ do_utf8 ? "string" : ""
);
}
}
@@ -2396,15 +2476,16 @@ S_dump_exec_pos(pTHX_ const char *locinput,
const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
- (locinput - pref_len),pref0_len, pref0_len, 4, 5);
+ (locinput - pref_len),pref0_len, 60, 4, 5);
RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
(locinput - pref_len + pref0_len),
- pref_len - pref0_len, pref_len - pref0_len, 2, 3);
+ pref_len - pref0_len, 60, 2, 3);
RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
- locinput, loc_regeol - locinput, l, 0, 1);
+ locinput, loc_regeol - locinput, 10, 0, 1);
+ const STRLEN tlen=len0+len1+len2;
PerlIO_printf(Perl_debug_log,
"%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
(IV)(locinput - loc_bostr),
@@ -2412,7 +2493,7 @@ S_dump_exec_pos(pTHX_ const char *locinput,
len1, s1,
(docolor ? "" : "> <"),
len2, s2,
- 15 - l - pref_len + 1,
+ tlen > 19 ? 0 : 19 - tlen,
"");
}
}
@@ -2488,6 +2569,7 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
DEBUG_EXECUTE_r( {
SV * const prop = sv_newmortal();
+ regnode *rnext=regnext(scan);
DUMP_EXEC_POS( locinput, scan, do_utf8 );
regprop(rex, prop, scan);
@@ -2495,7 +2577,8 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
"%3"IVdf":%*s%s(%"IVdf")\n",
(IV)(scan - rex->program), PL_regindent*2, "",
SvPVX_const(prop),
- PL_regkind[OP(scan)] == END ? 0 : (IV)(regnext(scan) - rex->program));
+ (PL_regkind[OP(scan)] == END || !rnext) ?
+ 0 : (IV)(rnext - rex->program));
});
next = scan + NEXT_OFF(scan);
@@ -3332,11 +3415,6 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
}
/* run the pattern returned from (??{...}) */
- DEBUG_EXECUTE_r(
- debug_start_match(re, do_utf8, locinput, PL_regeol,
- "Matching embedded");
- );
-
ST.cp = regcppush(0); /* Save *all* the positions. */
REGCP_SET(ST.lastcp);
*PL_reglastparen = 0;
@@ -3354,6 +3432,10 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
rex = re;
ST.B = next;
+ DEBUG_EXECUTE_r(
+ debug_start_match(re, do_utf8, locinput, PL_regeol,
+ "Matching embedded");
+ );
/* now continue from first node in postoned RE */
PUSH_YES_STATE_GOTO(EVAL_A, re->program + 1);
/* NOTREACHED */
@@ -3383,7 +3465,6 @@ S_regmatch(pTHX_ const regmatch_info *reginfo, regnode *prog)
/* continue at the node following the (??{...}) */
scan = ST.B;
continue;
-
case EVAL_A_fail: /* unsuccessfully ran inner rex (??{rex}) */
/* Restore state to the outer re then re-throw the failure */
if (ST.toggleutf)
@@ -4443,7 +4524,6 @@ yes_final:
if (yes_state) {
/* we have successfully completed a subexpression, but we must now
* pop to the state marked by yes_state and continue from there */
-
assert(st != yes_state);
while (yes_state < SLAB_FIRST(PL_regmatch_slab)
|| yes_state > SLAB_LAST(PL_regmatch_slab))
@@ -4461,9 +4541,9 @@ yes_final:
PL_regmatch_state = st;
switch (st->resume_state) {
+ case EVAL_A:
case IFMATCH_A:
case CURLYM_A:
- case EVAL_A:
state_num = st->resume_state;
goto reenter_switch;
@@ -5064,21 +5144,43 @@ S_reghop3(U8 *s, I32 off, const U8* lim)
}
}
else {
- while (off++) {
- if (s > lim) {
- s--;
- if (UTF8_IS_CONTINUED(*s)) {
- while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
- s--;
- }
- /* XXX could check well-formedness here */
+ while (off++ && s > lim) {
+ s--;
+ if (UTF8_IS_CONTINUED(*s)) {
+ while (s > lim && UTF8_IS_CONTINUATION(*s))
+ s--;
}
+ /* XXX could check well-formedness here */
}
}
return s;
}
STATIC U8 *
+S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
+{
+ dVAR;
+ if (off >= 0) {
+ while (off-- && s < rlim) {
+ /* XXX could check well-formedness here */
+ s += UTF8SKIP(s);
+ }
+ }
+ else {
+ while (off++ && s > llim) {
+ s--;
+ if (UTF8_IS_CONTINUED(*s)) {
+ while (s > llim && UTF8_IS_CONTINUATION(*s))
+ s--;
+ }
+ /* XXX could check well-formedness here */
+ }
+ }
+ return s;
+}
+
+
+STATIC U8 *
S_reghopmaybe3(U8* s, I32 off, const U8* lim)
{
dVAR;
@@ -5091,17 +5193,13 @@ S_reghopmaybe3(U8* s, I32 off, const U8* lim)
return NULL;
}
else {
- while (off++) {
- if (s > lim) {
- s--;
- if (UTF8_IS_CONTINUED(*s)) {
- while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
- s--;
- }
- /* XXX could check well-formedness here */
+ while (off++ && s > lim) {
+ s--;
+ if (UTF8_IS_CONTINUED(*s)) {
+ while (s > lim && UTF8_IS_CONTINUATION(*s))
+ s--;
}
- else
- break;
+ /* XXX could check well-formedness here */
}
if (off <= 0)
return NULL;
diff --git a/regnodes.h b/regnodes.h
index d8e3006b08..01985f5a73 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -65,11 +65,13 @@
#define MINMOD 58 /* 0x3a Next operator is not greedy. */
#define LOGICAL 59 /* 0x3b Next opcode should set the flag only. */
#define RENUM 60 /* 0x3c Group with independently numbered parens. */
-#define OPTIMIZED 61 /* 0x3d Placeholder for dump. */
-#define TRIE 62 /* 0x3e Match many EXACT(FL?)? at once. flags==type */
-#define TRIEC 63 /* 0x3f Same as TRIE, but with embedded charclass data */
-#define PSEUDO 64 /* 0x40 Pseudo opcode for internal use. */
-#define REGNODE_MAX 64
+#define TRIE 61 /* 0x3d Match many EXACT(FL?)? at once. flags==type */
+#define TRIEC 62 /* 0x3e Same as TRIE, but with embedded charclass data */
+#define AHOCORASICK 63 /* 0x3f Aho Corasick stclass. flags==type */
+#define AHOCORASICKC 64 /* 0x40 Same as AHOCORASICK, but with embedded charclass data */
+#define OPTIMIZED 65 /* 0x41 Placeholder for dump. */
+#define PSEUDO 66 /* 0x42 Pseudo opcode for internal use. */
+#define REGNODE_MAX 66
#ifndef DOINIT
EXTCONST U8 PL_regkind[];
@@ -136,9 +138,11 @@ EXTCONST U8 PL_regkind[] = {
MINMOD, /* MINMOD */
LOGICAL, /* LOGICAL */
BRANCHJ, /* RENUM */
- NOTHING, /* OPTIMIZED */
TRIE, /* TRIE */
TRIE, /* TRIEC */
+ TRIE, /* AHOCORASICK */
+ TRIE, /* AHOCORASICKC */
+ NOTHING, /* OPTIMIZED */
PSEUDO, /* PSEUDO */
};
#endif
@@ -207,9 +211,11 @@ static const U8 regarglen[] = {
0, /* MINMOD */
0, /* LOGICAL */
EXTRA_SIZE(struct regnode_1), /* RENUM */
- 0, /* OPTIMIZED */
EXTRA_SIZE(struct regnode_1), /* TRIE */
EXTRA_SIZE(struct regnode_charclass), /* TRIEC */
+ EXTRA_SIZE(struct regnode_1), /* AHOCORASICK */
+ EXTRA_SIZE(struct regnode_charclass), /* AHOCORASICKC */
+ 0, /* OPTIMIZED */
0, /* PSEUDO */
};
@@ -275,9 +281,11 @@ static const char reg_off_by_arg[] = {
0, /* MINMOD */
0, /* LOGICAL */
1, /* RENUM */
- 0, /* OPTIMIZED */
0, /* TRIE */
0, /* TRIEC */
+ 0, /* AHOCORASICK */
+ 0, /* AHOCORASICKC */
+ 0, /* OPTIMIZED */
0, /* PSEUDO */
};
@@ -344,13 +352,15 @@ static const char * const reg_name[] = {
"MINMOD", /* 0x3a */
"LOGICAL", /* 0x3b */
"RENUM", /* 0x3c */
- "OPTIMIZED", /* 0x3d */
- "TRIE", /* 0x3e */
- "TRIEC", /* 0x3f */
- "PSEUDO", /* 0x40 */
+ "TRIE", /* 0x3d */
+ "TRIEC", /* 0x3e */
+ "AHOCORASICK", /* 0x3f */
+ "AHOCORASICKC", /* 0x40 */
+ "OPTIMIZED", /* 0x41 */
+ "PSEUDO", /* 0x42 */
};
-static const int reg_num = 65;
+static const int reg_num = 67;
#endif /* DEBUGGING */
#endif /* REG_COMP_C */
diff --git a/sv.c b/sv.c
index 82c5b235fc..53256b85a7 100644
--- a/sv.c
+++ b/sv.c
@@ -9508,6 +9508,7 @@ Perl_re_dup(pTHX_ const REGEXP *r, CLONE_PARAMS *param)
for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
s->min_offset = r->substrs->data[i].min_offset;
s->max_offset = r->substrs->data[i].max_offset;
+ s->end_shift = r->substrs->data[i].end_shift;
s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
}
diff --git a/t/op/re_tests b/t/op/re_tests
index f8ee725131..351414476d 100644
--- a/t/op/re_tests
+++ b/t/op/re_tests
@@ -990,3 +990,17 @@ X(?!b+(?!(c+)*(?!(c+)*d))).*X aXbbbbbbbcccccccccccccaaaX y - -
^([TUV]+|XXX|YYY|Z.Q*X|Z[TE]Q*P|[MKJ]): ZEQQQX: y $1 ZEQQQX
X(?:ABCF[cC]x*|ABCD|ABCF):(?:DIT|DID|DIM) XABCFCxxxxxxxxxx:DIM y $& XABCFCxxxxxxxxxx:DIM
(((ABCD|ABCE|ABCF)))(A|B|C[xy]*): ABCFCxxxxxxxxxx:DIM y $& ABCFCxxxxxxxxxx:
+(?=foo) foo y pos 0
+(?=foo) XfooY y pos 1
+.*(?=foo) XfooY y pos 1
+(?<=foo) foo y pos 3
+(?<=foo) XfooY y pos 4
+.*(?<=foo) foo y pos 3
+.*(?<=foo) XfooY y pos 4
+(?<=foo)Y XfooY y pos 5
+o(?<=foo)Y ..XfooY.. y pos 7
+X(?=foo)f ..XfooY.. y pos 4
+X(?=foo) ..XfooY.. y pos 3
+X(?<=foo.)[YZ] ..XfooXY.. y pos 8
+(?=XY*foo) Xfoo y pos 0
+^(?=XY*foo) Xfoo y pos 0
diff --git a/t/op/regexp.t b/t/op/regexp.t
index 662be92dab..678a46ad1d 100755
--- a/t/op/regexp.t
+++ b/t/op/regexp.t
@@ -72,23 +72,33 @@ while (<TESTS>) {
$skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//));
$reason = 'skipping $&' if $reason eq '' && $skip_amp;
$result =~ s/B//i unless $skip;
- for $study ('', 'study \$subject') {
+
+ for $study ('', 'study $subject') {
$c = $iters;
- if ($qr_embed) {
- eval qq"
- my \$RE = qr$pat;
- $study;
- \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
- \$got = \"$repl\";
- ";
- }
- else {
- eval qq"
- $study;
- \$match = (\$subject =~ $OP$pat) while \$c--;
- \$got = \"$repl\";
- ";
- }
+ if ($repl eq 'pos') {
+ $code= <<EOFCODE;
+ $study;
+ pos(\$subject)=0;
+ \$match = ( \$subject =~ m${pat}g );
+ \$got = pos(\$subject);
+EOFCODE
+ }
+ elsif ($qr_embed) {
+ $code= <<EOFCODE;
+ my \$RE = qr$pat;
+ $study;
+ \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--;
+ \$got = "$repl";
+EOFCODE
+ }
+ else {
+ $code= <<EOFCODE;
+ $study;
+ \$match = (\$subject =~ $OP$pat$addg) while \$c--;
+ \$got = "$repl";
+EOFCODE
+ }
+ eval $code;
chomp( $err = $@ );
if ($result eq 'c') {
if ($err !~ m!^\Q$expect!) { print "not ok $. (compile) $input => `$err'\n"; next TEST }
@@ -99,14 +109,14 @@ while (<TESTS>) {
next TEST;
}
elsif ($@) {
- print "not ok $. $input => error `$err'\n"; next TEST;
+ print "not ok $. $input => error `$err'\n$code\n$@\n"; next TEST;
}
elsif ($result eq 'n') {
if ($match) { print "not ok $. ($study) $input => false positive\n"; next TEST }
}
else {
if (!$match || $got ne $expect) {
- print "not ok $. ($study) $input => `$got', match=$match\n";
+ print "not ok $. ($study) $input => `$got', match=$match\n$code\n";
next TEST;
}
}