summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Mitchell <davem@iabyn.com>2014-02-08 15:41:39 +0000
committerDavid Mitchell <davem@iabyn.com>2014-02-08 15:41:39 +0000
commit388e7c71a000e894d45537b02795a849ec63d162 (patch)
treea06ef1a56f20c0e11e5f79e62d37ff3d0d1e0e7d
parent769e4861d3176ce691c41777140ccac986ca9a37 (diff)
parent4d0062494ca3897c363c7b032d8ad924495435b3 (diff)
downloadperl-388e7c71a000e894d45537b02795a849ec63d162.tar.gz
[MERGE] fix and refactor re_intuit_start()
Perl_re_intuit_start() is the main run-time optimising function for the regex engine. It tries to either quickly reject a match, or find a suitable starting point for the NFA. Unfortunately it is impenetrable code, with 13 labels and no large scale loop or other constructs, and has several severe performance issues with long utf8 strings. This series of commits attempts to fix the performance issues, audit the code for utf8 and other correctness, and refactor and simplify the code, as well as improve the documentation. In particular it fixes RT#120692. With gcc on x86_64, this branch decreases the binary size of the function by around 15%. Much of my work on this branch has been an iterative process of wondering why a piece of code is the way it is, adding some assertions and seeing what breaks in the test suite, then using that info to improve the code or documentation. This work isn't finished yet; in particular I haven't yet audited and refactored the stclass block of code towards the end of the function. I also have more refactorisations and some more more optimisations still to go, as well as general tidying up of the documentation.
-rw-r--r--embed.fnc2
-rw-r--r--embed.h4
-rw-r--r--ext/re/t/regop.t4
-rw-r--r--proto.h18
-rw-r--r--regcomp.c19
-rw-r--r--regcomp.h4
-rw-r--r--regexec.c842
-rw-r--r--regexp.h7
-rw-r--r--t/re/pat.t29
-rw-r--r--t/re/re_tests8
10 files changed, 542 insertions, 395 deletions
diff --git a/embed.fnc b/embed.fnc
index 64aa735f23..6f743e49af 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2169,10 +2169,8 @@ ERsn |U8* |reghop3 |NN U8 *s|SSize_t off|NN const U8 *lim
ERsM |SV* |core_regclass_swash|NULLOK const regexp *prog \
|NN const struct regnode *node|bool doinit \
|NULLOK SV **listsvp
-#ifdef XXX_dmq
ERsn |U8* |reghop4 |NN U8 *s|SSize_t off|NN const U8 *llim \
|NN const U8 *rlim
-#endif
ERsn |U8* |reghopmaybe3 |NN U8 *s|SSize_t off|NN const U8 *lim
ERs |char* |find_byclass |NN regexp * prog|NN const regnode *c \
|NN char *s|NN const char *strend \
diff --git a/embed.h b/embed.h
index 2a6064cbcb..d1224eb200 100644
--- a/embed.h
+++ b/embed.h
@@ -1004,6 +1004,7 @@
#define regcppop(a,b) S_regcppop(aTHX_ a,b)
#define regcppush(a,b,c) S_regcppush(aTHX_ a,b,c)
#define reghop3 S_reghop3
+#define reghop4 S_reghop4
#define reghopmaybe3 S_reghopmaybe3
#define reginclass(a,b,c,d,e) S_reginclass(aTHX_ a,b,c,d,e)
#define regmatch(a,b,c) S_regmatch(aTHX_ a,b,c)
@@ -1011,9 +1012,6 @@
#define regtry(a,b) S_regtry(aTHX_ a,b)
#define to_byte_substr(a) S_to_byte_substr(aTHX_ a)
#define to_utf8_substr(a) S_to_utf8_substr(aTHX_ a)
-# if defined(XXX_dmq)
-#define reghop4 S_reghop4
-# endif
# endif
# if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
#define _to_fold_latin1(a,b,c,d) Perl__to_fold_latin1(aTHX_ a,b,c,d)
diff --git a/ext/re/t/regop.t b/ext/re/t/regop.t
index 6eda0ab28c..76576b1213 100644
--- a/ext/re/t/regop.t
+++ b/ext/re/t/regop.t
@@ -98,7 +98,7 @@ matched empty string
Match successful!
Found floating substr "Y" at offset 1...
Found anchored substr "X" at offset 0...
-Guessed: match at offset 0
+Successfully guessed: match at offset 0
checking floating
minlen 2
S:1/6
@@ -121,7 +121,7 @@ foobar
checking anchored isall
minlen 6
anchored "foobar" at 0
-Guessed: match at offset 0
+Successfully guessed: match at offset 0
Compiling REx "[f][o][o][b][a][r]"
Freeing REx: "[f][o][o][b][a][r]"
%MATCHED%
diff --git a/proto.h b/proto.h
index 0115eab3ea..88e246a9b2 100644
--- a/proto.h
+++ b/proto.h
@@ -7175,6 +7175,14 @@ STATIC U8* S_reghop3(U8 *s, SSize_t off, const U8 *lim)
#define PERL_ARGS_ASSERT_REGHOP3 \
assert(s); assert(lim)
+STATIC U8* S_reghop4(U8 *s, SSize_t off, const U8 *llim, const U8 *rlim)
+ __attribute__warn_unused_result__
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(3)
+ __attribute__nonnull__(4);
+#define PERL_ARGS_ASSERT_REGHOP4 \
+ assert(s); assert(llim); assert(rlim)
+
STATIC U8* S_reghopmaybe3(U8 *s, SSize_t off, const U8 *lim)
__attribute__warn_unused_result__
__attribute__nonnull__(1)
@@ -7224,16 +7232,6 @@ STATIC void S_to_utf8_substr(pTHX_ regexp * prog)
#define PERL_ARGS_ASSERT_TO_UTF8_SUBSTR \
assert(prog)
-# if defined(XXX_dmq)
-STATIC U8* S_reghop4(U8 *s, SSize_t off, const U8 *llim, const U8 *rlim)
- __attribute__warn_unused_result__
- __attribute__nonnull__(1)
- __attribute__nonnull__(3)
- __attribute__nonnull__(4);
-#define PERL_ARGS_ASSERT_REGHOP4 \
- assert(s); assert(llim); assert(rlim)
-
-# endif
#endif
#if defined(PERL_IN_SCOPE_C)
STATIC void S_save_pushptri32ptr(pTHX_ void *const ptr1, const I32 i, void *const ptr2, const int type);
diff --git a/regcomp.c b/regcomp.c
index 499a366fdb..a82171a9b2 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -4630,6 +4630,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
pars++;
if (flags & SCF_DO_SUBSTR) {
SV *last_str = NULL;
+ STRLEN last_chrs = 0;
int counted = mincount != 0;
if (data->last_end > 0 && mincount != 0) { /* Ends with a
@@ -4645,9 +4646,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
l -= old;
/* Get the added string: */
last_str = newSVpvn_utf8(s + old, l, UTF);
+ last_chrs = UTF ? utf8_length((U8*)(s + old),
+ (U8*)(s + old + l)) : l;
if (deltanext == 0 && pos_before == b) {
/* What was added is a constant string */
if (mincount > 1) {
+
SvGROW(last_str, (mincount * l) + 1);
repeatcpy(SvPVX(last_str) + l,
SvPVX_const(last_str), l,
@@ -4663,8 +4667,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
SvUTF8(sv) && SvMAGICAL(sv) ?
mg_find(sv, PERL_MAGIC_utf8) : NULL;
if (mg && mg->mg_len >= 0)
- mg->mg_len += CHR_SVLEN(last_str) - l;
+ mg->mg_len += last_chrs * (mincount-1);
}
+ last_chrs *= mincount;
data->last_end += l * (mincount - 1);
}
} else {
@@ -4706,12 +4711,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
mg->mg_len = -1;
sv_setsv(sv, last_str);
data->last_end = data->pos_min;
- data->last_start_min =
- data->pos_min - CHR_SVLEN(last_str);
+ data->last_start_min = data->pos_min - last_chrs;
data->last_start_max = is_inf
? SSize_t_MAX
- : data->pos_min + data->pos_delta
- - CHR_SVLEN(last_str);
+ : data->pos_min + data->pos_delta - last_chrs;
}
data->longest = &(data->longest_float);
}
@@ -6950,14 +6953,16 @@ reStudy:
/* A temporary algorithm prefers floated substr to fixed one to dig
* more info. */
if (longest_fixed_length > longest_float_length) {
+ r->substrs->check_ix = 0;
r->check_end_shift = r->anchored_end_shift;
r->check_substr = r->anchored_substr;
r->check_utf8 = r->anchored_utf8;
r->check_offset_min = r->check_offset_max = r->anchored_offset;
- if (r->intflags & PREGf_ANCH_SINGLE)
+ if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
r->intflags |= PREGf_NOSCAN;
}
else {
+ r->substrs->check_ix = 1;
r->check_end_shift = r->float_end_shift;
r->check_substr = r->float_substr;
r->check_utf8 = r->float_utf8;
@@ -6969,6 +6974,8 @@ reStudy:
if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
r->extflags |= RXf_INTUIT_TAIL;
}
+ r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
+
/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
if ( (STRLEN)minlen < longest_float_length )
minlen= longest_float_length;
diff --git a/regcomp.h b/regcomp.h
index 832ed3e542..3c1e5f6cec 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -145,8 +145,8 @@
#define PREGf_ANCH_SBOL 0x00001000
#define PREGf_ANCH_GPOS 0x00002000
-#define PREGf_ANCH_SINGLE ( PREGf_ANCH_SBOL | PREGf_ANCH_GPOS )
-#define PREGf_ANCH ( PREGf_ANCH_SINGLE | PREGf_ANCH_MBOL | PREGf_ANCH_BOL )
+#define PREGf_ANCH (PREGf_ANCH_SBOL | PREGf_ANCH_GPOS | \
+ PREGf_ANCH_MBOL | PREGf_ANCH_BOL )
/* this is where the old regcomp.h started */
diff --git a/regexec.c b/regexec.c
index 3e67e3a77a..df8bd146f5 100644
--- a/regexec.c
+++ b/regexec.c
@@ -129,6 +129,16 @@ static const char* const non_utf8_target_but_utf8_required
#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
+/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
+ * off must be >=0; args should be vars rather than expressions */
+#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
+ ? reghop3((U8*)(pos), off, (U8*)(lim)) \
+ : (U8*)((pos + off) > lim ? lim : (pos + off)))
+
+#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
+ ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
+ : (U8*)(pos + off))
+#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
#define NEXTCHR_IS_EOS (nextchr < 0)
@@ -628,31 +638,56 @@ Perl_re_intuit_start(pTHX_
SSize_t start_shift = 0;
/* Should be nonnegative! */
SSize_t end_shift = 0;
- char *s;
+ /* current lowest pos in string where the regex can start matching */
+ char *rx_origin = strpos;
SV *check;
- char *t;
const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
- I32 ml_anch;
- char *other_last = NULL; /* other substr checked before this */
+ U8 other_ix = 1 - prog->substrs->check_ix;
+ bool ml_anch = 0;
+ char *other_last = strpos;/* latest pos 'other' substr already checked to */
char *check_at = NULL; /* check substr found at this pos */
char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
RXi_GET_DECL(prog,progi);
regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
regmatch_info *const reginfo = &reginfo_buf;
-#ifdef DEBUGGING
- const char * const i_strpos = strpos;
-#endif
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_RE_INTUIT_START;
PERL_UNUSED_ARG(flags);
PERL_UNUSED_ARG(data);
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ "Intuit: trying to determine minimum start position...\n"));
+
+ /* for now, assume that all substr offsets are positive. If at some point
+ * in the future someone wants to do clever things with look-behind and
+ * -ve offsets, they'll need to fix up any code in this function
+ * which uses these offsets. See the thread beginning
+ * <20140113145929.GF27210@iabyn.com>
+ */
+ assert(prog->substrs->data[0].min_offset >= 0);
+ assert(prog->substrs->data[0].max_offset >= 0);
+ assert(prog->substrs->data[1].min_offset >= 0);
+ assert(prog->substrs->data[1].max_offset >= 0);
+ assert(prog->substrs->data[2].min_offset >= 0);
+ assert(prog->substrs->data[2].max_offset >= 0);
+
+ /* for now, assume that if both present, that the floating substring
+ * follows the anchored substring, and that they don't overlap.
+ * If you break this assumption (e.g. doing better optimisations
+ * with lookahead/behind), then you'll need to audit the code in this
+ * function carefully first
+ */
+ assert(
+ ! ( (prog->anchored_utf8 || prog->anchored_substr)
+ && (prog->float_utf8 || prog->float_substr))
+ || (prog->float_min_offset >= prog->anchored_offset));
+
/* CHR_DIST() would be more correct here but it makes things slow. */
if (prog->minlen > strend - strpos) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "String too short... [re_intuit_start]\n"));
+ " String too short...\n"));
goto fail;
}
@@ -677,87 +712,102 @@ Perl_re_intuit_start(pTHX_
}
check = prog->check_substr;
}
+
+ /* dump the various substring data */
+ DEBUG_OPTIMISE_MORE_r({
+ int i;
+ for (i=0; i<=2; i++) {
+ SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
+ : prog->substrs->data[i].substr);
+ if (!sv)
+ continue;
+
+ PerlIO_printf(Perl_debug_log,
+ " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
+ " useful=%"IVdf" utf8=%d [%s]\n",
+ i,
+ (IV)prog->substrs->data[i].min_offset,
+ (IV)prog->substrs->data[i].max_offset,
+ (IV)prog->substrs->data[i].end_shift,
+ BmUSEFUL(sv),
+ utf8_target ? 1 : 0,
+ SvPEEK(sv));
+ }
+ });
+
if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
- ml_anch = !( (prog->intflags & PREGf_ANCH_SINGLE)
- || ( (prog->intflags & PREGf_ANCH_BOL)
- && !multiline ) ); /* Check after \n? */
+ /* Check after \n? */
+ ml_anch = ( (prog->intflags & PREGf_ANCH_MBOL)
+ || ((prog->intflags & PREGf_ANCH_BOL) && multiline));
if (!ml_anch) {
- /* we are only allowed to match at BOS or \G */
+ /* we are only allowed to match at BOS or \G */
- if (prog->intflags & PREGf_ANCH_GPOS) {
- /* in this case, we hope(!) that the caller has already
+ /* trivially reject if there's a BOS anchor and we're not at BOS.
+ * In the case of \G, we hope(!) that the caller has already
* set strpos to pos()-gofs, and will already have checked
- * that this anchor position is legal
+ * that this anchor position is legal. So we can skip it here.
*/
- ;
- }
- else if (!(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
+ if ( !(prog->intflags & PREGf_ANCH_GPOS)
+ && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
&& (strpos != strbeg))
- {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
- goto fail;
- }
- if (prog->check_offset_min == prog->check_offset_max
- && !(prog->intflags & PREGf_CANY_SEEN)
- && ! multiline) /* /m can cause \n's to match that aren't
- accounted for in the string max length.
- See [perl #115242] */
- {
- /* Substring at constant offset from beg-of-str... */
- SSize_t slen;
-
- s = HOP3c(strpos, prog->check_offset_min, strend);
+ {
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " Not at start...\n"));
+ goto fail;
+ }
+
+ /* in the presence of an anchor, the anchored (relative to the
+ * start of the regex) substr must also be anchored relative
+ * to strpos. So quickly reject if substr isn't found there */
+
+ if (prog->check_offset_min == prog->check_offset_max
+ && !(prog->intflags & PREGf_CANY_SEEN)
+ && ! multiline) /* /m can cause \n's to match that aren't
+ accounted for in the string max length.
+ See [perl #115242] */
+ {
+ /* Substring at constant offset from beg-of-str... */
+ SSize_t slen = SvCUR(check);
+ char *s;
+
+ s = HOP3c(strpos, prog->check_offset_min, strend);
- if (SvTAIL(check)) {
- slen = SvCUR(check); /* >= 1 */
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " Looking for check substr at fixed offset %"IVdf"...\n",
+ (IV)prog->check_offset_min));
+
+ if (SvTAIL(check)) {
+ /* In this case, the regex is anchored at the end too,
+ * so the lengths must match exactly, give or take a \n.
+ * NB: slen >= 1 since the last char of check is \n */
+ if ( strend - s > slen || strend - s < slen - 1
+ || (strend - s == slen && strend[-1] != '\n'))
+ {
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " String too long...\n"));
+ goto fail_finish;
+ }
+ /* Now should match s[0..slen-2] */
+ slen--;
+ }
+ if (slen && (*SvPVX_const(check) != *s
+ || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
+ {
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " String not equal...\n"));
+ goto fail_finish;
+ }
- if ( strend - s > slen || strend - s < slen - 1
- || (strend - s == slen && strend[-1] != '\n')) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
- goto fail_finish;
- }
- /* Now should match s[0..slen-2] */
- slen--;
- if (slen && (*SvPVX_const(check) != *s
- || (slen > 1
- && memNE(SvPVX_const(check), s, slen)))) {
- report_neq:
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
- goto fail_finish;
- }
+ check_at = s;
+ goto success_at_start;
}
- else if (*SvPVX_const(check) != *s
- || ((slen = SvCUR(check)) > 1
- && memNE(SvPVX_const(check), s, slen)))
- goto report_neq;
- check_at = s;
- goto success_at_start;
- }
}
- /* Match is anchored, but substr is not anchored wrt beg-of-str. */
- s = strpos;
- start_shift = prog->check_offset_min; /* okay to underestimate on CC */
- end_shift = prog->check_end_shift;
-
- if (!ml_anch) {
- const SSize_t end = prog->check_offset_max + CHR_SVLEN(check)
- - (SvTAIL(check) != 0);
- const SSize_t eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
-
- if (end_shift < eshift)
- end_shift = eshift;
- }
- }
- else { /* Can match at random position */
- ml_anch = 0;
- s = strpos;
- start_shift = prog->check_offset_min; /* okay to underestimate on CC */
- end_shift = prog->check_end_shift;
-
- /* end shift should be non negative here */
}
+ start_shift = prog->check_offset_min; /* okay to underestimate on CC */
+ end_shift = prog->check_end_shift;
+
#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
if (end_shift < 0)
Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
@@ -765,62 +815,89 @@ Perl_re_intuit_start(pTHX_
#endif
restart:
- /* Find a possible match in the region s..strend by looking for
- the "check" substring in the region corrected by start/end_shift. */
+ /* Find a candidate regex origin in the region rx_origin..strend
+ * by looking for the "check" substring in that region, corrected by
+ * start/end_shift.
+ */
{
- SSize_t srch_start_shift = start_shift;
- SSize_t srch_end_shift = end_shift;
U8* start_point;
U8* end_point;
- if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
- srch_end_shift -= ((strbeg - s) - srch_start_shift);
- srch_start_shift = strbeg - s;
- }
- DEBUG_OPTIMISE_MORE_r({
- PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
- (IV)prog->check_offset_min,
- (IV)srch_start_shift,
- (IV)srch_end_shift,
- (IV)prog->check_end_shift);
- });
+
+ DEBUG_OPTIMISE_MORE_r({
+ PerlIO_printf(Perl_debug_log,
+ " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
+ " Start shift: %"IVdf" End shift %"IVdf
+ " Real end Shift: %"IVdf"\n",
+ (IV)(rx_origin - strpos),
+ (IV)prog->check_offset_min,
+ (IV)start_shift,
+ (IV)end_shift,
+ (IV)prog->check_end_shift);
+ });
if (prog->intflags & PREGf_CANY_SEEN) {
- start_point= (U8*)(s + srch_start_shift);
- end_point= (U8*)(strend - srch_end_shift);
+ start_point= (U8*)(rx_origin + start_shift);
+ end_point= (U8*)(strend - end_shift);
} else {
- start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
- end_point= HOP3(strend, -srch_end_shift, strbeg);
+ start_point= HOP3(rx_origin, start_shift, strend);
+ end_point= HOP3(strend, -end_shift, strbeg);
}
+
+ /* if the regex is absolutely anchored to the start of the string,
+ * then check_offset_max represents an upper bound on the string
+ * where the substr could start */
+ if (!ml_anch
+ && prog->intflags & PREGf_ANCH
+ && prog->check_offset_max != SSize_t_MAX
+ && start_shift < prog->check_offset_max)
+ {
+ SSize_t len = SvCUR(check) - !!SvTAIL(check);
+ end_point = HOP3lim(start_point,
+ prog->check_offset_max - start_shift,
+ end_point -len)
+ + len;
+ }
+
DEBUG_OPTIMISE_MORE_r({
- PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
+ PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
(int)(end_point - start_point),
(int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
start_point);
});
- s = fbm_instr( start_point, end_point,
+ check_at = fbm_instr( start_point, end_point,
check, multiline ? FBMrf_MULTILINE : 0);
}
+
/* Update the count-of-usability, remove useless subpatterns,
unshift s. */
DEBUG_EXECUTE_r({
RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
- PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
- (s ? "Found" : "Did not find"),
+ PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
+ (check_at ? "Found" : "Did not find"),
(check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
? "anchored" : "floating"),
quoted,
RE_SV_TAIL(check),
- (s ? " at offset " : "...\n") );
+ (check_at ? " at offset " : "...\n") );
});
- if (!s)
+ if (!check_at)
goto fail_finish;
/* Finish the diagnostic message */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
+
+ /* set rx_origin to the minimum position where the regex could start
+ * matching, given the constraint of the just-matched check substring.
+ * But don't set it lower than previously.
+ */
+
+ if (check_at - rx_origin > prog->check_offset_max)
+ rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
+
/* XXX dmq: first branch is for positive lookbehind...
Our check string is offset from the beginning of the pattern.
@@ -828,11 +905,6 @@ Perl_re_intuit_start(pTHX_
point. I think. :-(
*/
-
-
- check_at=s;
-
-
/* Got a candidate. Check MBOL anchoring, and the *other* substr.
Start with the other substr.
XXXX no SCREAM optimization yet - and a very coarse implementation
@@ -841,222 +913,269 @@ Perl_re_intuit_start(pTHX_
Probably it is right to do no SCREAM here...
*/
- if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
- : (prog->float_substr && prog->anchored_substr))
+ if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
+ : prog->substrs->data[other_ix].substr)
{
/* Take into account the "other" substring. */
- /* XXXX May be hopelessly wrong for UTF... */
- if (!other_last)
- other_last = strpos;
- if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
- do_other_anchored:
- {
- char * const last = HOP3c(s, -start_shift, strbeg);
- char *last1, *last2;
- char * const saved_s = s;
- SV* must;
-
- t = s - prog->check_offset_max;
- if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
- && (!utf8_target
- || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
- && t > strpos)))
- NOOP;
- else
- t = strpos;
- t = HOP3c(t, prog->anchored_offset, strend);
- if (t < other_last) /* These positions already checked */
- t = other_last;
- last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
- if (last < last1)
- last1 = last;
- /* XXXX It is not documented what units *_offsets are in.
- We assume bytes, but this is clearly wrong.
- Meaning this code needs to be carefully reviewed for errors.
- dmq.
- */
-
- /* On end-of-str: see comment below. */
- must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
- if (must == &PL_sv_undef) {
- s = (char*)NULL;
- DEBUG_r(must = prog->anchored_utf8); /* for debug */
- }
- else
- s = fbm_instr(
- (unsigned char*)t,
- HOP3(HOP3(last1, prog->anchored_offset, strend)
- + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
- must,
- multiline ? FBMrf_MULTILINE : 0
- );
- DEBUG_EXECUTE_r({
- RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
- SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
- PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
- (s ? "Found" : "Contradicts"),
- quoted, RE_SV_TAIL(must));
- });
-
-
- if (!s) {
- if (last1 >= last2) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- ", giving up...\n"));
- goto fail_finish;
- }
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- ", trying floating at offset %ld...\n",
- (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
- other_last = HOP3c(last1, prog->anchored_offset+1, strend);
- s = HOP3c(last, 1, strend);
- goto restart;
- }
- else {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
- (long)(s - i_strpos)));
- t = HOP3c(s, -prog->anchored_offset, strbeg);
- other_last = HOP3c(s, 1, strend);
- s = saved_s;
- if (t == strpos)
- goto try_at_start;
- goto try_at_offset;
- }
- }
- }
- else { /* Take into account the floating substring. */
- char *last, *last1;
- char * const saved_s = s;
- SV* must;
-
- t = HOP3c(s, -start_shift, strbeg);
- last1 = last =
- HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
- if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
- last = HOP3c(t, prog->float_max_offset, strend);
- s = HOP3c(t, prog->float_min_offset, strend);
- if (s < other_last)
- s = other_last;
- /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
- must = utf8_target ? prog->float_utf8 : prog->float_substr;
- /* fbm_instr() takes into account exact value of end-of-str
- if the check is SvTAIL(ed). Since false positives are OK,
- and end-of-str is not later than strend we are OK. */
- if (must == &PL_sv_undef) {
- s = (char*)NULL;
- DEBUG_r(must = prog->float_utf8); /* for debug message */
- }
- else
- s = fbm_instr((unsigned char*)s,
- (unsigned char*)last + SvCUR(must)
- - (SvTAIL(must)!=0),
- must, multiline ? FBMrf_MULTILINE : 0);
- DEBUG_EXECUTE_r({
- RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
- SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
- PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
- (s ? "Found" : "Contradicts"),
- quoted, RE_SV_TAIL(must));
- });
- if (!s) {
- if (last1 == last) {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- ", giving up...\n"));
- goto fail_finish;
- }
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- ", trying anchored starting at offset %ld...\n",
- (long)(saved_s + 1 - i_strpos)));
- other_last = last;
- s = HOP3c(t, 1, strend);
- goto restart;
- }
- else {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
- (long)(s - i_strpos)));
- other_last = s; /* Fix this later. --Hugo */
- s = saved_s;
- if (t == strpos)
- goto try_at_start;
- goto try_at_offset;
- }
- }
+ char *last, *last1;
+ char *s;
+ SV* must;
+ struct reg_substr_datum *other;
+
+ do_other_substr:
+ other = &prog->substrs->data[other_ix];
+
+ /* if "other" is anchored:
+ * we've previously found a floating substr starting at check_at.
+ * This means that the regex origin must lie somewhere
+ * between min (rx_origin): HOP3(check_at, -check_offset_max)
+ * and max: HOP3(check_at, -check_offset_min)
+ * (except that min will be >= strpos)
+ * So the fixed substr must lie somewhere between
+ * HOP3(min, anchored_offset)
+ * HOP3(max, anchored_offset) + SvCUR(substr)
+ */
+
+ /* if "other" is floating
+ * Calculate last1, the absolute latest point where the
+ * floating substr could start in the string, ignoring any
+ * constraints from the earlier fixed match. It is calculated
+ * as follows:
+ *
+ * strend - prog->minlen (in chars) is the absolute latest
+ * position within the string where the origin of the regex
+ * could appear. The latest start point for the floating
+ * substr is float_min_offset(*) on from the start of the
+ * regex. last1 simply combines thee two offsets.
+ *
+ * (*) You might think the latest start point should be
+ * float_max_offset from the regex origin, and technically
+ * you'd be correct. However, consider
+ * /a\d{2,4}bcd\w/
+ * Here, float min, max are 3,5 and minlen is 7.
+ * This can match either
+ * /a\d\dbcd\w/
+ * /a\d\d\dbcd\w/
+ * /a\d\d\d\dbcd\w/
+ * In the first case, the regex matches minlen chars; in the
+ * second, minlen+1, in the third, minlen+2.
+ * In the first case, the floating offset is 3 (which equals
+ * float_min), in the second, 4, and in the third, 5 (which
+ * equals float_max). In all cases, the floating string bcd
+ * can never start more than 4 chars from the end of the
+ * string, which equals minlen - float_min. As the substring
+ * starts to match more than float_min from the start of the
+ * regex, it makes the regex match more than minlen chars,
+ * and the two cancel each other out. So we can always use
+ * float_min - minlen, rather than float_max - minlen for the
+ * latest position in the string.
+ *
+ * Note that -minlen + float_min_offset is equivalent (AFAIKT)
+ * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
+ */
+
+ assert(prog->minlen >= other->min_offset);
+ last1 = HOP3c(strend,
+ other->min_offset - prog->minlen, strbeg);
+
+ if (other_ix) {/* i.e. if (other-is-float) */
+ /* last is the latest point where the floating substr could
+ * start, *given* any constraints from the earlier fixed
+ * match. This constraint is that the floating string starts
+ * <= float_max_offset chars from the regex origin (rx_origin).
+ * If this value is less than last1, use it instead.
+ */
+ assert(rx_origin <= last1);
+ last =
+ /* this condition handles the offset==infinity case, and
+ * is a short-cut otherwise. Although it's comparing a
+ * byte offset to a char length, it does so in a safe way,
+ * since 1 char always occupies 1 or more bytes,
+ * so if a string range is (last1 - rx_origin) bytes,
+ * it will be less than or equal to (last1 - rx_origin)
+ * chars; meaning it errs towards doing the accurate HOP3
+ * rather than just using last1 as a short-cut */
+ (last1 - rx_origin) < other->max_offset
+ ? last1
+ : (char*)HOP3lim(rx_origin, other->max_offset, last1);
+ }
+ else {
+ assert(strpos + start_shift <= check_at);
+ last = HOP4c(check_at, other->min_offset - start_shift,
+ strbeg, strend);
+ }
+
+ s = HOP3c(rx_origin, other->min_offset, strend);
+ if (s < other_last) /* These positions already checked */
+ s = other_last;
+
+ must = utf8_target ? other->utf8_substr : other->substr;
+ assert(SvPOK(must));
+ s = fbm_instr(
+ (unsigned char*)s,
+ (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
+ must,
+ multiline ? FBMrf_MULTILINE : 0
+ );
+ DEBUG_EXECUTE_r({
+ RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
+ SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
+ PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
+ s ? "Found" : "Contradicts",
+ other_ix ? "floating" : "anchored",
+ quoted, RE_SV_TAIL(must));
+ });
+
+
+ if (!s) {
+ /* last1 is latest possible substr location. If we didn't
+ * find it before there, we never will */
+ if (last >= last1) {
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ ", giving up...\n"));
+ goto fail_finish;
+ }
+
+ /* try to find the check substr again at a later
+ * position. Maybe next time we'll find the "other" substr
+ * in range too */
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ ", trying %s at offset %ld...\n",
+ (other_ix ? "floating" : "anchored"),
+ (long)(HOP3c(check_at, 1, strend) - strpos)));
+
+ other_last = HOP3c(last, 1, strend) /* highest failure */;
+ rx_origin =
+ other_ix /* i.e. if other-is-float */
+ ? HOP3c(rx_origin, 1, strend)
+ : HOP4c(last, 1 - other->min_offset, strbeg, strend);
+ goto restart;
+ }
+ else {
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
+ (long)(s - strpos)));
+
+ if (other_ix) { /* if (other-is-float) */
+ /* other_last is set to s, not s+1, since its possible for
+ * a floating substr to fail first time, then succeed
+ * second time at the same floating position; e.g.:
+ * "-AB--AABZ" =~ /\wAB\d*Z/
+ * The first time round, anchored and float match at
+ * "-(AB)--AAB(Z)" then fail on the initial \w character
+ * class. Second time round, they match at "-AB--A(AB)(Z)".
+ */
+ other_last = s;
+ }
+ else {
+ rx_origin = HOP3c(s, -other->min_offset, strbeg);
+ other_last = HOP3c(s, 1, strend);
+ }
+ }
+ }
+ else {
+ DEBUG_OPTIMISE_MORE_r(
+ PerlIO_printf(Perl_debug_log,
+ " Check-only match: offset min:%"IVdf" max:%"IVdf
+ " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
+ " strend-strpos:%"IVdf"\n",
+ (IV)prog->check_offset_min,
+ (IV)prog->check_offset_max,
+ (IV)(check_at-strpos),
+ (IV)(rx_origin-strpos),
+ (IV)(rx_origin-check_at),
+ (IV)(strend-strpos)
+ )
+ );
}
-
- t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
-
- DEBUG_OPTIMISE_MORE_r(
- PerlIO_printf(Perl_debug_log,
- "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
- (IV)prog->check_offset_min,
- (IV)prog->check_offset_max,
- (IV)(s-strpos),
- (IV)(t-strpos),
- (IV)(t-s),
- (IV)(strend-strpos)
- )
- );
+ postprocess_substr_matches:
- if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
- && (!utf8_target
- || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
- && t > strpos)))
+ /* handle the extra constraint of /^/m */
+
+ if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n'
+ /* May be due to an implicit anchor of m{.*foo} */
+ && !(prog->intflags & PREGf_IMPLICIT))
{
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " looking for /^/m anchor"));
+
+ /* we have failed the constraint of a \n before rx_origin.
+ * Find the next \n, if any, even if it's beyond the current
+ * anchored and/or floating substrings. Whether we should be
+ * scanning ahead for the next \n or the next substr is debatable.
+ * On the one hand you'd expect rare substrings to appear less
+ * often than \n's. On the other hand, searching for \n means
+ * we're effectively flipping been check_substr and "\n" on each
+ * iteration as the current "rarest" string candidate, which
+ * means for example that we'll quickly reject the whole string if
+ * hasn't got a \n, rather than trying every substr position
+ * first
+ */
+
+ rx_origin = (char *)memchr(rx_origin, '\n',
+ HOP3c(strend, - prog->minlen, strpos) - rx_origin);
+ if (!rx_origin) {
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " Did not find /%s^%s/m...\n",
+ PL_colors[0], PL_colors[1]));
+ goto fail_finish;
+ }
+
+ /* earliest possible origin is 1 char after the \n.
+ * (since *rx_origin == '\n', it's safe to ++ here rather than
+ * HOP(rx_origin, 1)) */
+ rx_origin++;
+
+ if (prog->substrs->check_ix == 0 /* check is anchored */
+ || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
+ {
+ /* Position contradicts check-string; either because
+ * check was anchored (and thus has no wiggle room),
+ * or check was float and rx_origin is above the float range */
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
+ PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
+ goto restart;
+ }
+
+ /* if we get here, the check substr must have been float,
+ * is in range, and we may or may not have had an anchored
+ * "other" substr which still contradicts */
+ assert(prog->substrs->check_ix); /* check is float */
+
+ if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
+ /* whoops, the anchored "other" substr exists, so we still
+ * contradict. On the other hand, the float "check" substr
+ * didn't contradict, so just retry the anchored "other"
+ * substr */
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
+ PL_colors[0], PL_colors[1],
+ (long)(rx_origin - strpos),
+ (long)(rx_origin - strpos + prog->anchored_offset)));
+ goto do_other_substr;
+ }
+
+ /* success: we don't contradict the found floating substring
+ * (and there's no anchored substr). */
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " Found /%s^%s/m at offset %ld...\n",
+ PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
+ }
+ else {
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " Starting position does not contradict /%s^%s/m...\n",
+ PL_colors[0], PL_colors[1]));
+ }
+
+
+ /* Decide whether using the substrings helped */
+
+ if (rx_origin != strpos) {
/* Fixed substring is found far enough so that the match
cannot start at strpos. */
- try_at_offset:
- if (ml_anch && t[-1] != '\n') {
- /* Eventually fbm_*() should handle this, but often
- anchored_offset is not 0, so this check will not be wasted. */
- /* XXXX In the code below we prefer to look for "^" even in
- presence of anchored substrings. And we search even
- beyond the found float position. These pessimizations
- are historical artefacts only. */
- find_anchor:
- while (t < strend - prog->minlen) {
- if (*t == '\n') {
- if (t < check_at - prog->check_offset_min) {
- if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
- /* Since we moved from the found position,
- we definitely contradict the found anchored
- substr. Due to the above check we do not
- contradict "check" substr.
- Thus we can arrive here only if check substr
- is float. Redo checking for "other"=="fixed".
- */
- strpos = t + 1;
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
- PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
- goto do_other_anchored;
- }
- /* We don't contradict the found floating substring. */
- /* XXXX Why not check for STCLASS? */
- s = t + 1;
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
- PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
- goto set_useful;
- }
- /* Position contradicts check-string */
- /* XXXX probably better to look for check-string
- than for "\n", so one should lower the limit for t? */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
- PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
- other_last = strpos = s = t + 1;
- goto restart;
- }
- t++;
- }
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
- PL_colors[0], PL_colors[1]));
- goto fail_finish;
- }
- else {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
- PL_colors[0], PL_colors[1]));
- }
- s = t;
- set_useful:
+
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
}
else {
@@ -1064,20 +1183,6 @@ Perl_re_intuit_start(pTHX_
- no optimization of calling REx engine can be performed,
unless it was an MBOL and we are not after MBOL,
or a future STCLASS check will fail this. */
- try_at_start:
- /* Even in this situation we may use MBOL flag if strpos is offset
- wrt the start of the string. */
- if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
- /* May be due to an implicit anchor of m{.*foo} */
- && !(prog->intflags & PREGf_IMPLICIT))
- {
- t = strpos;
- goto find_anchor;
- }
- DEBUG_EXECUTE_r( if (ml_anch)
- PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
- (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
- );
success_at_start:
if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
&& (utf8_target ? (
@@ -1091,14 +1196,13 @@ Perl_re_intuit_start(pTHX_
)))
{
/* If flags & SOMETHING - do not do it many times on the same match */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
/* XXX Does the destruction order has to change with utf8_target? */
SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
prog->check_substr = prog->check_utf8 = NULL; /* disable */
prog->float_substr = prog->float_utf8 = NULL; /* clear */
check = NULL; /* abort */
- s = strpos;
/* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
see http://bugs.activestate.com/show_bug.cgi?id=87173 */
if (prog->intflags & PREGf_IMPLICIT) {
@@ -1113,8 +1217,6 @@ Perl_re_intuit_start(pTHX_
prog->extflags &= ~RXf_USE_INTUIT;
/* XXXX What other flags might need to be cleared in this branch? */
}
- else
- s = strpos;
}
/* Last resort... */
@@ -1124,13 +1226,15 @@ Perl_re_intuit_start(pTHX_
if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
/* minlen == 0 is possible if regstclass is \b or \B,
and the fixed substr is ''$.
- Since minlen is already taken into account, s+1 is before strend;
- accidentally, minlen >= 1 guaranties no false positives at s + 1
+ Since minlen is already taken into account, rx_origin+1 is before strend;
+ accidentally, minlen >= 1 guaranties no false positives at rx_origin + 1
even for \b or \B. But (minlen? 1 : 0) below assumes that
regstclass does not come from lookahead... */
/* If regstclass takes bytelength more than 1: If charlength==1, OK.
This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
const U8* const str = (U8*)STRING(progi->regstclass);
+ char *t;
+
/* XXX this value could be pre-computed */
const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
? (reginfo->is_utf8_pat
@@ -1138,6 +1242,7 @@ Perl_re_intuit_start(pTHX_
: STR_LEN(progi->regstclass))
: 1);
char * endpos;
+ char *s = rx_origin;
if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
else if (prog->float_substr || prog->float_utf8)
@@ -1147,8 +1252,12 @@ Perl_re_intuit_start(pTHX_
if (checked_upto < s)
checked_upto = s;
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
- (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " looking for class: start_shift: %"IVdf" check_at: %"IVdf
+ " s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
+ (IV)start_shift, (IV)(check_at - strbeg),
+ (IV)(s - strbeg), (IV)(endpos - strbeg),
+ (IV)(checked_upto- strbeg)));
t = s;
s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
@@ -1161,15 +1270,15 @@ Perl_re_intuit_start(pTHX_
#endif
if (endpos == strend) {
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- "Could not match STCLASS...\n") );
+ " Could not match STCLASS...\n") );
goto fail;
}
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- "This position contradicts STCLASS...\n") );
+ " This position contradicts STCLASS...\n") );
if ((prog->intflags & PREGf_ANCH) && !ml_anch)
goto fail;
checked_upto = HOPBACKc(endpos, start_shift);
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
(IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
/* Contradict one of substrings */
if (prog->anchored_substr || prog->anchored_utf8) {
@@ -1180,38 +1289,45 @@ Perl_re_intuit_start(pTHX_
if (s + start_shift + end_shift > strend) {
/* XXXX Should be taken into account earlier? */
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- "Could not match STCLASS...\n") );
+ " Could not match STCLASS...\n") );
goto fail;
}
+ rx_origin = s;
if (!check)
goto giveup;
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- "Looking for %s substr starting at offset %ld...\n",
- what, (long)(s + start_shift - i_strpos)) );
+ " Looking for %s substr starting at offset %ld...\n",
+ what, (long)(rx_origin + start_shift - strpos)) );
goto restart;
}
/* Have both, check_string is floating */
if (t + start_shift >= check_at) /* Contradicts floating=check */
goto retry_floating_check;
/* Recheck anchored substring, but not floating... */
- s = check_at;
- if (!check)
+ if (!check) {
+ rx_origin = NULL;
goto giveup;
+ }
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- "Looking for anchored substr starting at offset %ld...\n",
- (long)(other_last - i_strpos)) );
- goto do_other_anchored;
+ " Looking for anchored substr starting at offset %ld...\n",
+ (long)(other_last - strpos)) );
+ assert(prog->substrs->check_ix); /* other is float */
+ goto do_other_substr;
}
/* Another way we could have checked stclass at the
current position only: */
if (ml_anch) {
- s = t = t + 1;
+ s = rx_origin = t + 1;
if (!check)
goto giveup;
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- "Looking for /%s^%s/m starting at offset %ld...\n",
- PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
- goto try_at_offset;
+ " Looking for /%s^%s/m starting at offset %ld...\n",
+ PL_colors[0], PL_colors[1],
+ (long)(rx_origin - strpos)) );
+ /* XXX DAPM I don't yet know why this is true, but the code
+ * assumed it when it used to do goto try_at_offset */
+ assert(rx_origin != strpos);
+ goto postprocess_substr_matches;
}
if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
goto fail;
@@ -1223,21 +1339,21 @@ Perl_re_intuit_start(pTHX_
}
if (t != s) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "By STCLASS: moving %ld --> %ld\n",
- (long)(t - i_strpos), (long)(s - i_strpos))
+ " By STCLASS: moving %ld --> %ld\n",
+ (long)(t - strpos), (long)(s - strpos))
);
}
else {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- "Does not contradict STCLASS...\n");
+ " Does not contradict STCLASS...\n");
);
}
}
giveup:
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
- PL_colors[4], (check ? "Guessed" : "Giving up"),
- PL_colors[5], (long)(s - i_strpos)) );
- return s;
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Intuit: %s%s:%s match at offset %ld\n",
+ PL_colors[4], (check ? "Successfully guessed" : "Giving up"),
+ PL_colors[5], (long)(rx_origin - strpos)) );
+ return rx_origin;
fail_finish: /* Substring not found */
if (prog->check_substr || prog->check_utf8) /* could be removed already */
@@ -2699,7 +2815,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
dontbother = 0;
strend = HOPc(strend, -dontbother);
while ( (s <= last) &&
- (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
+ (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
(unsigned char*)strend, must,
multiline ? FBMrf_MULTILINE : 0)) ) {
DEBUG_EXECUTE_r( did_match = 1 );
@@ -7688,11 +7804,6 @@ S_reghop3(U8 *s, SSize_t off, const U8* lim)
return s;
}
-#ifdef XXX_dmq
-/* there are a bunch of places where we use two reghop3's that should
- be replaced with this routine. but since thats not done yet
- we ifdef it out - dmq
-*/
STATIC U8 *
S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
{
@@ -7718,7 +7829,6 @@ S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
}
return s;
}
-#endif
STATIC U8 *
S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
diff --git a/regexp.h b/regexp.h
index b6091364ba..d32e669a4c 100644
--- a/regexp.h
+++ b/regexp.h
@@ -36,13 +36,14 @@ struct regexp_engine;
struct regexp;
struct reg_substr_datum {
- SSize_t min_offset;
- SSize_t max_offset;
+ SSize_t min_offset; /* min pos (in chars) that substr must appear */
+ SSize_t max_offset /* max pos (in chars) that substr must appear */;
SV *substr; /* non-utf8 variant */
SV *utf8_substr; /* utf8 variant */
- SSize_t end_shift;
+ SSize_t end_shift; /* how many fixed chars must end the string */
};
struct reg_substr_data {
+ U8 check_ix; /* index into data[] of check substr */
struct reg_substr_datum data[3]; /* Actual array */
};
diff --git a/t/re/pat.t b/t/re/pat.t
index 91274e60c7..79c7e6aaf9 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -20,7 +20,7 @@ BEGIN {
require './test.pl';
}
-plan tests => 712; # Update this when adding/deleting tests.
+plan tests => 717; # Update this when adding/deleting tests.
run_tests() unless caller;
@@ -1513,6 +1513,33 @@ EOP
is($i, 0, "RT 120446: mustn't run slowly");
}
+ {
+ # [perl #120692]
+ # these tests should be virtually instantaneous. If they take 10s of
+ # seconds, there's a bug in intuit_start.
+
+ my $s = 'ab' x 1_000_000;
+ utf8::upgrade($s);
+ 1 while $s =~ m/\Ga+ba+b/g;
+ pass("RT#120692 \\G mustn't run slowly");
+
+ $s=~ /^a{1,2}x/ for 1..10_000;
+ pass("RT#120692 a{1,2} mustn't run slowly");
+
+ $s=~ /ab.{1,2}x/;
+ pass("RT#120692 ab.{1,2} mustn't run slowly");
+
+ $s = "-a-bc" x 250_000;
+ $s .= "1a1bc";
+ utf8::upgrade($s);
+ ok($s =~ /\da\d{0,30000}bc/, "\\d{30000}");
+
+ $s = "-ab\n" x 250_000;
+ $s .= "abx";
+ ok($s =~ /^ab.*x/m, "distant float with /m");
+
+ }
+
# These are based on looking at the code in regcomp.c
# We don't look for specific code, just the existence of an SSC
foreach my $re (qw( qr/a?c/
diff --git a/t/re/re_tests b/t/re/re_tests
index d5c66f76fa..577a9f0b15 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1848,5 +1848,13 @@ A+(*PRUNE)BC(?{}) AAABC y $& AAABC
/^\S+=/d \x{3a3}=\x{3a0} y $& \x{3a3}=
/^\S+=/u \x{3a3}=\x{3a0} y $& \x{3a3}=
+# utf8 cache length panics
+\x{100}[xy]\x{100}{2} \x{100}y\x{100}\x{100} y $& \x{100}y\x{100}\x{100}
+\x{100}a{2,3} \x{100}aaa y $& \x{100}aaa
+^x?abc?de abcde y $& abcde
+
+'(?-m:^abc)'m abcde y $& abc
+'(?-m:^abc)'m x\nabcde n - -
+
# Keep these lines at the end of the file
# vim: softtabstop=0 noexpandtab