summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ext/re/t/regop.t18
-rw-r--r--regcomp.c85
-rw-r--r--regcomp.h14
-rw-r--r--regexec.c387
-rw-r--r--regexp.h15
-rw-r--r--t/op/svleak.t5
6 files changed, 302 insertions, 222 deletions
diff --git a/ext/re/t/regop.t b/ext/re/t/regop.t
index 46e6ec04f5..c24c32f95c 100644
--- a/ext/re/t/regop.t
+++ b/ext/re/t/regop.t
@@ -41,7 +41,9 @@ foreach my $testout ( @tests ) {
s/\s+$//;
ok( $testout=~/\Q$_\E/, "$_: /$pattern/" )
or do {
- !$diaged++ and diag("$_: /$pattern/\n$testout");
+ !$diaged++ and diag("PATTERN: /$pattern/\n\n"
+ . "EXPECTED:\n$_\n\n"
+ . "WITHIN GOT:\n$testout");
};
}
}
@@ -152,16 +154,17 @@ minlen 3
# # 8| W 4 @ 0
# # 9| W 5 @ 0
# # A| W 6 @ 0
+# word_info N:(prev,char)= 1:(0,1) 2:(0,1) 3:(0,1) 4:(0,1) 5:(0,1) 6:(0,1)
# Final program:
-# 1: EXACT <ABC>(3)
-# 3: TRIEC-EXACT<S:4/10 W:6 L:1/1 C:24/7>[A-EGP](20)
+# 1: EXACT <ABC> (3)
+# 3: TRIEC-EXACT<S:4/10 W:6 L:1/1 C:24/7>[A-EGP] (20)
# <P>
# <G>
# <E>
# <B>
# <A>
# <D>
-# 20: END(0)
+# 20: END (0)
# anchored "ABC" at 0 (checking anchored) minlen 4
# Offsets: [20]
# 1:4[3] 3:4[15] 19:32[0] 20:34[0]
@@ -172,10 +175,10 @@ minlen 3
# 0 <> <ABCD> | 1:EXACT <ABC>(3)
# 3 <ABC> <D> | 3:TRIEC-EXACT<S:4/10 W:6 L:1/1 C:24/7>[A-EGP](20)
# 3 <ABC> <D> | State: 4 Accepted: 0 Charid: 7 CP: 44 After State: a
-# 4 <ABCD> <> | State: a Accepted: 1 Charid: 6 CP: 0 After State: 0
+# 4 <ABCD> <> | State: a Accepted: 1 Charid: 7 CP: 0 After State: 0
# got 1 possible matches
-# only one match left: #6 <D>
-# 4 <ABCD> <> | 20:END(0)
+# TRIE matched word #6, continuing
+# 4 <ABCD> <> | 20: END(0)
# Match successful!
# %MATCHED%
# Freeing REx: "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)"
@@ -183,7 +186,6 @@ minlen 3
EXACT <ABC>
TRIEC-EXACT
[A-EGP]
-only one match left: #6 <D>
S:4/10
W:6
L:1/1
diff --git a/regcomp.c b/regcomp.c
index 1a815c67fd..f665f0b5d5 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -878,6 +878,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
U32 state;
SV *sv=sv_newmortal();
int colwidth= widecharmap ? 6 : 4;
+ U16 word;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_DUMP_TRIE;
@@ -947,6 +948,13 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
}
PerlIO_printf( Perl_debug_log, "\n" );
}
+ PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
+ for (word=1; word <= trie->wordcount; word++) {
+ PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
+ (int)word, (int)(trie->wordinfo[word].prev),
+ (int)(trie->wordinfo[word].len));
+ }
+ PerlIO_printf(Perl_debug_log, "\n" );
}
/*
Dumps a fully constructed but uncompressed trie in list form.
@@ -1077,6 +1085,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
#endif
+
/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
startbranch: the first branch in the whole branch sequence
first : start branch of sequence of branch-exact nodes.
@@ -1257,8 +1266,6 @@ is the recommended Unicode-aware way of saying
U16 dupe= trie->states[ state ].wordnum; \
regnode * const noper_next = regnext( noper ); \
\
- if (trie->wordlen) \
- trie->wordlen[ curword ] = wordlen; \
DEBUG_r({ \
/* store the word for dumping */ \
SV* tmp; \
@@ -1270,6 +1277,9 @@ is the recommended Unicode-aware way of saying
}); \
\
curword++; \
+ trie->wordinfo[curword].prev = 0; \
+ trie->wordinfo[curword].len = wordlen; \
+ trie->wordinfo[curword].accept = state; \
\
if ( noper_next < tail ) { \
if (!trie->jump) \
@@ -1282,16 +1292,11 @@ is the recommended Unicode-aware way of saying
} \
\
if ( dupe ) { \
- /* So it's a dupe. This means we need to maintain a */\
- /* linked-list from the first to the next. */\
- /* we only allocate the nextword buffer when there */\
- /* a dupe, so first time we have to do the allocation */\
- if (!trie->nextword) \
- trie->nextword = (U16 *) \
- PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
- while ( trie->nextword[dupe] ) \
- dupe= trie->nextword[dupe]; \
- trie->nextword[dupe]= curword; \
+ /* It's a dupe. Pre-insert into the wordinfo[].prev */\
+ /* chain, so that when the bits of chain are later */\
+ /* linked together, the dups appear in the chain */\
+ trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
+ trie->wordinfo[dupe].prev = curword; \
} else { \
/* we haven't inserted this word yet. */ \
trie->states[ state ].wordnum = curword; \
@@ -1329,6 +1334,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
regnode *jumper = NULL;
regnode *nextbranch = NULL;
regnode *convert = NULL;
+ U32 *prev_states; /* temp array mapping each state to previous one */
/* we just use folder as a flag in utf8 */
const U8 * const folder = ( flags == EXACTF
? PL_fold
@@ -1364,6 +1370,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
if (!(UTF && folder))
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
+ trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
+ trie->wordcount+1, sizeof(reg_trie_wordinfo));
+
DEBUG_r({
trie_words = newAV();
});
@@ -1496,7 +1505,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
(int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
(int)trie->minlen, (int)trie->maxlen )
);
- trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
/*
We now know what we are dealing with in terms of unique chars and
@@ -1520,6 +1528,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
*/
+ Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
+ prev_states[1] = 0;
+
if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
/*
Second Pass -- Array Of Lists Representation
@@ -1590,6 +1601,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
}
if ( ! newstate ) {
newstate = next_alloc++;
+ prev_states[newstate] = state;
TRIE_LIST_PUSH( state, charid, newstate );
transcount++;
}
@@ -1773,6 +1785,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
if ( !trie->trans[ state + charid ].next ) {
trie->trans[ state + charid ].next = next_alloc;
trie->trans[ state ].check++;
+ prev_states[TRIE_NODENUM(next_alloc)]
+ = TRIE_NODENUM(state);
next_alloc += trie->uniquecharcount;
}
state = trie->trans[ state + charid ].next;
@@ -1920,9 +1934,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
PerlMemShared_realloc( trie->trans, trie->lasttrans
* sizeof(reg_trie_trans) );
- /* and now dump out the compressed format */
- DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
-
{ /* Modify the program and insert the new TRIE node*/
U8 nodetype =(U8)(flags & 0xFF);
char *str=NULL;
@@ -2052,6 +2063,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
break;
}
}
+ trie->prefixlen = (state-1);
if (str) {
regnode *n = convert+NODE_SZ_STR(convert);
NEXT_OFF(convert) = NODE_SZ_STR(convert);
@@ -2147,6 +2159,42 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
});
} /* end node insert */
+
+ /* Finish populating the prev field of the wordinfo array. Walk back
+ * from each accept state until we find another accept state, and if
+ * so, point the first word's .prev field at the second word. If the
+ * second already has a .prev field set, stop now. This will be the
+ * case either if we've already processed that word's accept state,
+ * or that that state had multiple words, and the overspill words
+ * were already linked up earlier.
+ */
+ {
+ U16 word;
+ U32 state;
+ U16 prev;
+
+ for (word=1; word <= trie->wordcount; word++) {
+ prev = 0;
+ if (trie->wordinfo[word].prev)
+ continue;
+ state = trie->wordinfo[word].accept;
+ while (state) {
+ state = prev_states[state];
+ if (!state)
+ break;
+ prev = trie->states[state].wordnum;
+ if (prev)
+ break;
+ }
+ trie->wordinfo[word].prev = prev;
+ }
+ Safefree(prev_states);
+ }
+
+
+ /* and now dump out the compressed format */
+ DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
+
RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
#ifdef DEBUGGING
RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
@@ -9571,12 +9619,9 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx)
PerlMemShared_free(trie->trans);
if (trie->bitmap)
PerlMemShared_free(trie->bitmap);
- if (trie->wordlen)
- PerlMemShared_free(trie->wordlen);
if (trie->jump)
PerlMemShared_free(trie->jump);
- if (trie->nextword)
- PerlMemShared_free(trie->nextword);
+ PerlMemShared_free(trie->wordinfo);
/* do this last!!!! */
PerlMemShared_free(ri->data->data[n]);
}
diff --git a/regcomp.h b/regcomp.h
index 20b4401ed2..a20d6e11bd 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -586,6 +586,15 @@ struct _reg_trie_state {
} trans;
};
+/* info per word; indexed by wordnum */
+typedef struct {
+ U16 prev; /* previous word in acceptance chain; eg in
+ * zzz|abc|ab/ after matching the chars abc, the
+ * accepted word is #2, and the previous accepted
+ * word is #3 */
+ U32 len; /* how many chars long is this word? */
+ U32 accept; /* accept state for this word */
+} reg_trie_wordinfo;
typedef struct _reg_trie_state reg_trie_state;
@@ -603,15 +612,14 @@ struct _reg_trie_data {
reg_trie_state *states; /* state data */
reg_trie_trans *trans; /* array of transition elements */
char *bitmap; /* stclass bitmap */
- U32 *wordlen; /* array of lengths of words */
U16 *jump; /* optional 1 indexed array of offsets before tail
for the node following a given word. */
- U16 *nextword; /* optional 1 indexed array to support linked list
- of duplicate wordnums */
+ reg_trie_wordinfo *wordinfo; /* array of info per word */
U16 uniquecharcount; /* unique chars in trie (width of trans table) */
U32 startstate; /* initial state - used for common prefix optimisation */
STRLEN minlen; /* minimum length of words in trie - build/opt only? */
STRLEN maxlen; /* maximum length of words in trie - build/opt only? */
+ U32 prefixlen; /* #chars in common prefix */
U32 statecount; /* Build only - number of states in the states array
(including the unused zero state) */
U32 wordcount; /* Build only */
diff --git a/regexec.c b/regexec.c
index 7222efe8c7..4aa68efde0 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1736,7 +1736,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}
if ( word ) {
- U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
+ U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
if (!leftmost || lpos < leftmost) {
DEBUG_r(accepted_word=word);
leftmost= lpos;
@@ -1810,7 +1810,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
}
}
if ( aho->states[ state ].wordnum ) {
- U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
+ U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
if (!leftmost || lpos < leftmost) {
DEBUG_r(accepted_word=aho->states[ state ].wordnum);
leftmost = lpos;
@@ -2505,9 +2505,6 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
#define REPORT_CODE_OFF 32
-/* Make sure there is a test for this +1 options in re_tests */
-#define TRIE_INITAL_ACCEPT_BUFFLEN 4;
-
#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
@@ -3069,6 +3066,50 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
}
/* FALL THROUGH */
case TRIE:
+ /* the basic plan of execution of the trie is:
+ * At the beginning, run though all the states, and
+ * find the longest-matching word. Also remember the position
+ * of the shortest matching word. For example, this pattern:
+ * 1 2 3 4 5
+ * ab|a|x|abcd|abc
+ * when matched against the string "abcde", will generate
+ * accept states for all words except 3, with the longest
+ * matching word being 4, and the shortest being 1 (with
+ * the position being after char 1 of the string).
+ *
+ * Then for each matching word, in word order (i.e. 1,2,4,5),
+ * we run the remainder of the pattern; on each try setting
+ * the current position to the character following the word,
+ * returning to try the next word on failure.
+ *
+ * We avoid having to build a list of words at runtime by
+ * using a compile-time structure, wordinfo[].prev, which
+ * gives, for each word, the previous accepting word (if any).
+ * In the case above it would contain the mappings 1->2, 2->0,
+ * 3->0, 4->5, 5->1. We can use this table to generate, from
+ * the longest word (4 above), a list of all words, by
+ * following the list of prev pointers; this gives us the
+ * unordered list 4,5,1,2. Then given the current word we have
+ * just tried, we can go through the list and find the
+ * next-biggest word to try (so if we just failed on word 2,
+ * the next in the list is 4).
+ *
+ * Since at runtime we don't record the matching position in
+ * the string for each word, we have to work that out for
+ * each word we're about to process. The wordinfo table holds
+ * the character length of each word; given that we recorded
+ * at the start: the position of the shortest word and its
+ * length in chars, we just need to move the pointer the
+ * difference between the two char lengths. Depending on
+ * Unicode status and folding, that's cheap or expensive.
+ *
+ * This algorithm is optimised for the case where are only a
+ * small number of accept states, i.e. 0,1, or maybe 2.
+ * With lots of accepts states, and having to try all of them,
+ * it becomes quadratic on number of accept states to find all
+ * the next words.
+ */
+
{
/* what type of TRIE am I? (utf8 makes this contextual) */
DECL_TRIE_TYPE(scan);
@@ -3105,76 +3146,62 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
STRLEN len = 0;
STRLEN foldlen = 0;
U8 *uscan = (U8*)NULL;
- STRLEN bufflen=0;
- SV *sv_accept_buff = NULL;
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ U32 charcount = 0; /* how many input chars we have matched */
+ U32 accepted = 0; /* have we seen any accepting states? */
- ST.accepted = 0; /* how many accepting states we have seen */
ST.B = next;
ST.jump = trie->jump;
ST.me = scan;
- /*
- traverse the TRIE keeping track of all accepting states
- we transition through until we get to a failing node.
- */
+ ST.firstpos = NULL;
+ ST.longfold = FALSE; /* char longer if folded => it's harder */
+ ST.nextword = 0;
+
+ /* fully traverse the TRIE; note the position of the
+ shortest accept state and the wordnum of the longest
+ accept state */
while ( state && uc <= (U8*)PL_regeol ) {
U32 base = trie->states[ state ].trans.base;
UV uvc = 0;
U16 charid;
- /* We use charid to hold the wordnum as we don't use it
- for charid until after we have done the wordnum logic.
- We define an alias just so that the wordnum logic reads
- more naturally. */
-
-#define got_wordnum charid
- got_wordnum = trie->states[ state ].wordnum;
-
- if ( got_wordnum ) {
- if ( ! ST.accepted ) {
- ENTER;
- SAVETMPS; /* XXX is this necessary? dmq */
- bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
- sv_accept_buff=newSV(bufflen *
- sizeof(reg_trie_accepted) - 1);
- SvCUR_set(sv_accept_buff, 0);
- SvPOK_on(sv_accept_buff);
- sv_2mortal(sv_accept_buff);
- SAVETMPS;
- ST.accept_buff =
- (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
- }
- do {
- if (ST.accepted >= bufflen) {
- bufflen *= 2;
- ST.accept_buff =(reg_trie_accepted*)
- SvGROW(sv_accept_buff,
- bufflen * sizeof(reg_trie_accepted));
+ U16 wordnum;
+ wordnum = trie->states[ state ].wordnum;
+
+ if (wordnum) { /* it's an accept state */
+ if (!accepted) {
+ accepted = 1;
+ /* record first match position */
+ if (ST.longfold) {
+ ST.firstpos = (U8*)locinput;
+ ST.firstchars = 0;
}
- SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
- + sizeof(reg_trie_accepted));
-
-
- ST.accept_buff[ST.accepted].wordnum = got_wordnum;
- ST.accept_buff[ST.accepted].endpos = uc;
- ++ST.accepted;
- } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
+ else {
+ ST.firstpos = uc;
+ ST.firstchars = charcount;
+ }
+ }
+ if (!ST.nextword || wordnum < ST.nextword)
+ ST.nextword = wordnum;
+ ST.topword = wordnum;
}
-#undef got_wordnum
DEBUG_TRIE_EXECUTE_r({
DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
PerlIO_printf( Perl_debug_log,
- "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
+ "%*s %sState: %4"UVxf" Accepted: %c ",
2+depth * 2, "", PL_colors[4],
- (UV)state, (UV)ST.accepted );
+ (UV)state, (accepted ? 'Y' : 'N'));
});
+ /* read a char and goto next state */
if ( base ) {
REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
uscan, len, uvc, charid, foldlen,
foldbuf, uniflags);
-
+ charcount++;
+ if (foldlen>0)
+ ST.longfold = TRUE;
if (charid &&
(base + charid > trie->uniquecharcount )
&& (base + charid - 1 - trie->uniquecharcount
@@ -3200,77 +3227,38 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
charid, uvc, (UV)state, PL_colors[5] );
);
}
- if (!ST.accepted )
+ if (!accepted)
sayNO;
+ /* calculate total number of accept states */
+ {
+ U16 w = ST.topword;
+ accepted = 0;
+ while (w) {
+ w = trie->wordinfo[w].prev;
+ accepted++;
+ }
+ ST.accepted = accepted;
+ }
+
DEBUG_EXECUTE_r(
PerlIO_printf( Perl_debug_log,
"%*s %sgot %"IVdf" possible matches%s\n",
REPORT_CODE_OFF + depth * 2, "",
PL_colors[4], (IV)ST.accepted, PL_colors[5] );
);
+ goto trie_first_try; /* jump into the fail handler */
}}
- goto trie_first_try; /* jump into the fail handler */
/* NOTREACHED */
- case TRIE_next_fail: /* we failed - try next alterative */
+
+ case TRIE_next_fail: /* we failed - try next alternative */
if ( ST.jump) {
REGCP_UNWIND(ST.cp);
for (n = *PL_reglastparen; n > ST.lastparen; n--)
PL_regoffs[n].end = -1;
*PL_reglastparen = n;
}
- trie_first_try:
- if (do_cutgroup) {
- do_cutgroup = 0;
- no_final = 0;
- }
-
- if ( ST.jump) {
- ST.lastparen = *PL_reglastparen;
- REGCP_SET(ST.cp);
- }
- if ( ST.accepted == 1 ) {
- /* only one choice left - just continue */
- DEBUG_EXECUTE_r({
- AV *const trie_words
- = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
- SV ** const tmp = av_fetch( trie_words,
- ST.accept_buff[ 0 ].wordnum-1, 0 );
- SV *sv= tmp ? sv_newmortal() : NULL;
-
- PerlIO_printf( Perl_debug_log,
- "%*s %sonly one match left: #%d <%s>%s\n",
- REPORT_CODE_OFF+depth*2, "", PL_colors[4],
- ST.accept_buff[ 0 ].wordnum,
- tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
- PL_colors[0], PL_colors[1],
- (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
- )
- : "not compiled under -Dr",
- PL_colors[5] );
- });
- PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
- /* in this case we free tmps/leave before we call regmatch
- as we wont be using accept_buff again. */
-
- locinput = PL_reginput;
- nextchr = UCHARAT(locinput);
- if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
- scan = ST.B;
- else
- scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
- if (!has_cutgroup) {
- FREETMPS;
- LEAVE;
- } else {
- ST.accepted--;
- PUSH_YES_STATE_GOTO(TRIE_next, scan);
- }
-
- continue; /* execute rest of RE */
- }
-
- if ( !ST.accepted-- ) {
+ if (!--ST.accepted) {
DEBUG_EXECUTE_r({
PerlIO_printf( Perl_debug_log,
"%*s %sTRIE failed...%s\n",
@@ -3278,86 +3266,129 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
PL_colors[4],
PL_colors[5] );
});
- FREETMPS;
- LEAVE;
sayNO_SILENT;
- /*NOTREACHED*/
- }
+ }
+ {
+ /* Find next-highest word to process. Note that this code
+ * is O(N^2) per trie run (O(N) per branch), so keep tight */
+ register U32 min = 0;
+ register U32 word;
+ register U16 const nextword = ST.nextword;
+ register reg_trie_wordinfo * const wordinfo
+ = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
+ for (word=ST.topword; word; word=wordinfo[word].prev) {
+ if (word > nextword && (!min || word < min))
+ min = word;
+ }
+ ST.nextword = min;
+ }
- /*
- There are at least two accepting states left. Presumably
- the number of accepting states is going to be low,
- typically two. So we simply scan through to find the one
- with lowest wordnum. Once we find it, we swap the last
- state into its place and decrement the size. We then try to
- match the rest of the pattern at the point where the word
- ends. If we succeed, control just continues along the
- regex; if we fail we return here to try the next accepting
- state
- */
+ trie_first_try:
+ if (do_cutgroup) {
+ do_cutgroup = 0;
+ no_final = 0;
+ }
- {
- U32 best = 0;
- U32 cur;
- for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
- DEBUG_TRIE_EXECUTE_r(
- PerlIO_printf( Perl_debug_log,
- "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
- REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
- (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
- ST.accept_buff[ cur ].wordnum, PL_colors[5] );
- );
+ if ( ST.jump) {
+ ST.lastparen = *PL_reglastparen;
+ REGCP_SET(ST.cp);
+ }
- if (ST.accept_buff[cur].wordnum <
- ST.accept_buff[best].wordnum)
- best = cur;
+ /* find start char of end of current word */
+ {
+ U32 chars; /* how many chars to skip */
+ U8 *uc = ST.firstpos;
+ reg_trie_data * const trie
+ = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
+
+ assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
+ >= ST.firstchars);
+ chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
+ - ST.firstchars;
+
+ if (ST.longfold) {
+ /* the hard option - fold each char in turn and find
+ * its folded length (which may be different */
+ U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
+ STRLEN foldlen;
+ STRLEN len;
+ U8 uvc;
+ U8 *uscan;
+
+ while (chars) {
+ if (do_utf8) {
+ uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
+ uniflags);
+ uc += len;
+ }
+ else {
+ uvc = *uc;
+ uc++;
+ }
+ uvc = to_uni_fold(uvc, foldbuf, &foldlen);
+ uscan = foldbuf;
+ while (foldlen) {
+ if (!--chars)
+ break;
+ uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
+ uniflags);
+ uscan += len;
+ foldlen -= len;
+ }
+ }
}
+ else {
+ if (do_utf8)
+ while (chars--)
+ uc += UTF8SKIP(uc);
+ else
+ uc += chars;
+ }
+ PL_reginput = (char *)uc;
+ }
- DEBUG_EXECUTE_r({
- AV *const trie_words
- = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
- SV ** const tmp = av_fetch( trie_words,
- ST.accept_buff[ best ].wordnum - 1, 0 );
- regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
- ST.B :
- ST.me + ST.jump[ST.accept_buff[best].wordnum];
- SV *sv= tmp ? sv_newmortal() : NULL;
-
- PerlIO_printf( Perl_debug_log,
- "%*s %strying alternation #%d <%s> at node #%d %s\n",
- REPORT_CODE_OFF+depth*2, "", PL_colors[4],
- ST.accept_buff[best].wordnum,
- tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
- PL_colors[0], PL_colors[1],
- (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
- ) : "not compiled under -Dr",
- REG_NODE_NUM(nextop),
- PL_colors[5] );
- });
+ scan = (ST.jump && ST.jump[ST.nextword])
+ ? ST.me + ST.jump[ST.nextword]
+ : ST.B;
- if ( best<ST.accepted ) {
- reg_trie_accepted tmp = ST.accept_buff[ best ];
- ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
- ST.accept_buff[ ST.accepted ] = tmp;
- best = ST.accepted;
- }
- PL_reginput = (char *)ST.accept_buff[ best ].endpos;
- if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
- scan = ST.B;
- } else {
- scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
- }
- PUSH_YES_STATE_GOTO(TRIE_next, scan);
- /* NOTREACHED */
+ DEBUG_EXECUTE_r({
+ PerlIO_printf( Perl_debug_log,
+ "%*s %sTRIE matched word #%d, continuing%s\n",
+ REPORT_CODE_OFF+depth*2, "",
+ PL_colors[4],
+ ST.nextword,
+ PL_colors[5]
+ );
+ });
+
+ if (ST.accepted > 1 || has_cutgroup) {
+ PUSH_STATE_GOTO(TRIE_next, scan);
+ /* NOTREACHED */
}
+ /* only one choice left - just continue */
+ DEBUG_EXECUTE_r({
+ AV *const trie_words
+ = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
+ SV ** const tmp = av_fetch( trie_words,
+ ST.nextword-1, 0 );
+ SV *sv= tmp ? sv_newmortal() : NULL;
+
+ PerlIO_printf( Perl_debug_log,
+ "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
+ REPORT_CODE_OFF+depth*2, "", PL_colors[4],
+ ST.nextword,
+ tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
+ PL_colors[0], PL_colors[1],
+ (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
+ )
+ : "not compiled under -Dr",
+ PL_colors[5] );
+ });
+
+ locinput = PL_reginput;
+ nextchr = UCHARAT(locinput);
+ continue; /* execute rest of RE */
/* NOTREACHED */
- case TRIE_next:
- /* we dont want to throw this away, see bug 57042*/
- if (oreplsv != GvSV(PL_replgv))
- sv_setsv(oreplsv, GvSV(PL_replgv));
- FREETMPS;
- LEAVE;
- sayYES;
#undef ST
case EXACT: {
diff --git a/regexp.h b/regexp.h
index 90e3406a96..a9dd2e1276 100644
--- a/regexp.h
+++ b/regexp.h
@@ -490,13 +490,6 @@ and check for NULL.
#define FBMrf_MULTILINE 1
-/* an accepting state/position*/
-struct _reg_trie_accepted {
- U8 *endpos;
- U16 wordnum;
-};
-typedef struct _reg_trie_accepted reg_trie_accepted;
-
/* some basic information about the current match that is created by
* Perl_regexec_flags and then passed to regtry(), regmatch() etc */
@@ -557,11 +550,15 @@ typedef struct regmatch_state {
U32 lastparen;
CHECKPOINT cp;
- reg_trie_accepted *accept_buff; /* accepting states we have seen */
- U32 accepted; /* how many accepting states we have seen */
+ U32 accepted; /* how many accepting states left */
U16 *jump; /* positive offsets from me */
regnode *B; /* node following the trie */
regnode *me; /* Which node am I - needed for jump tries*/
+ U8 *firstpos;/* pos in string of first trie match */
+ U32 firstchars;/* len in chars of firstpos from start */
+ U16 nextword;/* next word to try */
+ U16 topword; /* longest accepted word */
+ bool longfold;/* saw a fold with a 1->n char mapping */
} trie;
/* special types - these members are used to store state for special
diff --git a/t/op/svleak.t b/t/op/svleak.t
index 7b1f8f08c7..07c2efcb71 100644
--- a/t/op/svleak.t
+++ b/t/op/svleak.t
@@ -70,7 +70,4 @@ sub STORE { $_[0]->[$_[1]] = $_[2] }
# [perl #74484] repeated tries leaked SVs on the tmps stack
-{
- local $TODO = 'not fixed yet';
- leak_expr(5, 0, q{"YYYYYa" =~ /.+?(a(.+?)|b)/ }, "trie leak");
-}
+leak_expr(5, 0, q{"YYYYYa" =~ /.+?(a(.+?)|b)/ }, "trie leak");