summaryrefslogtreecommitdiff
path: root/lib/dfa.c
diff options
context:
space:
mode:
authorNorihiro Tanaka <noritnk@kcn.ne.jp>2020-09-12 18:51:55 -0700
committerPaul Eggert <eggert@cs.ucla.edu>2020-09-12 18:53:36 -0700
commitda0e8454a8e68035ef4b87dbb9097f85df6ece27 (patch)
tree004978b5276a5715f557e5a3da71402d520068d1 /lib/dfa.c
parentd468be5b5950bc5f2e0a5a4fbaeb0ea6a88c4c9f (diff)
downloadgnulib-da0e8454a8e68035ef4b87dbb9097f85df6ece27.tar.gz
dfa: use backward set in removal of epsilon closure
When removing in epsilon closure, the code searched all nodes sequentially, and this was slow for some cases. Build a backward set before search, and only check previous position with the set. Problem reported in <https://bugs.gnu.org/40634>. * lib/dfa.c (struct dfa): New member 'epsilon'. (addtok_mb): Check whether a pattern has epsilon node or not. (epsclosure): New arg BACKWORD; caller changed. When removing epsilon node and reconnecting, check only previous positions. Treat BEG as if it were character. (dfaanalyze): Build backward set.
Diffstat (limited to 'lib/dfa.c')
-rw-r--r--lib/dfa.c65
1 files changed, 51 insertions, 14 deletions
diff --git a/lib/dfa.c b/lib/dfa.c
index 1f0587a7a3..7c8eb66852 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -488,6 +488,7 @@ struct dfa
bool fast; /* The DFA is fast. */
token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
mbstate_t mbs; /* Multibyte conversion state. */
+ bool epsilon;
/* The following are valid only if MB_CUR_MAX > 1. */
@@ -1615,13 +1616,21 @@ addtok_mb (struct dfa *dfa, token t, char mbprop)
dfa->parse.depth--;
break;
- case BACKREF:
- dfa->fast = false;
+ case BEGLINE:
+ case ENDLINE:
+ case BEGWORD:
+ case ENDWORD:
+ case LIMWORD:
+ case NOTLIMWORD:
+ case EMPTY:
+ dfa->epsilon = true;
FALLTHROUGH;
+
default:
- dfa->nleaves++;
- FALLTHROUGH;
- case EMPTY:
+ if (t == BACKREF)
+ dfa->fast = false;
+ if (t != EMPTY)
+ dfa->nleaves++;
dfa->parse.depth++;
break;
}
@@ -2297,14 +2306,15 @@ state_index (struct dfa *d, position_set const *s, int context)
constraint. Repeat exhaustively until no funny positions are left.
S->elems must be large enough to hold the result. */
static void
-epsclosure (struct dfa const *d)
+epsclosure (struct dfa const *d, position_set *backword)
{
position_set tmp;
alloc_position_set (&tmp, d->nleaves);
for (idx_t i = 0; i < d->tindex; i++)
if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
&& d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
- && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
+ && d->tokens[i] != MBCSET && d->tokens[i] < CSET
+ && d->tokens[i] != BEG)
{
unsigned int constraint;
switch (d->tokens[i])
@@ -2327,16 +2337,21 @@ epsclosure (struct dfa const *d)
case NOTLIMWORD:
constraint = NOTLIMWORD_CONSTRAINT;
break;
- default:
+ case EMPTY:
constraint = NO_CONSTRAINT;
break;
+ default:
+ abort ();
}
delete (i, &d->follows[i]);
- for (idx_t j = 0; j < d->tindex; j++)
- if (i != j && d->follows[j].nelem > 0)
- replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
+ for (idx_t j = 0; j < backword[i].nelem; j++)
+ replace (&d->follows[backword[i].elems[j].index], i, &d->follows[i],
+ constraint, &tmp);
+ for (idx_t j = 0; j < d->follows[i].nelem; j++)
+ replace (&backword[d->follows[i].elems[j].index], i, &backword[i],
+ NO_CONSTRAINT, &tmp);
}
free (tmp.elems);
}
@@ -2643,6 +2658,7 @@ dfaanalyze (struct dfa *d, bool searchflag)
/* Firstpos and lastpos elements. */
position *firstpos = posalloc;
position *lastpos = firstpos + d->nleaves;
+ position_set *backword = NULL;
position pos;
position_set tmp;
@@ -2675,6 +2691,9 @@ dfaanalyze (struct dfa *d, bool searchflag)
alloc_position_set (&merged, d->nleaves);
d->follows = xcalloc (d->tindex, sizeof *d->follows);
+ if (d->epsilon)
+ backword = xcalloc (d->tindex, sizeof *backword);
+
for (idx_t i = 0; i < d->tindex; i++)
{
switch (d->tokens[i])
@@ -2712,6 +2731,17 @@ dfaanalyze (struct dfa *d, bool searchflag)
case CAT:
/* Every element in the firstpos of the second argument is in the
follow of every element in the lastpos of the first argument. */
+ if (d->epsilon)
+ {
+ tmp.nelem = stk[-2].nlastpos;
+ tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
+ position *p = firstpos - stk[-1].nfirstpos;
+ for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
+ {
+ merge (&tmp, &backword[p[j].index], &merged);
+ copy (&merged, &backword[p[j].index]);
+ }
+ }
{
tmp.nelem = stk[-1].nfirstpos;
tmp.elems = firstpos - stk[-1].nfirstpos;
@@ -2801,9 +2831,15 @@ dfaanalyze (struct dfa *d, bool searchflag)
#endif
}
- /* For each follow set that is the follow set of a real position, replace
- it with its epsilon closure. */
- epsclosure (d);
+ if (d->epsilon)
+ {
+ /* For each follow set that is the follow set of a real position,
+ replace it with its epsilon closure. */
+ epsclosure (d, backword);
+
+ for (idx_t i = 0; i < d->tindex; i++)
+ free (backword[i].elems);
+ }
dfaoptimize (d);
@@ -2865,6 +2901,7 @@ dfaanalyze (struct dfa *d, bool searchflag)
d->min_trcount++;
d->trcount = 0;
+ free (backword);
free (posalloc);
free (stkalloc);
free (merged.elems);