summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNorihiro Tanaka <noritnk@kcn.ne.jp>2014-05-13 10:30:21 +0900
committerPaul Eggert <eggert@cs.ucla.edu>2014-09-27 20:56:42 -0700
commitfaed4fd7cee4e452f8d3bd27c87fbbd92d56ae2e (patch)
tree926de421044d2f205d9321625807f4ad7fafa3b6
parent3e09538374d9f03c735e86c98bbf41191f73906a (diff)
downloadgrep-faed4fd7cee4e452f8d3bd27c87fbbd92d56ae2e.tar.gz
dfa: speed-up at initial state
DFA state is always 0 until have found potential match. So we improve matching there by continuing to use the transition table. * src/dfa.c (skip_remains_mb): New function. (dfaexec): Speed-up at initial state.
-rw-r--r--src/dfa.c54
1 files changed, 35 insertions, 19 deletions
diff --git a/src/dfa.c b/src/dfa.c
index d903c7de..5930e542 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3237,6 +3237,24 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
return s1;
}
+/* The initial state may encounter a byte which is not a single byte character
+ nor the first byte of a multibyte character. But it is incorrect for the
+ initial state to accept such a byte. For example, in Shift JIS the regular
+ expression "\\" accepts the codepoint 0x5c, but should not accept the second
+ byte of the codepoint 0x815c. Then the initial state must skip the bytes
+ that are not a single byte character nor the first byte of a multibyte
+ character. */
+static unsigned char const *
+skip_remains_mb (struct dfa *d, unsigned char const *p,
+ unsigned char const *mbp, char const *end)
+{
+ wint_t wc;
+ while (mbp < p)
+ mbp += mbs_to_wchar (&wc, (char const *) mbp,
+ end - (char const *) mbp, d);
+ return mbp;
+}
+
/* Search through a buffer looking for a match to the given struct dfa.
Find the first occurrence of a string matching the regexp in the
buffer, and the shortest possible version thereof. Return a pointer to
@@ -3293,27 +3311,18 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (s == 0)
{
- /* The initial state may encounter a byte which is not
- a single byte character nor the first byte of a
- multibyte character. But it is incorrect for the
- initial state to accept such a byte. For example,
- in Shift JIS the regular expression "\\" accepts
- the codepoint 0x5c, but should not accept the second
- byte of the codepoint 0x815c. Then the initial
- state must skip the bytes that are not a single
- byte character nor the first byte of a multibyte
- character. */
- wint_t wc;
- while (mbp < p)
- mbp += mbs_to_wchar (&wc, (char const *) mbp,
- end - (char const *) mbp, d);
- p = mbp;
-
- if ((char *) p > end)
+ if (d->states[s].mbps.nelem == 0)
{
- p = NULL;
- goto done;
+ do
+ {
+ while (t[*p] == 0)
+ p++;
+ p = mbp = skip_remains_mb (d, p, mbp, end);
+ }
+ while (t[*p] == 0);
}
+ else
+ p = mbp = skip_remains_mb (d, p, mbp, end);
}
if (d->states[s].mbps.nelem == 0)
@@ -3341,6 +3350,13 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
else
{
+ if (s == 0 && (t = trans[s]) != NULL)
+ {
+ while (t[*p] == 0)
+ p++;
+ s = t[*p++];
+ }
+
while ((t = trans[s]) != NULL)
{
s1 = t[*p++];