diff options
author | Norihiro Tanaka <noritnk@kcn.ne.jp> | 2014-05-13 10:30:21 +0900 |
---|---|---|
committer | Paul Eggert <eggert@cs.ucla.edu> | 2014-09-27 20:56:42 -0700 |
commit | faed4fd7cee4e452f8d3bd27c87fbbd92d56ae2e (patch) | |
tree | 926de421044d2f205d9321625807f4ad7fafa3b6 | |
parent | 3e09538374d9f03c735e86c98bbf41191f73906a (diff) | |
download | grep-faed4fd7cee4e452f8d3bd27c87fbbd92d56ae2e.tar.gz |
dfa: speed-up at initial state
DFA state is always 0 until have found potential match. So we improve
matching there by continuing to use the transition table.
* src/dfa.c (skip_remains_mb): New function.
(dfaexec): Speed-up at initial state.
-rw-r--r-- | src/dfa.c | 54 |
1 files changed, 35 insertions, 19 deletions
@@ -3237,6 +3237,24 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, return s1; } +/* The initial state may encounter a byte which is not a single byte character + nor the first byte of a multibyte character. But it is incorrect for the + initial state to accept such a byte. For example, in Shift JIS the regular + expression "\\" accepts the codepoint 0x5c, but should not accept the second + byte of the codepoint 0x815c. Then the initial state must skip the bytes + that are not a single byte character nor the first byte of a multibyte + character. */ +static unsigned char const * +skip_remains_mb (struct dfa *d, unsigned char const *p, + unsigned char const *mbp, char const *end) +{ + wint_t wc; + while (mbp < p) + mbp += mbs_to_wchar (&wc, (char const *) mbp, + end - (char const *) mbp, d); + return mbp; +} + /* Search through a buffer looking for a match to the given struct dfa. Find the first occurrence of a string matching the regexp in the buffer, and the shortest possible version thereof. Return a pointer to @@ -3293,27 +3311,18 @@ dfaexec (struct dfa *d, char const *begin, char *end, if (s == 0) { - /* The initial state may encounter a byte which is not - a single byte character nor the first byte of a - multibyte character. But it is incorrect for the - initial state to accept such a byte. For example, - in Shift JIS the regular expression "\\" accepts - the codepoint 0x5c, but should not accept the second - byte of the codepoint 0x815c. Then the initial - state must skip the bytes that are not a single - byte character nor the first byte of a multibyte - character. */ - wint_t wc; - while (mbp < p) - mbp += mbs_to_wchar (&wc, (char const *) mbp, - end - (char const *) mbp, d); - p = mbp; - - if ((char *) p > end) + if (d->states[s].mbps.nelem == 0) { - p = NULL; - goto done; + do + { + while (t[*p] == 0) + p++; + p = mbp = skip_remains_mb (d, p, mbp, end); + } + while (t[*p] == 0); } + else + p = mbp = skip_remains_mb (d, p, mbp, end); } if (d->states[s].mbps.nelem == 0) @@ -3341,6 +3350,13 @@ dfaexec (struct dfa *d, char const *begin, char *end, } else { + if (s == 0 && (t = trans[s]) != NULL) + { + while (t[*p] == 0) + p++; + s = t[*p++]; + } + while ((t = trans[s]) != NULL) { s1 = t[*p++]; |