diff options
author | Norihiro Tanaka <noritnk@kcn.ne.jp> | 2019-11-17 07:29:15 +0900 |
---|---|---|
committer | Jim Meyering <meyering@fb.com> | 2019-11-17 07:15:35 -0800 |
commit | 449f1c5805adba00ddd6edad30d96dbaeb8a91a3 (patch) | |
tree | 8df599c4c4dd3fde52dc5981f0e879053a266e03 | |
parent | cea97a849038754933dadce9db4ab9761b681c92 (diff) | |
download | grep-449f1c5805adba00ddd6edad30d96dbaeb8a91a3.tar.gz |
grep: improve grep -Fw performance in non-UTF8 multibyte locales
* src/searchutils.c (mb_goback): New parameter. All callers changed.
* src/search.h (mb_goback): Update prototype.
* src/kwsearch.c (Fexecute): Use mb_goback's MBCLEN to detect a
word-boundary even more efficiently.
-rw-r--r-- | src/dfasearch.c | 2 | ||||
-rw-r--r-- | src/kwsearch.c | 22 | ||||
-rw-r--r-- | src/search.h | 3 | ||||
-rw-r--r-- | src/searchutils.c | 24 |
4 files changed, 29 insertions, 22 deletions
diff --git a/src/dfasearch.c b/src/dfasearch.c index 3ebd25eb..6c95d8cb 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -279,7 +279,7 @@ EGexecute (void *vdc, char const *buf, size_t size, size_t *match_size, goto success; if (mb_start < beg) mb_start = beg; - if (mb_goback (&mb_start, match, buflim) == 0) + if (mb_goback (&mb_start, NULL, match, buflim) == 0) goto success; /* The matched line starts in the middle of a multibyte character. Perform the DFA search starting from the diff --git a/src/kwsearch.c b/src/kwsearch.c index f590d197..f121816e 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -161,6 +161,7 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size, bool longest; struct kwsearch *kwsearch = vcp; kwset_t kwset = kwsearch->kwset; + size_t mbclen; if (match_lines) mb_check = longest = false; @@ -194,7 +195,9 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size, return EGexecute (kwsearch->re, buf, size, match_size, start_ptr); } - if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0) + mbclen = 0; + if (mb_check + && mb_goback (&mb_start, &mbclen, beg + offset, buf + size) != 0) { /* We have matched a single byte that is not at the beginning of a multibyte character. mb_goback has advanced MB_START past that @@ -225,22 +228,19 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size, /* We need a preceding mb_start pointer. Use the beginning of line if there is a preceding newline. */ - if (mb_check) + if (mbclen == 0) { - char const *nl = memrchr (buf, eol, beg - buf); - mb_start = nl ? nl + 1 : buf; - } - else - { - char const *nl = memrchr (mb_start, eol, beg - mb_start); - if (nl) - mb_start = nl + 1; + char const *nl = memrchr (mb_start, eol, beg - mb_start); + if (nl) + mb_start = nl + 1; } /* Succeed if neither the preceding nor the following character is a word constituent. If the preceding is not, yet the following character IS a word constituent, keep trying with shorter matches. */ - if (! wordchar_prev (mb_start, beg, buf + size)) + if (mbclen > 0 + ? ! wordchar_next (beg - mbclen, buf + size) + : ! wordchar_prev (mb_start, beg, buf + size)) for (;;) { if (! wordchar_next (beg + len, buf + size)) diff --git a/src/search.h b/src/search.h index a782a0c1..d6010b95 100644 --- a/src/search.h +++ b/src/search.h @@ -52,7 +52,8 @@ extern size_t wordchars_size (char const *, char const *) _GL_ATTRIBUTE_PURE; extern size_t wordchar_next (char const *, char const *) _GL_ATTRIBUTE_PURE; extern size_t wordchar_prev (char const *, char const *, char const *) _GL_ATTRIBUTE_PURE; -extern ptrdiff_t mb_goback (char const **, char const *, char const *); +extern ptrdiff_t mb_goback (char const **, size_t *, char const *, + char const *); /* dfasearch.c */ extern void *GEAcompile (char *, size_t, reg_syntax_t); diff --git a/src/searchutils.c b/src/searchutils.c index 9bb35fd4..d6a36f1e 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -75,18 +75,21 @@ kwsinit (bool mb_trans) back from CUR to the previous boundary, where a "boundary" is the start of a multibyte character or is an error-encoding byte. The buffer ends at END (i.e., one past the address of the buffer's last - byte). If CUR is already at a boundary, return 0. If *MB_START is - greater than CUR, return the negative value CUR - *MB_START. + byte). If CUR is already at a boundary, return 0. If CUR is no + larger than *MB_START, return CUR - *MB_START without modifying + *MB_START or *MBCLEN. When returning zero, set *MB_START to CUR. When returning a - positive value, set *MB_START to the next boundary after CUR, or to - END if there is no such boundary. When returning a negative value, - leave *MB_START alone. */ + positive value, set *MB_START to the next boundary after CUR, + or to END if there is no such boundary, and set *MBCLEN to the + length of the preceding character. */ ptrdiff_t -mb_goback (char const **mb_start, char const *cur, char const *end) +mb_goback (char const **mb_start, size_t *mbclen, char const *cur, + char const *end) { const char *p = *mb_start; const char *p0 = p; + size_t clen; if (cur <= p) return cur - p; @@ -94,13 +97,14 @@ mb_goback (char const **mb_start, char const *cur, char const *end) if (localeinfo.using_utf8) { p = cur; + clen = 1; if (cur < end && (*cur & 0xc0) == 0x80) for (int i = 1; i <= 3; i++) if ((cur[-i] & 0xc0) != 0x80) { mbstate_t mbs = { 0 }; - size_t clen = mb_clen (cur - i, end - (cur - i), &mbs); + clen = mb_clen (cur - i, end - (cur - i), &mbs); if (i < clen && clen < (size_t) -2) { p0 = cur - i; @@ -114,7 +118,7 @@ mb_goback (char const **mb_start, char const *cur, char const *end) mbstate_t mbs = { 0 }; do { - size_t clen = mb_clen (p, end - p, &mbs); + clen = mb_clen (p, end - p, &mbs); if ((size_t) -2 <= clen) { @@ -130,6 +134,8 @@ mb_goback (char const **mb_start, char const *cur, char const *end) } *mb_start = p; + if (mbclen) + *mbclen = clen; return p == cur ? 0 : cur - p0; } @@ -192,6 +198,6 @@ wordchar_prev (char const *buf, char const *cur, char const *end) || (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2)) return sbwordchar[b]; char const *p = buf; - cur -= mb_goback (&p, cur, end); + cur -= mb_goback (&p, NULL, cur, end); return wordchar_next (cur, end); } |