summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Norihiro Tanaka <noritnk@kcn.ne.jp> 2019-11-17 07:29:15 +0900
committer: Jim Meyering <meyering@fb.com> 2019-11-17 07:15:35 -0800
commit: 449f1c5805adba00ddd6edad30d96dbaeb8a91a3 (patch)
tree: 8df599c4c4dd3fde52dc5981f0e879053a266e03
parent: cea97a849038754933dadce9db4ab9761b681c92 (diff)
download: grep-449f1c5805adba00ddd6edad30d96dbaeb8a91a3.tar.gz
grep: improve grep -Fw performance in non-UTF8 multibyte locales
* src/searchutils.c (mb_goback): New parameter. All callers changed.
* src/search.h (mb_goback): Update prototype.
* src/kwsearch.c (Fexecute): Use mb_goback's MBCLEN to detect a word-boundary even more efficiently.
-rw-r--r-- src/dfasearch.c 2
-rw-r--r-- src/kwsearch.c 22
-rw-r--r-- src/search.h 3
-rw-r--r-- src/searchutils.c 24
4 files changed, 29 insertions, 22 deletions
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 3ebd25eb..6c95d8cb 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -279,7 +279,7 @@ EGexecute (void *vdc, char const *buf, size_t size, size_t *match_size,
goto success;
if (mb_start < beg)
mb_start = beg;
- if (mb_goback (&mb_start, match, buflim) == 0)
+ if (mb_goback (&mb_start, NULL, match, buflim) == 0)
goto success;
/* The matched line starts in the middle of a multibyte
character. Perform the DFA search starting from the
diff --git a/src/kwsearch.c b/src/kwsearch.c
index f590d197..f121816e 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -161,6 +161,7 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
bool longest;
struct kwsearch *kwsearch = vcp;
kwset_t kwset = kwsearch->kwset;
+ size_t mbclen;
if (match_lines)
mb_check = longest = false;
@@ -194,7 +195,9 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
return EGexecute (kwsearch->re, buf, size, match_size, start_ptr);
}
- if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0)
+ mbclen = 0;
+ if (mb_check
+ && mb_goback (&mb_start, &mbclen, beg + offset, buf + size) != 0)
{
/* We have matched a single byte that is not at the beginning of a
multibyte character. mb_goback has advanced MB_START past that
@@ -225,22 +228,19 @@ Fexecute (void *vcp, char const *buf, size_t size, size_t *match_size,
/* We need a preceding mb_start pointer. Use the beginning of line
if there is a preceding newline. */
- if (mb_check)
+ if (mbclen == 0)
{
- char const *nl = memrchr (buf, eol, beg - buf);
- mb_start = nl ? nl + 1 : buf;
- }
- else
- {
- char const *nl = memrchr (mb_start, eol, beg - mb_start);
- if (nl)
- mb_start = nl + 1;
+ char const *nl = memrchr (mb_start, eol, beg - mb_start);
+ if (nl)
+ mb_start = nl + 1;
}
/* Succeed if neither the preceding nor the following character is a
word constituent. If the preceding is not, yet the following
character IS a word constituent, keep trying with shorter matches. */
- if (! wordchar_prev (mb_start, beg, buf + size))
+ if (mbclen > 0
+ ? ! wordchar_next (beg - mbclen, buf + size)
+ : ! wordchar_prev (mb_start, beg, buf + size))
for (;;)
{
if (! wordchar_next (beg + len, buf + size))
diff --git a/src/search.h b/src/search.h
index a782a0c1..d6010b95 100644
--- a/src/search.h
+++ b/src/search.h
@@ -52,7 +52,8 @@ extern size_t wordchars_size (char const *, char const *) _GL_ATTRIBUTE_PURE;
extern size_t wordchar_next (char const *, char const *) _GL_ATTRIBUTE_PURE;
extern size_t wordchar_prev (char const *, char const *, char const *)
_GL_ATTRIBUTE_PURE;
-extern ptrdiff_t mb_goback (char const **, char const *, char const *);
+extern ptrdiff_t mb_goback (char const **, size_t *, char const *,
+ char const *);
/* dfasearch.c */
extern void *GEAcompile (char *, size_t, reg_syntax_t);
diff --git a/src/searchutils.c b/src/searchutils.c
index 9bb35fd4..d6a36f1e 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -75,18 +75,21 @@ kwsinit (bool mb_trans)
back from CUR to the previous boundary, where a "boundary" is the
start of a multibyte character or is an error-encoding byte. The
buffer ends at END (i.e., one past the address of the buffer's last
- byte). If CUR is already at a boundary, return 0. If *MB_START is
- greater than CUR, return the negative value CUR - *MB_START.
+ byte). If CUR is already at a boundary, return 0. If CUR is no
+ larger than *MB_START, return CUR - *MB_START without modifying
+ *MB_START or *MBCLEN.
When returning zero, set *MB_START to CUR. When returning a
- positive value, set *MB_START to the next boundary after CUR, or to
- END if there is no such boundary. When returning a negative value,
- leave *MB_START alone. */
+ positive value, set *MB_START to the next boundary after CUR,
+ or to END if there is no such boundary, and set *MBCLEN to the
+ length of the preceding character. */
ptrdiff_t
-mb_goback (char const **mb_start, char const *cur, char const *end)
+mb_goback (char const **mb_start, size_t *mbclen, char const *cur,
+ char const *end)
{
const char *p = *mb_start;
const char *p0 = p;
+ size_t clen;
if (cur <= p)
return cur - p;
@@ -94,13 +97,14 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
if (localeinfo.using_utf8)
{
p = cur;
+ clen = 1;
if (cur < end && (*cur & 0xc0) == 0x80)
for (int i = 1; i <= 3; i++)
if ((cur[-i] & 0xc0) != 0x80)
{
mbstate_t mbs = { 0 };
- size_t clen = mb_clen (cur - i, end - (cur - i), &mbs);
+ clen = mb_clen (cur - i, end - (cur - i), &mbs);
if (i < clen && clen < (size_t) -2)
{
p0 = cur - i;
@@ -114,7 +118,7 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
mbstate_t mbs = { 0 };
do
{
- size_t clen = mb_clen (p, end - p, &mbs);
+ clen = mb_clen (p, end - p, &mbs);
if ((size_t) -2 <= clen)
{
@@ -130,6 +134,8 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
}
*mb_start = p;
+ if (mbclen)
+ *mbclen = clen;
return p == cur ? 0 : cur - p0;
}
@@ -192,6 +198,6 @@ wordchar_prev (char const *buf, char const *cur, char const *end)
|| (localeinfo.using_utf8 && localeinfo.sbclen[b] != -2))
return sbwordchar[b];
char const *p = buf;
- cur -= mb_goback (&p, cur, end);
+ cur -= mb_goback (&p, NULL, cur, end);
return wordchar_next (cur, end);
}