diff options
author | Eric Blake <ebb9@byu.net> | 2009-02-18 17:07:58 -0700 |
---|---|---|
committer | Eric Blake <ebb9@byu.net> | 2009-02-23 06:31:26 -0700 |
commit | 4da8364d3193dd976df9b89f87fe63bb2c82abd2 (patch) | |
tree | bc3de8c00ba99e37f680507de4519ade28aac6d0 | |
parent | d8726aac7acd47435ad3e4dedef44d01cd828dc1 (diff) | |
download | m4-4da8364d3193dd976df9b89f87fe63bb2c82abd2.tar.gz |
Speed up translit when from argument is short.
* modules/m4.c (translit): Use memchr2 when possible.
* tests/builtins.at (translit): Add tests.
* NEWS: Document this.
Signed-off-by: Eric Blake <ebb9@byu.net>
(cherry picked from commit 29b625aa941718e43cc04dfc217f314518bbc6d1)
-rw-r--r-- | ChangeLog | 7 | ||||
-rw-r--r-- | NEWS | 7 | ||||
-rw-r--r-- | modules/m4.c | 39 | ||||
-rw-r--r-- | tests/builtins.at | 27 |
4 files changed, 69 insertions, 11 deletions
@@ -1,3 +1,10 @@ +2009-02-23 Eric Blake <ebb9@byu.net> + + Speed up translit when from argument is short. + * modules/m4.c (translit): Use memchr2 when possible. + * tests/builtins.at (translit): Add tests. + * NEWS: Document this. + 2009-02-18 Eric Blake <ebb9@byu.net> Prefer buffer over byte operations. @@ -214,8 +214,6 @@ promoted to 2.0. be silenced by applying this patch: http://git.sv.gnu.org/gitweb/?p=autoconf.git;a=commitdiff;h=714eeee87 -** Improve the speed of the input engine. - ** Fix the `m4wrap' builtin to accumulate wrapped text in FIFO order, as required by POSIX. The manual mentions a way to restore the LIFO order present in earlier GNU M4 versions. NOTE: this change exposes a bug @@ -345,6 +343,11 @@ contains the following beta features that were deemed worth deferring until ** The `divert' and `undivert' builtins have been made more efficient when using temporary files for large diversions. +** The `translit' builtin has been made more efficient when the second + argument is short. + +** The input engine has been optimized for faster processing. + ** The command line option `--debugfile', introduced in 1.4.7, now treats its argument as optional, in order to allow setting the debug output back to stderr when used without an argument; and order is now diff --git a/modules/m4.c b/modules/m4.c index 1ad1ce1c..6844bf51 100644 --- a/modules/m4.c +++ b/modules/m4.c @@ -28,6 +28,7 @@ # include "m4private.h" #endif +#include "memchr2.h" #include "quotearg.h" #include "stdlib--.h" #include "tempname.h" @@ -1104,8 +1105,8 @@ M4BUILTIN_HANDLER (translit) const char *to; size_t from_len; size_t to_len; - char map[UCHAR_MAX + 1] = {0}; - char found[UCHAR_MAX + 1] = {0}; + char map[UCHAR_MAX + 1]; + char found[UCHAR_MAX + 1]; unsigned char ch; enum { ASIS, REPLACE, DELETE }; @@ -1118,26 +1119,46 @@ M4BUILTIN_HANDLER (translit) from = M4ARG (2); from_len = M4ARGLEN (2); - if (memchr (from, '-', from_len) != NULL) - { - from = m4_expand_ranges (from, &from_len, m4_arg_scratch (context)); - assert (from); - } to = M4ARG (3); to_len = M4ARGLEN (3); if (memchr (to, '-', to_len) != NULL) + to = m4_expand_ranges (to, &to_len, m4_arg_scratch (context)); + + /* If there are only one or two bytes to replace, it is faster to + use memchr2. Using expand_ranges does nothing unless there are + at least three bytes. */ + if (from_len <= 2) { - to = m4_expand_ranges (to, &to_len, m4_arg_scratch (context)); - assert (to); + const char *p; + size_t len = M4ARGLEN (1); + int second = from[from_len / 2]; + data = M4ARG (1); + while ((p = (char *) memchr2 (data, from[0], second, len))) + { + obstack_grow (obs, data, p - data); + len -= p - data + 1; + data = p + 1; + if (*p == from[0] && to_len) + obstack_1grow (obs, to[0]); + else if (*p == second && 1 < to_len) + obstack_1grow (obs, to[1]); + } + obstack_grow (obs, data, len); + return; } + if (memchr (from, '-', from_len) != NULL) + from = m4_expand_ranges (from, &from_len, m4_arg_scratch (context)); + /* Calling memchr(from) for each character in data is quadratic, since both strings can be arbitrarily long. Instead, create a from-to mapping in one pass of from, then use that map in one pass of data, for linear behavior. Traditional behavior is that only the first instance of a character in from is consulted, hence the found map. */ + memset (map, 0, sizeof map); + memset (found, 0, sizeof found); while (from_len--) { ch = *from++; diff --git a/tests/builtins.at b/tests/builtins.at index 46ce5482..765ebec8 100644 --- a/tests/builtins.at +++ b/tests/builtins.at @@ -1222,6 +1222,33 @@ AT_DATA([in], [[translit(`«abc~', `~-»') AT_CHECK_M4([in], [0], [[abc ]]) +dnl Validate short strings, which take a different code path. +AT_DATA([in], [[dnl +translit(`abcdeabcde', `a') +translit(`abcdeabcde', `ab') +translit(`abcdeabcde', `a', `f') +translit(`abcdeabcde', `aa', `fg') +translit(`abcdeabcde', `a', `fg') +translit(`abcdeabcde', `ab', `f') +translit(`abcdeabcde', `ab', `fg') +translit(`abcdeabcde', `ab', `ba') +translit(`abcdeabcde', `e', `f') +translit(`abc', `', `cde') +translit(`', `a', `bc') +]]) +AT_CHECK_M4([in], [0], [[bcdebcde +cdecde +fbcdefbcde +fbcdefbcde +fbcdefbcde +fcdefcde +fgcdefgcde +bacdebacde +abcdfabcdf +abc + +]]) + AT_CLEANUP |