summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEric Blake <ebb9@byu.net>2009-02-18 17:07:58 -0700
committerEric Blake <ebb9@byu.net>2009-02-23 06:31:26 -0700
commit4da8364d3193dd976df9b89f87fe63bb2c82abd2 (patch)
treebc3de8c00ba99e37f680507de4519ade28aac6d0
parentd8726aac7acd47435ad3e4dedef44d01cd828dc1 (diff)
downloadm4-4da8364d3193dd976df9b89f87fe63bb2c82abd2.tar.gz
Speed up translit when from argument is short.
* modules/m4.c (translit): Use memchr2 when possible. * tests/builtins.at (translit): Add tests. * NEWS: Document this. Signed-off-by: Eric Blake <ebb9@byu.net> (cherry picked from commit 29b625aa941718e43cc04dfc217f314518bbc6d1)
-rw-r--r--ChangeLog7
-rw-r--r--NEWS7
-rw-r--r--modules/m4.c39
-rw-r--r--tests/builtins.at27
4 files changed, 69 insertions, 11 deletions
diff --git a/ChangeLog b/ChangeLog
index 9469fbe8..f052cb4f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2009-02-23 Eric Blake <ebb9@byu.net>
+
+ Speed up translit when from argument is short.
+ * modules/m4.c (translit): Use memchr2 when possible.
+ * tests/builtins.at (translit): Add tests.
+ * NEWS: Document this.
+
2009-02-18 Eric Blake <ebb9@byu.net>
Prefer buffer over byte operations.
diff --git a/NEWS b/NEWS
index 55abe349..d2d42b54 100644
--- a/NEWS
+++ b/NEWS
@@ -214,8 +214,6 @@ promoted to 2.0.
be silenced by applying this patch:
http://git.sv.gnu.org/gitweb/?p=autoconf.git;a=commitdiff;h=714eeee87
-** Improve the speed of the input engine.
-
** Fix the `m4wrap' builtin to accumulate wrapped text in FIFO order, as
required by POSIX. The manual mentions a way to restore the LIFO order
present in earlier GNU M4 versions. NOTE: this change exposes a bug
@@ -345,6 +343,11 @@ contains the following beta features that were deemed worth deferring until
** The `divert' and `undivert' builtins have been made more efficient
when using temporary files for large diversions.
+** The `translit' builtin has been made more efficient when the second
+ argument is short.
+
+** The input engine has been optimized for faster processing.
+
** The command line option `--debugfile', introduced in 1.4.7, now
treats its argument as optional, in order to allow setting the debug
output back to stderr when used without an argument; and order is now
diff --git a/modules/m4.c b/modules/m4.c
index 1ad1ce1c..6844bf51 100644
--- a/modules/m4.c
+++ b/modules/m4.c
@@ -28,6 +28,7 @@
# include "m4private.h"
#endif
+#include "memchr2.h"
#include "quotearg.h"
#include "stdlib--.h"
#include "tempname.h"
@@ -1104,8 +1105,8 @@ M4BUILTIN_HANDLER (translit)
const char *to;
size_t from_len;
size_t to_len;
- char map[UCHAR_MAX + 1] = {0};
- char found[UCHAR_MAX + 1] = {0};
+ char map[UCHAR_MAX + 1];
+ char found[UCHAR_MAX + 1];
unsigned char ch;
enum { ASIS, REPLACE, DELETE };
@@ -1118,26 +1119,46 @@ M4BUILTIN_HANDLER (translit)
from = M4ARG (2);
from_len = M4ARGLEN (2);
- if (memchr (from, '-', from_len) != NULL)
- {
- from = m4_expand_ranges (from, &from_len, m4_arg_scratch (context));
- assert (from);
- }
to = M4ARG (3);
to_len = M4ARGLEN (3);
if (memchr (to, '-', to_len) != NULL)
+ to = m4_expand_ranges (to, &to_len, m4_arg_scratch (context));
+
+ /* If there are only one or two bytes to replace, it is faster to
+ use memchr2. Using expand_ranges does nothing unless there are
+ at least three bytes. */
+ if (from_len <= 2)
{
- to = m4_expand_ranges (to, &to_len, m4_arg_scratch (context));
- assert (to);
+ const char *p;
+ size_t len = M4ARGLEN (1);
+ int second = from[from_len / 2];
+ data = M4ARG (1);
+ while ((p = (char *) memchr2 (data, from[0], second, len)))
+ {
+ obstack_grow (obs, data, p - data);
+ len -= p - data + 1;
+ data = p + 1;
+ if (*p == from[0] && to_len)
+ obstack_1grow (obs, to[0]);
+ else if (*p == second && 1 < to_len)
+ obstack_1grow (obs, to[1]);
+ }
+ obstack_grow (obs, data, len);
+ return;
}
+ if (memchr (from, '-', from_len) != NULL)
+ from = m4_expand_ranges (from, &from_len, m4_arg_scratch (context));
+
/* Calling memchr(from) for each character in data is quadratic,
since both strings can be arbitrarily long. Instead, create a
from-to mapping in one pass of from, then use that map in one
pass of data, for linear behavior. Traditional behavior is that
only the first instance of a character in from is consulted,
hence the found map. */
+ memset (map, 0, sizeof map);
+ memset (found, 0, sizeof found);
while (from_len--)
{
ch = *from++;
diff --git a/tests/builtins.at b/tests/builtins.at
index 46ce5482..765ebec8 100644
--- a/tests/builtins.at
+++ b/tests/builtins.at
@@ -1222,6 +1222,33 @@ AT_DATA([in], [[translit(`«abc~', `~-»')
AT_CHECK_M4([in], [0], [[abc
]])
+dnl Validate short strings, which take a different code path.
+AT_DATA([in], [[dnl
+translit(`abcdeabcde', `a')
+translit(`abcdeabcde', `ab')
+translit(`abcdeabcde', `a', `f')
+translit(`abcdeabcde', `aa', `fg')
+translit(`abcdeabcde', `a', `fg')
+translit(`abcdeabcde', `ab', `f')
+translit(`abcdeabcde', `ab', `fg')
+translit(`abcdeabcde', `ab', `ba')
+translit(`abcdeabcde', `e', `f')
+translit(`abc', `', `cde')
+translit(`', `a', `bc')
+]])
+AT_CHECK_M4([in], [0], [[bcdebcde
+cdecde
+fbcdefbcde
+fbcdefbcde
+fbcdefbcde
+fcdefcde
+fgcdefgcde
+bacdebacde
+abcdfabcdf
+abc
+
+]])
+
AT_CLEANUP