diff options
author | Daisuke Nojiri <dnojiri@google.com> | 2014-02-08 17:03:15 -0800 |
---|---|---|
committer | chrome-internal-fetch <chrome-internal-fetch@google.com> | 2014-02-12 19:40:52 +0000 |
commit | d3facbd92fe4e3f9815a9c4896bf2d5b31e51899 (patch) | |
tree | 30211686ce73766e052cc6dfac9a167eec441039 | |
parent | a78c59e4acbbd5b85c221b477b2db43f5e5d679b (diff) | |
download | chrome-ec-d3facbd92fe4e3f9815a9c4896bf2d5b31e51899.tar.gz |
Optimize memcpy
This speeds up memcpy by copying a word at a time if the source and destination
addresses are congruent modulo 4 (i.e. they share the same offset from a word boundary). That is, if n and m are positive integers:
4n -> 4m: aligned, 4x speed.
4n -> 4m+1: misaligned.
4n+1 -> 4m+1: aligned in mod 4, 4x speed.
Ran the unit test on Peppy:
> runtest
...
Running test_memcpy... (speed gain: 120300 -> 38103 us) OK
...
Ran make buildall -j:
...
Running test_memcpy... (speed gain: 2084 -> 549 us) OK
...
Note that the misaligned case is also optimized: the unit test runs in 298 us on Peppy,
whereas it takes about 475 us with the original memcpy.
TEST=Described above.
BUG=chrome-os-partner:23720
BRANCH=none
Signed-off-by: Daisuke Nojiri <dnojiri@chromium.org>
Change-Id: Ic12260451c5efd0896d6353017cd45d29cb672db
Tested-by: Daisuke Nojiri <dnojiri@google.com>
Reviewed-on: https://chromium-review.googlesource.com/185618
Reviewed-by: Randall Spangler <rspangler@chromium.org>
Reviewed-by: Vincent Palatin <vpalatin@chromium.org>
Commit-Queue: Daisuke Nojiri <dnojiri@google.com>
-rw-r--r-- | common/util.c | 42 | ||||
-rw-r--r-- | test/utils.c | 52 |
2 files changed, 87 insertions, 7 deletions
diff --git a/common/util.c b/common/util.c index 32439edaaa..628aef9e82 100644 --- a/common/util.c +++ b/common/util.c @@ -173,16 +173,44 @@ int memcmp(const void *s1, const void *s2, int len) void *memcpy(void *dest, const void *src, int len) { - /* - * TODO(crosbug.com/p/23720): if src/dest are aligned, copy a word at a - * time instead. - */ char *d = (char *)dest; const char *s = (const char *)src; - while (len > 0) { - *(d++) = *(s++); - len--; + uint32_t *dw; + const uint32_t *sw; + char *head; + char * const tail = (char *)dest + len; + /* Set 'body' to the last word boundary */ + uint32_t * const body = (uint32_t *)((uintptr_t)tail & ~3); + + if (((uintptr_t)dest & 3) != ((uintptr_t)src & 3)) { + /* Misaligned. no body, no tail. */ + head = tail; + } else { + /* Aligned */ + if ((uintptr_t)tail < (((uintptr_t)d + 3) & ~3)) + /* len is shorter than the first word boundary */ + head = tail; + else + /* Set 'head' to the first word boundary */ + head = (char *)(((uintptr_t)d + 3) & ~3); } + + /* Copy head */ + while (d < head) + *(d++) = *(s++); + + /* Copy body */ + dw = (uint32_t *)d; + sw = (uint32_t *)s; + while (dw < body) + *(dw++) = *(sw++); + + /* Copy tail */ + d = (char *)dw; + s = (const char *)sw; + while (d < tail) + *(d++) = *(s++); + return dest; } diff --git a/test/utils.c b/test/utils.c index 60d2c83a0f..1e58184977 100644 --- a/test/utils.c +++ b/test/utils.c @@ -83,6 +83,57 @@ static int test_memmove(void) return EC_SUCCESS; } +static int test_memcpy(void) +{ + int i; + timestamp_t t0, t1, t2, t3; + char *buf; + const int buf_size = 1000; + const int len = 400; + const int dest_offset = 500; + const int iteration = 1000; + + TEST_ASSERT(shared_mem_acquire(buf_size, &buf) == EC_SUCCESS); + + for (i = 0; i < len; ++i) + buf[i] = i & 0x7f; + for (i = len; i < buf_size; ++i) + buf[i] = 0; + + t0 = get_time(); + for (i = 0; i < iteration; ++i) + memcpy(buf + dest_offset + 1, buf, len); /* unaligned */ + t1 = get_time(); + TEST_ASSERT_ARRAY_EQ(buf 
+ dest_offset + 1, buf, len); + ccprintf(" (speed gain: %d ->", t1.val-t0.val); + + t2 = get_time(); + for (i = 0; i < iteration; ++i) + memcpy(buf + dest_offset, buf, len); /* aligned */ + t3 = get_time(); + ccprintf(" %d us) ", t3.val-t2.val); + TEST_ASSERT_ARRAY_EQ(buf + dest_offset, buf, len); + + /* Expected about 4x speed gain. Use 3x because it fluctuates */ + TEST_ASSERT((t1.val-t0.val) > (t3.val-t2.val) * 3); + + memcpy(buf + dest_offset + 1, buf + 1, len - 1); + TEST_ASSERT_ARRAY_EQ(buf + dest_offset + 1, buf + 1, len - 1); + + /* Test small copies */ + memcpy(buf + dest_offset, buf, 1); + TEST_ASSERT_ARRAY_EQ(buf + dest_offset, buf, 1); + memcpy(buf + dest_offset, buf, 4); + TEST_ASSERT_ARRAY_EQ(buf + dest_offset, buf, 4); + memcpy(buf + dest_offset + 1, buf, 1); + TEST_ASSERT_ARRAY_EQ(buf + dest_offset + 1, buf, 1); + memcpy(buf + dest_offset + 1, buf, 4); + TEST_ASSERT_ARRAY_EQ(buf + dest_offset + 1, buf, 4); + + shared_mem_release(buf); + return EC_SUCCESS; +} + static int test_strzcpy(void) { char dest[10]; @@ -305,6 +356,7 @@ void run_test(void) RUN_TEST(test_strtoi); RUN_TEST(test_parse_bool); RUN_TEST(test_memmove); + RUN_TEST(test_memcpy); RUN_TEST(test_strzcpy); RUN_TEST(test_strlen); RUN_TEST(test_strcasecmp); |