From 56d0bc72cdefaa06deec1030f039f43f967cf603 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= Date: Wed, 6 Oct 2010 22:53:59 +0200 Subject: (memxor3): Optimized. (memxor3_common_alignment): New function. (memxor3_different_alignment_b): New function. (memxor3_different_alignment_ab): New function. (memxor3_different_alignment_all): New function. Rev: nettle/memxor.c:1.3 --- memxor.c | 223 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 193 insertions(+), 30 deletions(-) (limited to 'memxor.c') diff --git a/memxor.c b/memxor.c index df69bb41..e4a9a5c2 100644 --- a/memxor.c +++ b/memxor.c @@ -23,7 +23,8 @@ * MA 02111-1307, USA. */ -/* Implementation inspired by memcmp in glibc, contributed to the FSF by Torbjorn Granlund. +/* Implementation inspired by memcmp in glibc, contributed to the FSF + by Torbjorn Granlund. */ #if HAVE_CONFIG_H @@ -34,7 +35,6 @@ #include "memxor.h" -#if 1 typedef unsigned long int word_t; #if SIZEOF_LONG & (SIZEOF_LONG - 1) @@ -55,16 +55,19 @@ typedef unsigned long int word_t; static void memxor_common_alignment (word_t *dst, const word_t *src, size_t n) { - size_t i; - /* FIXME: Unroll four times, like memcmp? */ - i = n & 1; - if (i) - dst[0] ^= src[0]; + /* FIXME: Require n > 0? */ + /* FIXME: Unroll four times, like memcmp? Probably not worth the + effort. */ - for (; i < n; i += 2) + if (n & 1) { - dst[i] ^= src[i]; - dst[i+1] ^= src[i+1]; + *dst++ ^= *src++; + n--; + } + for (; n >= 2; dst += 2, src += 2, n -= 2) + { + dst[0] ^= src[0]; + dst[1] ^= src[1]; } } @@ -83,7 +86,7 @@ memxor_different_alignment (word_t *dst, const uint8_t *src, size_t n) shl = CHAR_BIT * offset; shr = CHAR_BIT * (sizeof(word_t) - offset); - src_word = (const word_t *) (src - offset); + src_word = (const word_t *) ((uintptr_t) src & -SIZEOF_LONG); /* FIXME: Unroll four times, like memcmp? */ i = n & 1; @@ -91,7 +94,6 @@ memxor_different_alignment (word_t *dst, const uint8_t *src, size_t n) if (i) { s1 = src_word[0]; - s0 = src_word[1]; dst[0] ^= MERGE (s1, shl, s0, shr); } @@ -104,6 +106,9 @@ memxor_different_alignment (word_t *dst, const uint8_t *src, size_t n) } } +/* Performance, Intel SU1400 (x86_64): 0.25 cycles/byte aligned, 0.45 + cycles/byte unaligned. */ + /* XOR LEN bytes starting at SRCADDR onto DESTADDR. Result undefined if the source overlaps with the destination. Return DESTADDR. */ uint8_t * @@ -113,8 +118,6 @@ memxor(uint8_t *dst, const uint8_t *src, size_t n) if (n >= WORD_T_THRESH) { - size_t left_over; - /* There are at least some bytes to compare. No need to test for N == 0 in this alignment loop. */ while (ALIGN_OFFSET (dst)) @@ -127,35 +130,195 @@ memxor(uint8_t *dst, const uint8_t *src, size_t n) else memxor_common_alignment ((word_t *) dst, (const word_t *) src, n / sizeof(word_t)); - left_over = n % sizeof(word_t); - dst += n - left_over; - src += n - left_over; - n = left_over; + dst += n & -SIZEOF_LONG; + src += n & -SIZEOF_LONG; + n = n & (SIZEOF_LONG - 1); } for (; n > 0; n--) *dst++ ^= *src++; return orig_dst; } -#else -uint8_t * -memxor(uint8_t *dst, const uint8_t *src, size_t n) + + +/* XOR word-aligned areas. n is the number of words, not bytes. */ +static void +memxor3_common_alignment (word_t *dst, + const word_t *a, const word_t *b, size_t n) { - size_t i; - for (i = 0; i 0? */ + while (n-- > 0) + dst[n] = a[n] ^ b[n]; +} - return dst; +static void +memxor3_different_alignment_b (word_t *dst, + const word_t *a, const uint8_t *b, unsigned offset, size_t n) +{ + int shl, shr; + const word_t *b_word; + + word_t s0, s1; + + shl = CHAR_BIT * offset; + shr = CHAR_BIT * (sizeof(word_t) - offset); + + b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG); + + if (n & 1) + { + n--; + s1 = b_word[n]; + s0 = b_word[n+1]; + dst[n] = a[n] ^ MERGE (s1, shl, s0, shr); + } + else + s1 = b_word[n]; + + while (n > 0) + { + n -= 2; + s0 = b_word[n+1]; + dst[n+1] = a[n+1] ^ MERGE(s0, shl, s1, shr); + s1 = b_word[n]; + dst[n] = a[n] ^ MERGE(s1, shl, s0, shr); + } } -#endif +static void +memxor3_different_alignment_ab (word_t *dst, + const uint8_t *a, const uint8_t *b, + unsigned offset, size_t n) +{ + int shl, shr; + const word_t *a_word; + const word_t *b_word; + + word_t s0, s1; + + shl = CHAR_BIT * offset; + shr = CHAR_BIT * (sizeof(word_t) - offset); + + a_word = (const word_t *) ((uintptr_t) a & -SIZEOF_LONG); + b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG); + + if (n & 1) + { + n--; + s1 = a_word[n] ^ b_word[n]; + s0 = a_word[n+1] ^ b_word[n+1]; + dst[n] = MERGE (s1, shl, s0, shr); + } + else + s1 = a_word[n] ^ b_word[n]; + + while (n > 0) + { + n -= 2; + s0 = a_word[n+1] ^ b_word[n+1]; + dst[n+1] = MERGE(s0, shl, s1, shr); + s1 = a_word[n] ^ b_word[n]; + dst[n] = MERGE(s1, shl, s0, shr); + } +} + +static void +memxor3_different_alignment_all (word_t *dst, + const uint8_t *a, const uint8_t *b, + unsigned a_offset, unsigned b_offset, + size_t n) +{ + int al, ar, bl, br; + const word_t *a_word; + const word_t *b_word; + + word_t a0, a1, b0, b1; + + al = CHAR_BIT * a_offset; + ar = CHAR_BIT * (sizeof(word_t) - a_offset); + bl = CHAR_BIT * b_offset; + br = CHAR_BIT * (sizeof(word_t) - b_offset); + + a_word = (const word_t *) ((uintptr_t) a & -SIZEOF_LONG); + b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG); + + if (n & 1) + { + n--; + a1 = a_word[n]; a0 = a_word[n+1]; + b1 = b_word[n]; b0 = b_word[n+1]; + + dst[n] = MERGE (a1, al, a0, ar) ^ MERGE (b1, bl, b0, br); + } + else + { + a1 = a_word[n]; + b1 = b_word[n]; + } + + while (n > 0) + { + n -= 2; + a0 = a_word[n+1]; b0 = b_word[n+1]; + dst[n+1] = MERGE(a0, al, a1, ar) ^ MERGE(b0, bl, b1, br); + a1 = a_word[n]; b1 = b_word[n]; + dst[n] = MERGE(a1, al, a0, ar) ^ MERGE(b1, bl, b0, br); + } +} + +/* Current implementation processes data in descending order, to + support overlapping operation with one of the sources overlapping + the start of the destination area. This feature is used only + internally by cbc decrypt, and it is not advertised or documented + to nettle users. */ uint8_t * memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n) { - size_t i; - for (i = 0; i= WORD_T_THRESH) + { + unsigned i; + unsigned a_offset; + unsigned b_offset; + size_t nwords; + + for (i = ALIGN_OFFSET(dst + n); i > 0; i--) + { + n--; + dst[n] = a[n] ^ b[n]; + } + + a_offset = ALIGN_OFFSET(a + n); + b_offset = ALIGN_OFFSET(b + n); + + nwords = n / sizeof (word_t); + n %= sizeof (word_t); + + if (a_offset == b_offset) + { + if (!a_offset) + memxor3_common_alignment((word_t *) (dst + n), + (const word_t *) (a + n), + (const word_t *) (b + n), nwords); + else + memxor3_different_alignment_ab((word_t *) (dst + n), + a + n, b + n, a_offset, + nwords); + } + else if (!a_offset) + memxor3_different_alignment_b((word_t *) (dst + n), + (const word_t *) (a + n), b + n, + b_offset, nwords); + else if (!b_offset) + memxor3_different_alignment_b((word_t *) (dst + n), + (const word_t *) (b + n), a + n, + a_offset, nwords); + else + memxor3_different_alignment_all((word_t *) (dst + n), a + n, b + n, + a_offset, b_offset, nwords); + + } + while (n-- > 0) + dst[n] = a[n] ^ b[n]; return dst; } - -- cgit v1.2.1