summaryrefslogtreecommitdiff
path: root/memxor.c
diff options
context:
space:
mode:
authorNiels Möller <nisse@lysator.liu.se>2010-10-06 22:53:59 +0200
committerNiels Möller <nisse@lysator.liu.se>2010-10-06 22:53:59 +0200
commit56d0bc72cdefaa06deec1030f039f43f967cf603 (patch)
tree7005f47c0391e3ec87ae915f79e21ba618ae8e9d /memxor.c
parent5a3187a32257ab2c9f9bfb0a742c616f41fb6f4d (diff)
downloadnettle-56d0bc72cdefaa06deec1030f039f43f967cf603.tar.gz
(memxor3): Optimized.
(memxor3_common_alignment): New function. (memxor3_different_alignment_b): New function. (memxor3_different_alignment_ab): New function. (memxor3_different_alignment_all): New function. Rev: nettle/memxor.c:1.3
Diffstat (limited to 'memxor.c')
-rw-r--r--memxor.c223
1 files changed, 193 insertions, 30 deletions
diff --git a/memxor.c b/memxor.c
index df69bb41..e4a9a5c2 100644
--- a/memxor.c
+++ b/memxor.c
@@ -23,7 +23,8 @@
* MA 02111-1307, USA.
*/
-/* Implementation inspired by memcmp in glibc, contributed to the FSF by Torbjorn Granlund.
+/* Implementation inspired by memcmp in glibc, contributed to the FSF
+ by Torbjorn Granlund.
*/
#if HAVE_CONFIG_H
@@ -34,7 +35,6 @@
#include "memxor.h"
-#if 1
typedef unsigned long int word_t;
#if SIZEOF_LONG & (SIZEOF_LONG - 1)
@@ -55,16 +55,19 @@ typedef unsigned long int word_t;
static void
memxor_common_alignment (word_t *dst, const word_t *src, size_t n)
{
- size_t i;
- /* FIXME: Unroll four times, like memcmp? */
- i = n & 1;
- if (i)
- dst[0] ^= src[0];
+ /* FIXME: Require n > 0? */
+ /* FIXME: Unroll four times, like memcmp? Probably not worth the
+ effort. */
- for (; i < n; i += 2)
+ if (n & 1)
{
- dst[i] ^= src[i];
- dst[i+1] ^= src[i+1];
+ *dst++ ^= *src++;
+ n--;
+ }
+ for (; n >= 2; dst += 2, src += 2, n -= 2)
+ {
+ dst[0] ^= src[0];
+ dst[1] ^= src[1];
}
}
@@ -83,7 +86,7 @@ memxor_different_alignment (word_t *dst, const uint8_t *src, size_t n)
shl = CHAR_BIT * offset;
shr = CHAR_BIT * (sizeof(word_t) - offset);
- src_word = (const word_t *) (src - offset);
+ src_word = (const word_t *) ((uintptr_t) src & -SIZEOF_LONG);
/* FIXME: Unroll four times, like memcmp? */
i = n & 1;
@@ -91,7 +94,6 @@ memxor_different_alignment (word_t *dst, const uint8_t *src, size_t n)
if (i)
{
s1 = src_word[0];
- s0 = src_word[1];
dst[0] ^= MERGE (s1, shl, s0, shr);
}
@@ -104,6 +106,9 @@ memxor_different_alignment (word_t *dst, const uint8_t *src, size_t n)
}
}
+/* Performance, Intel SU1400 (x86_64): 0.25 cycles/byte aligned, 0.45
+ cycles/byte unaligned. */
+
/* XOR LEN bytes starting at SRCADDR onto DESTADDR. Result undefined
if the source overlaps with the destination. Return DESTADDR. */
uint8_t *
@@ -113,8 +118,6 @@ memxor(uint8_t *dst, const uint8_t *src, size_t n)
if (n >= WORD_T_THRESH)
{
- size_t left_over;
-
/* There are at least some bytes to compare. No need to test
for N == 0 in this alignment loop. */
while (ALIGN_OFFSET (dst))
@@ -127,35 +130,195 @@ memxor(uint8_t *dst, const uint8_t *src, size_t n)
else
memxor_common_alignment ((word_t *) dst, (const word_t *) src, n / sizeof(word_t));
- left_over = n % sizeof(word_t);
- dst += n - left_over;
- src += n - left_over;
- n = left_over;
+ dst += n & -SIZEOF_LONG;
+ src += n & -SIZEOF_LONG;
+ n = n & (SIZEOF_LONG - 1);
}
for (; n > 0; n--)
*dst++ ^= *src++;
return orig_dst;
}
-#else
-uint8_t *
-memxor(uint8_t *dst, const uint8_t *src, size_t n)
+
+
+/* XOR word-aligned areas. n is the number of words, not bytes. */
+static void
+memxor3_common_alignment (word_t *dst,
+ const word_t *a, const word_t *b, size_t n)
{
- size_t i;
- for (i = 0; i<n; i++)
- dst[i] ^= src[i];
+ /* FIXME: Require n > 0? */
+ while (n-- > 0)
+ dst[n] = a[n] ^ b[n];
+}
- return dst;
+static void
+memxor3_different_alignment_b (word_t *dst,
+ const word_t *a, const uint8_t *b, unsigned offset, size_t n)
+{
+ int shl, shr;
+ const word_t *b_word;
+
+ word_t s0, s1;
+
+ shl = CHAR_BIT * offset;
+ shr = CHAR_BIT * (sizeof(word_t) - offset);
+
+ b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG);
+
+ if (n & 1)
+ {
+ n--;
+ s1 = b_word[n];
+ s0 = b_word[n+1];
+ dst[n] = a[n] ^ MERGE (s1, shl, s0, shr);
+ }
+ else
+ s1 = b_word[n];
+
+ while (n > 0)
+ {
+ n -= 2;
+ s0 = b_word[n+1];
+ dst[n+1] = a[n+1] ^ MERGE(s0, shl, s1, shr);
+ s1 = b_word[n];
+ dst[n] = a[n] ^ MERGE(s1, shl, s0, shr);
+ }
}
-#endif
+static void
+memxor3_different_alignment_ab (word_t *dst,
+ const uint8_t *a, const uint8_t *b,
+ unsigned offset, size_t n)
+{
+ int shl, shr;
+ const word_t *a_word;
+ const word_t *b_word;
+
+ word_t s0, s1;
+
+ shl = CHAR_BIT * offset;
+ shr = CHAR_BIT * (sizeof(word_t) - offset);
+
+ a_word = (const word_t *) ((uintptr_t) a & -SIZEOF_LONG);
+ b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG);
+
+ if (n & 1)
+ {
+ n--;
+ s1 = a_word[n] ^ b_word[n];
+ s0 = a_word[n+1] ^ b_word[n+1];
+ dst[n] = MERGE (s1, shl, s0, shr);
+ }
+ else
+ s1 = a_word[n] ^ b_word[n];
+
+ while (n > 0)
+ {
+ n -= 2;
+ s0 = a_word[n+1] ^ b_word[n+1];
+ dst[n+1] = MERGE(s0, shl, s1, shr);
+ s1 = a_word[n] ^ b_word[n];
+ dst[n] = MERGE(s1, shl, s0, shr);
+ }
+}
+
+static void
+memxor3_different_alignment_all (word_t *dst,
+ const uint8_t *a, const uint8_t *b,
+ unsigned a_offset, unsigned b_offset,
+ size_t n)
+{
+ int al, ar, bl, br;
+ const word_t *a_word;
+ const word_t *b_word;
+
+ word_t a0, a1, b0, b1;
+
+ al = CHAR_BIT * a_offset;
+ ar = CHAR_BIT * (sizeof(word_t) - a_offset);
+ bl = CHAR_BIT * b_offset;
+ br = CHAR_BIT * (sizeof(word_t) - b_offset);
+
+ a_word = (const word_t *) ((uintptr_t) a & -SIZEOF_LONG);
+ b_word = (const word_t *) ((uintptr_t) b & -SIZEOF_LONG);
+
+ if (n & 1)
+ {
+ n--;
+ a1 = a_word[n]; a0 = a_word[n+1];
+ b1 = b_word[n]; b0 = b_word[n+1];
+
+ dst[n] = MERGE (a1, al, a0, ar) ^ MERGE (b1, bl, b0, br);
+ }
+ else
+ {
+ a1 = a_word[n];
+ b1 = b_word[n];
+ }
+
+ while (n > 0)
+ {
+ n -= 2;
+ a0 = a_word[n+1]; b0 = b_word[n+1];
+ dst[n+1] = MERGE(a0, al, a1, ar) ^ MERGE(b0, bl, b1, br);
+ a1 = a_word[n]; b1 = b_word[n];
+ dst[n] = MERGE(a1, al, a0, ar) ^ MERGE(b1, bl, b0, br);
+ }
+}
+
+/* Current implementation processes data in descending order, to
+ support overlapping operation with one of the sources overlapping
+ the start of the destination area. This feature is used only
+ internally by cbc decrypt, and it is not advertised or documented
+ to nettle users. */
uint8_t *
memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
{
- size_t i;
- for (i = 0; i<n; i++)
- dst[i] = a[i] ^ b[i];
+ if (n >= WORD_T_THRESH)
+ {
+ unsigned i;
+ unsigned a_offset;
+ unsigned b_offset;
+ size_t nwords;
+
+ for (i = ALIGN_OFFSET(dst + n); i > 0; i--)
+ {
+ n--;
+ dst[n] = a[n] ^ b[n];
+ }
+
+ a_offset = ALIGN_OFFSET(a + n);
+ b_offset = ALIGN_OFFSET(b + n);
+
+ nwords = n / sizeof (word_t);
+ n %= sizeof (word_t);
+
+ if (a_offset == b_offset)
+ {
+ if (!a_offset)
+ memxor3_common_alignment((word_t *) (dst + n),
+ (const word_t *) (a + n),
+ (const word_t *) (b + n), nwords);
+ else
+ memxor3_different_alignment_ab((word_t *) (dst + n),
+ a + n, b + n, a_offset,
+ nwords);
+ }
+ else if (!a_offset)
+ memxor3_different_alignment_b((word_t *) (dst + n),
+ (const word_t *) (a + n), b + n,
+ b_offset, nwords);
+ else if (!b_offset)
+ memxor3_different_alignment_b((word_t *) (dst + n),
+ (const word_t *) (b + n), a + n,
+ a_offset, nwords);
+ else
+ memxor3_different_alignment_all((word_t *) (dst + n), a + n, b + n,
+ a_offset, b_offset, nwords);
+
+ }
+ while (n-- > 0)
+ dst[n] = a[n] ^ b[n];
return dst;
}
-