author     Jussi Kivilinna <jussi.kivilinna@iki.fi>    2018-01-09 18:40:25 +0200
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>    2018-01-09 18:40:25 +0200
commit     172ad09cbedc893f147180875335f4c525393c0b (patch)
tree       02f489abcd22683b0c39d86a962c0af0c81c18f5 /cipher/chacha20.c
parent     b9a471ccf5f02f89e25c7ccc29898d0e4e486099 (diff)
download   libgcrypt-172ad09cbedc893f147180875335f4c525393c0b.tar.gz
New ChaCha implementations
* cipher/Makefile.am: Remove 'chacha20-sse2-amd64.S', 'chacha20-ssse3-amd64.S',
'chacha20-avx2-amd64.S'; Add 'chacha20-amd64-ssse3.S', 'chacha20-amd64-avx2.S'.
* cipher/chacha20-amd64-avx2.S: New.
* cipher/chacha20-amd64-ssse3.S: New.
* cipher/chacha20-armv7-neon.S: Rewrite.
* cipher/chacha20-avx2-amd64.S: Remove.
* cipher/chacha20-sse2-amd64.S: Remove.
* cipher/chacha20-ssse3-amd64.S: Remove.
* cipher/chacha20.c (CHACHA20_INPUT_LENGTH, USE_SSE2, USE_NEON)
(ASM_EXTRA_STACK, chacha20_blocks_t, _gcry_chacha20_amd64_sse2_blocks)
(_gcry_chacha20_amd64_ssse3_blocks, _gcry_chacha20_amd64_avx2_blocks)
(_gcry_chacha20_armv7_neon_blocks, QROUND, QOUT, chacha20_core)
(chacha20_do_encrypt_stream): Remove.
(_gcry_chacha20_amd64_ssse3_blocks4, _gcry_chacha20_amd64_avx2_blocks8)
(_gcry_chacha20_armv7_neon_blocks4, ROTATE, XOR, PLUS, PLUSONE)
(QUARTERROUND, BUF_XOR_LE32): New.
(CHACHA20_context_s, chacha20_blocks, chacha20_keysetup)
(chacha20_encrypt_stream): Rewrite.
(chacha20_do_setkey): Adjust for new CHACHA20_context_s.
* configure.ac: Remove 'chacha20-sse2-amd64.lo', 'chacha20-ssse3-amd64.lo',
'chacha20-avx2-amd64.lo'; Add 'chacha20-amd64-ssse3.lo', 'chacha20-amd64-avx2.lo'.
--

Intel Core i7-4790K CPU @ 4.00GHz (x86_64/AVX2):

 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.319 ns/B    2988.5 MiB/s      1.28 c/B
     STREAM dec |     0.318 ns/B    2995.4 MiB/s      1.27 c/B

Intel Core i7-4790K CPU @ 4.00GHz (x86_64/SSSE3):

 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.633 ns/B    1507.4 MiB/s      2.53 c/B
     STREAM dec |     0.633 ns/B    1506.6 MiB/s      2.53 c/B

Intel Core i7-4790K CPU @ 4.00GHz (i386):

 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      2.05 ns/B     465.2 MiB/s      8.20 c/B
     STREAM dec |      2.04 ns/B     467.5 MiB/s      8.16 c/B

Cortex-A53 @ 1152Mhz (armv7/neon):

 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      5.29 ns/B     180.3 MiB/s      6.09 c/B
     STREAM dec |      5.29 ns/B     180.1 MiB/s      6.10 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
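(For reference, the cycles/byte column follows from nanosecs/byte times the clock
rate; e.g. for the AVX2 result, 0.319 ns/B * 4.00 GHz ~= 1.28 c/B, and for the
Cortex-A53, 5.29 ns/B * 1.152 GHz ~= 6.09 c/B.)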
Diffstat (limited to 'cipher/chacha20.c')
-rw-r--r--    cipher/chacha20.c    504
1 file changed, 225 insertions, 279 deletions
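
Usage note (not part of this patch): a minimal sketch of driving the ChaCha20
stream cipher through libgcrypt's public cipher API. The function name
chacha20_demo and the fixed 96-bit nonce are illustrative assumptions, and
library initialisation and error checking are omitted.

#include <stddef.h>
#include <gcrypt.h>

/* Encrypt LEN bytes of BUF in place with ChaCha20.  The stream backend
 * actually used (SSSE3, AVX2, NEON or the generic C chacha20_blocks)
 * is selected at setkey time from the detected hardware features. */
static void
chacha20_demo (const unsigned char key[32], const unsigned char nonce[12],
               unsigned char *buf, size_t len)
{
  gcry_cipher_hd_t hd;

  gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20, GCRY_CIPHER_MODE_STREAM, 0);
  gcry_cipher_setkey (hd, key, 32);
  gcry_cipher_setiv (hd, nonce, 12);           /* 96-bit nonce, block counter 0 */
  gcry_cipher_encrypt (hd, buf, len, NULL, 0); /* in-place stream XOR */
  gcry_cipher_close (hd);
}
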
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 613fa82a..ac6cc29e 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -1,5 +1,5 @@
/* chacha20.c - Bernstein's ChaCha20 cipher
- * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2014,2017,2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -20,16 +20,15 @@
* http://cr.yp.to/chacha.html
*/
-/* The code is based on salsa20.c and public-domain ChaCha implementations:
- * chacha-ref.c version 20080118
- * D. J. Bernstein
- * Public domain.
- * and
- * Andrew Moon
- * https://github.com/floodyberry/chacha-opt
+/*
+ * Based on D. J. Bernstein reference implementation at
+ * http://cr.yp.to/chacha.html:
+ *
+ * chacha-regs.c version 20080118
+ * D. J. Bernstein
+ * Public domain.
*/
-
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
@@ -46,295 +45,216 @@
#define CHACHA20_MIN_IV_SIZE 8 /* Bytes. */
#define CHACHA20_MAX_IV_SIZE 12 /* Bytes. */
#define CHACHA20_CTR_SIZE 16 /* Bytes. */
-#define CHACHA20_INPUT_LENGTH (CHACHA20_BLOCK_SIZE / 4)
-/* USE_SSE2 indicates whether to compile with Intel SSE2 code. */
-#undef USE_SSE2
-#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
- defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
-# define USE_SSE2 1
-#endif
/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
#undef USE_SSSE3
-#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
- defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
- defined(HAVE_GCC_INLINE_ASM_SSSE3)
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_SSSE3 1
#endif
/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
#undef USE_AVX2
-#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
- defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
- defined(ENABLE_AVX2_SUPPORT)
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
# define USE_AVX2 1
#endif
-/* USE_NEON indicates whether to enable ARM NEON assembly code. */
-#undef USE_NEON
+/* USE_ARMV7_NEON indicates whether to enable ARMv7 NEON assembly code. */
+#undef USE_ARMV7_NEON
#ifdef ENABLE_NEON_SUPPORT
# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
&& defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
&& defined(HAVE_GCC_INLINE_ASM_NEON)
-# define USE_NEON 1
+# define USE_ARMV7_NEON 1
# endif
-#endif /*ENABLE_NEON_SUPPORT*/
-
-
-struct CHACHA20_context_s;
-
+#endif
/* Assembly implementations use SystemV ABI, ABI conversion and additional
* stack to store XMM6-XMM15 needed on Win64. */
#undef ASM_FUNC_ABI
#undef ASM_EXTRA_STACK
-#if (defined(USE_SSE2) || defined(USE_SSSE3) || defined(USE_AVX2)) && \
- defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
# define ASM_FUNC_ABI __attribute__((sysv_abi))
-# define ASM_EXTRA_STACK (10 * 16)
#else
# define ASM_FUNC_ABI
-# define ASM_EXTRA_STACK 0
#endif
-typedef unsigned int (* chacha20_blocks_t)(u32 *state, const byte *src,
- byte *dst,
- size_t bytes) ASM_FUNC_ABI;
-
typedef struct CHACHA20_context_s
{
- u32 input[CHACHA20_INPUT_LENGTH];
- u32 pad[CHACHA20_INPUT_LENGTH];
- chacha20_blocks_t blocks;
+ u32 input[16];
+ unsigned char pad[CHACHA20_BLOCK_SIZE];
unsigned int unused; /* bytes in the pad. */
+ int use_ssse3:1;
+ int use_avx2:1;
+ int use_neon:1;
} CHACHA20_context_t;
-#ifdef USE_SSE2
-
-unsigned int _gcry_chacha20_amd64_sse2_blocks(u32 *state, const byte *in,
- byte *out,
- size_t bytes) ASM_FUNC_ABI;
-
-#endif /* USE_SSE2 */
-
#ifdef USE_SSSE3
-unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in,
- byte *out,
- size_t bytes) ASM_FUNC_ABI;
+unsigned int _gcry_chacha20_amd64_ssse3_blocks4(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks) ASM_FUNC_ABI;
#endif /* USE_SSSE3 */
#ifdef USE_AVX2
-unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in,
- byte *out,
- size_t bytes) ASM_FUNC_ABI;
+unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks) ASM_FUNC_ABI;
#endif /* USE_AVX2 */
-#ifdef USE_NEON
+#ifdef USE_ARMV7_NEON
-unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in,
- byte *out,
- size_t bytes) ASM_FUNC_ABI;
+unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks);
-#endif /* USE_NEON */
+#endif /* USE_ARMV7_NEON */
-static void chacha20_setiv (void *context, const byte * iv, size_t ivlen);
static const char *selftest (void);
+#define ROTATE(v,c) (rol(v,c))
+#define XOR(v,w) ((v) ^ (w))
+#define PLUS(v,w) ((u32)((v) + (w)))
+#define PLUSONE(v) (PLUS((v),1))
-#define QROUND(a,b,c,d) \
- do { \
- a += b; d = rol(d ^ a, 16); \
- c += d; b = rol(b ^ c, 12); \
- a += b; d = rol(d ^ a, 8); \
- c += d; b = rol(b ^ c, 7); \
- } while (0)
+#define QUARTERROUND(a,b,c,d) \
+ a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
+ c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
+ a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
+ c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
-#define QOUT(ai, bi, ci, di) \
- DO_OUT(ai); DO_OUT(bi); DO_OUT(ci); DO_OUT(di)
+#define BUF_XOR_LE32(dst, src, offset, x) \
+ buf_put_le32((dst) + (offset), buf_get_le32((src) + (offset)) ^ (x))
-
-#ifndef USE_SSE2
-ASM_FUNC_ABI static unsigned int
-chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes)
+static unsigned int
+chacha20_blocks (u32 *input, byte *dst, const byte *src, size_t nblks)
{
- u32 pad[CHACHA20_INPUT_LENGTH];
- u32 inp[CHACHA20_INPUT_LENGTH];
+ u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
unsigned int i;
- /* Note: 'bytes' must be multiple of 64 and not zero. */
-
- inp[0] = state[0];
- inp[1] = state[1];
- inp[2] = state[2];
- inp[3] = state[3];
- inp[4] = state[4];
- inp[5] = state[5];
- inp[6] = state[6];
- inp[7] = state[7];
- inp[8] = state[8];
- inp[9] = state[9];
- inp[10] = state[10];
- inp[11] = state[11];
- inp[12] = state[12];
- inp[13] = state[13];
- inp[14] = state[14];
- inp[15] = state[15];
-
- do
+ while (nblks)
{
- /* First round. */
- pad[0] = inp[0];
- pad[4] = inp[4];
- pad[8] = inp[8];
- pad[12] = inp[12];
- QROUND (pad[0], pad[4], pad[8], pad[12]);
- pad[1] = inp[1];
- pad[5] = inp[5];
- pad[9] = inp[9];
- pad[13] = inp[13];
- QROUND (pad[1], pad[5], pad[9], pad[13]);
- pad[2] = inp[2];
- pad[6] = inp[6];
- pad[10] = inp[10];
- pad[14] = inp[14];
- QROUND (pad[2], pad[6], pad[10], pad[14]);
- pad[3] = inp[3];
- pad[7] = inp[7];
- pad[11] = inp[11];
- pad[15] = inp[15];
- QROUND (pad[3], pad[7], pad[11], pad[15]);
-
- QROUND (pad[0], pad[5], pad[10], pad[15]);
- QROUND (pad[1], pad[6], pad[11], pad[12]);
- QROUND (pad[2], pad[7], pad[8], pad[13]);
- QROUND (pad[3], pad[4], pad[9], pad[14]);
-
- for (i = 2; i < 20 - 2; i += 2)
- {
- QROUND (pad[0], pad[4], pad[8], pad[12]);
- QROUND (pad[1], pad[5], pad[9], pad[13]);
- QROUND (pad[2], pad[6], pad[10], pad[14]);
- QROUND (pad[3], pad[7], pad[11], pad[15]);
-
- QROUND (pad[0], pad[5], pad[10], pad[15]);
- QROUND (pad[1], pad[6], pad[11], pad[12]);
- QROUND (pad[2], pad[7], pad[8], pad[13]);
- QROUND (pad[3], pad[4], pad[9], pad[14]);
- }
-
- QROUND (pad[0], pad[4], pad[8], pad[12]);
- QROUND (pad[1], pad[5], pad[9], pad[13]);
- QROUND (pad[2], pad[6], pad[10], pad[14]);
- QROUND (pad[3], pad[7], pad[11], pad[15]);
-
- if (src)
- {
-#define DO_OUT(idx) buf_put_le32(dst + (idx) * 4, \
- (pad[idx] + inp[idx]) ^ \
- buf_get_le32(src + (idx) * 4))
- /* Last round. */
- QROUND (pad[0], pad[5], pad[10], pad[15]);
- QOUT(0, 5, 10, 15);
- QROUND (pad[1], pad[6], pad[11], pad[12]);
- QOUT(1, 6, 11, 12);
- QROUND (pad[2], pad[7], pad[8], pad[13]);
- QOUT(2, 7, 8, 13);
- QROUND (pad[3], pad[4], pad[9], pad[14]);
- QOUT(3, 4, 9, 14);
-#undef DO_OUT
- }
- else
- {
-#define DO_OUT(idx) buf_put_le32(dst + (idx) * 4, pad[idx] + inp[idx])
- /* Last round. */
- QROUND (pad[0], pad[5], pad[10], pad[15]);
- QOUT(0, 5, 10, 15);
- QROUND (pad[1], pad[6], pad[11], pad[12]);
- QOUT(1, 6, 11, 12);
- QROUND (pad[2], pad[7], pad[8], pad[13]);
- QOUT(2, 7, 8, 13);
- QROUND (pad[3], pad[4], pad[9], pad[14]);
- QOUT(3, 4, 9, 14);
-#undef DO_OUT
- }
-
- /* Update counter. */
- inp[13] += (!++inp[12]);
-
- bytes -= CHACHA20_BLOCK_SIZE;
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+ x4 = input[4];
+ x5 = input[5];
+ x6 = input[6];
+ x7 = input[7];
+ x8 = input[8];
+ x9 = input[9];
+ x10 = input[10];
+ x11 = input[11];
+ x12 = input[12];
+ x13 = input[13];
+ x14 = input[14];
+ x15 = input[15];
+
+ for (i = 20; i > 0; i -= 2)
+ {
+ QUARTERROUND(x0, x4, x8, x12)
+ QUARTERROUND(x1, x5, x9, x13)
+ QUARTERROUND(x2, x6, x10, x14)
+ QUARTERROUND(x3, x7, x11, x15)
+ QUARTERROUND(x0, x5, x10, x15)
+ QUARTERROUND(x1, x6, x11, x12)
+ QUARTERROUND(x2, x7, x8, x13)
+ QUARTERROUND(x3, x4, x9, x14)
+ }
+
+ x0 = PLUS(x0, input[0]);
+ x1 = PLUS(x1, input[1]);
+ x2 = PLUS(x2, input[2]);
+ x3 = PLUS(x3, input[3]);
+ x4 = PLUS(x4, input[4]);
+ x5 = PLUS(x5, input[5]);
+ x6 = PLUS(x6, input[6]);
+ x7 = PLUS(x7, input[7]);
+ x8 = PLUS(x8, input[8]);
+ x9 = PLUS(x9, input[9]);
+ x10 = PLUS(x10, input[10]);
+ x11 = PLUS(x11, input[11]);
+ x12 = PLUS(x12, input[12]);
+ x13 = PLUS(x13, input[13]);
+ x14 = PLUS(x14, input[14]);
+ x15 = PLUS(x15, input[15]);
+
+ input[12] = PLUSONE(input[12]);
+ input[13] = PLUS(input[13], !input[12]);
+
+ BUF_XOR_LE32(dst, src, 0, x0);
+ BUF_XOR_LE32(dst, src, 4, x1);
+ BUF_XOR_LE32(dst, src, 8, x2);
+ BUF_XOR_LE32(dst, src, 12, x3);
+ BUF_XOR_LE32(dst, src, 16, x4);
+ BUF_XOR_LE32(dst, src, 20, x5);
+ BUF_XOR_LE32(dst, src, 24, x6);
+ BUF_XOR_LE32(dst, src, 28, x7);
+ BUF_XOR_LE32(dst, src, 32, x8);
+ BUF_XOR_LE32(dst, src, 36, x9);
+ BUF_XOR_LE32(dst, src, 40, x10);
+ BUF_XOR_LE32(dst, src, 44, x11);
+ BUF_XOR_LE32(dst, src, 48, x12);
+ BUF_XOR_LE32(dst, src, 52, x13);
+ BUF_XOR_LE32(dst, src, 56, x14);
+ BUF_XOR_LE32(dst, src, 60, x15);
+
+ src += CHACHA20_BLOCK_SIZE;
dst += CHACHA20_BLOCK_SIZE;
- src += (src) ? CHACHA20_BLOCK_SIZE : 0;
+ nblks--;
}
- while (bytes >= CHACHA20_BLOCK_SIZE);
-
- state[12] = inp[12];
- state[13] = inp[13];
/* burn_stack */
- return (2 * CHACHA20_INPUT_LENGTH * sizeof(u32) + 6 * sizeof(void *));
-}
-#endif /*!USE_SSE2*/
-
-#undef QROUND
-#undef QOUT
-
-
-static unsigned int
-chacha20_core(u32 *dst, struct CHACHA20_context_s *ctx)
-{
- return ctx->blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE)
- + ASM_EXTRA_STACK;
+ return (17 * sizeof(u32) + 6 * sizeof(void *));
}
static void
-chacha20_keysetup (CHACHA20_context_t * ctx, const byte * key,
+chacha20_keysetup (CHACHA20_context_t *ctx, const byte *key,
unsigned int keylen)
{
- /* These constants are the little endian encoding of the string
- "expand 32-byte k". For the 128 bit variant, the "32" in that
- string will be fixed up to "16". */
- ctx->input[0] = 0x61707865; /* "apxe" */
- ctx->input[1] = 0x3320646e; /* "3 dn" */
- ctx->input[2] = 0x79622d32; /* "yb-2" */
- ctx->input[3] = 0x6b206574; /* "k et" */
-
- ctx->input[4] = buf_get_le32 (key + 0);
- ctx->input[5] = buf_get_le32 (key + 4);
- ctx->input[6] = buf_get_le32 (key + 8);
- ctx->input[7] = buf_get_le32 (key + 12);
-
+ static const char sigma[16] = "expand 32-byte k";
+ static const char tau[16] = "expand 16-byte k";
+ const char *constants;
+
+ ctx->input[4] = buf_get_le32(key + 0);
+ ctx->input[5] = buf_get_le32(key + 4);
+ ctx->input[6] = buf_get_le32(key + 8);
+ ctx->input[7] = buf_get_le32(key + 12);
if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */
{
- ctx->input[8] = buf_get_le32 (key + 16);
- ctx->input[9] = buf_get_le32 (key + 20);
- ctx->input[10] = buf_get_le32 (key + 24);
- ctx->input[11] = buf_get_le32 (key + 28);
+ key += 16;
+ constants = sigma;
}
else /* 128 bits */
{
- ctx->input[8] = ctx->input[4];
- ctx->input[9] = ctx->input[5];
- ctx->input[10] = ctx->input[6];
- ctx->input[11] = ctx->input[7];
-
- ctx->input[1] -= 0x02000000; /* Change to "1 dn". */
- ctx->input[2] += 0x00000004; /* Change to "yb-6". */
+ constants = tau;
}
+ ctx->input[8] = buf_get_le32(key + 0);
+ ctx->input[9] = buf_get_le32(key + 4);
+ ctx->input[10] = buf_get_le32(key + 8);
+ ctx->input[11] = buf_get_le32(key + 12);
+ ctx->input[0] = buf_get_le32(constants + 0);
+ ctx->input[1] = buf_get_le32(constants + 4);
+ ctx->input[2] = buf_get_le32(constants + 8);
+ ctx->input[3] = buf_get_le32(constants + 12);
}
static void
-chacha20_ivsetup (CHACHA20_context_t * ctx, const byte * iv, size_t ivlen)
+chacha20_ivsetup (CHACHA20_context_t * ctx, const byte *iv, size_t ivlen)
{
if (ivlen == CHACHA20_CTR_SIZE)
{
@@ -367,9 +287,30 @@ chacha20_ivsetup (CHACHA20_context_t * ctx, const byte * iv, size_t ivlen)
}
+static void
+chacha20_setiv (void *context, const byte *iv, size_t ivlen)
+{
+ CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
+
+ /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */
+ if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE
+ && ivlen != CHACHA20_CTR_SIZE)
+ log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen);
+
+ if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE
+ || ivlen == CHACHA20_CTR_SIZE))
+ chacha20_ivsetup (ctx, iv, ivlen);
+ else
+ chacha20_ivsetup (ctx, NULL, 0);
+
+ /* Reset the unused pad bytes counter. */
+ ctx->unused = 0;
+}
+
+
static gcry_err_code_t
-chacha20_do_setkey (CHACHA20_context_t * ctx,
- const byte * key, unsigned int keylen)
+chacha20_do_setkey (CHACHA20_context_t *ctx,
+ const byte *key, unsigned int keylen)
{
static int initialized;
static const char *selftest_failed;
@@ -388,25 +329,15 @@ chacha20_do_setkey (CHACHA20_context_t * ctx,
if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE)
return GPG_ERR_INV_KEYLEN;
-#ifdef USE_SSE2
- ctx->blocks = _gcry_chacha20_amd64_sse2_blocks;
-#else
- ctx->blocks = chacha20_blocks;
-#endif
-
#ifdef USE_SSSE3
- if (features & HWF_INTEL_SSSE3)
- ctx->blocks = _gcry_chacha20_amd64_ssse3_blocks;
+ ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
#endif
#ifdef USE_AVX2
- if (features & HWF_INTEL_AVX2)
- ctx->blocks = _gcry_chacha20_amd64_avx2_blocks;
+ ctx->use_avx2 = (features & HWF_INTEL_AVX2) != 0;
#endif
-#ifdef USE_NEON
- if (features & HWF_ARM_NEON)
- ctx->blocks = _gcry_chacha20_armv7_neon_blocks;
+#ifdef USE_ARMV7_NEON
+ ctx->use_neon = (features & HWF_ARM_NEON) != 0;
#endif
-
(void)features;
chacha20_keysetup (ctx, key, keylen);
@@ -419,7 +350,7 @@ chacha20_do_setkey (CHACHA20_context_t * ctx,
static gcry_err_code_t
-chacha20_setkey (void *context, const byte * key, unsigned int keylen)
+chacha20_setkey (void *context, const byte *key, unsigned int keylen)
{
CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen);
@@ -429,37 +360,19 @@ chacha20_setkey (void *context, const byte * key, unsigned int keylen)
static void
-chacha20_setiv (void *context, const byte * iv, size_t ivlen)
+chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
+ size_t length)
{
+ static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, };
CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
-
- /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */
- if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE
- && ivlen != CHACHA20_CTR_SIZE)
- log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen);
-
- if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE
- || ivlen == CHACHA20_CTR_SIZE))
- chacha20_ivsetup (ctx, iv, ivlen);
- else
- chacha20_ivsetup (ctx, NULL, 0);
-
- /* Reset the unused pad bytes counter. */
- ctx->unused = 0;
-}
-
-
-
-/* Note: This function requires LENGTH > 0. */
-static void
-chacha20_do_encrypt_stream (CHACHA20_context_t * ctx,
- byte * outbuf, const byte * inbuf, size_t length)
-{
unsigned int nburn, burn = 0;
+ if (!length)
+ return;
+
if (ctx->unused)
{
- unsigned char *p = (void *) ctx->pad;
+ unsigned char *p = ctx->pad;
size_t n;
gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
@@ -467,29 +380,73 @@ chacha20_do_encrypt_stream (CHACHA20_context_t * ctx,
n = ctx->unused;
if (n > length)
n = length;
+
buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
length -= n;
outbuf += n;
inbuf += n;
ctx->unused -= n;
+
if (!length)
return;
gcry_assert (!ctx->unused);
}
+#ifdef USE_AVX2
+ if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8;
+ nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf,
+ nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_SSSE3
+ if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+ nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf,
+ nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_ARMV7_NEON
+ if (ctx->use_neon && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+ nburn = _gcry_chacha20_armv7_neon_blocks4(ctx->input, outbuf, inbuf,
+ nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
if (length >= CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
- size_t bytes = nblocks * CHACHA20_BLOCK_SIZE;
- burn = ctx->blocks(ctx->input, inbuf, outbuf, bytes);
- length -= bytes;
- outbuf += bytes;
- inbuf += bytes;
+ nburn = chacha20_blocks(ctx->input, outbuf, inbuf, nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
}
if (length > 0)
{
- nburn = chacha20_core (ctx->pad, ctx);
+ nburn = chacha20_blocks(ctx->input, ctx->pad, zero_pad, 1);
burn = nburn > burn ? nburn : burn;
buf_xor (outbuf, inbuf, ctx->pad, length);
@@ -500,17 +457,6 @@ chacha20_do_encrypt_stream (CHACHA20_context_t * ctx,
}
-static void
-chacha20_encrypt_stream (void *context, byte * outbuf, const byte * inbuf,
- size_t length)
-{
- CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
-
- if (length)
- chacha20_do_encrypt_stream (ctx, outbuf, inbuf, length);
-}
-
-
static const char *
selftest (void)
{