-rw-r--r--  ChangeLog                       |  11
-rw-r--r--  chacha-crypt.c                  | 102
-rw-r--r--  chacha-internal.h               |  14
-rw-r--r--  configure.ac                    |   7
-rw-r--r--  fat-ppc.c                       |   4
-rw-r--r--  powerpc64/fat/chacha-4core.asm  |  36
-rw-r--r--  powerpc64/p7/chacha-4core.asm   | 231
7 files changed, 388 insertions, 17 deletions
diff --git a/ChangeLog b/ChangeLog
index f123ba19..d47c138e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,16 @@
2020-11-30 Niels Möller <nisse@lysator.liu.se>
+ * chacha-crypt.c (_nettle_chacha_crypt_4core)
+ (_nettle_chacha_crypt32_4core): New functions.
+ * chacha-internal.h: Add prototypes for _nettle_chacha_4core and
+ related functions.
+ * configure.ac (asm_nettle_optional_list): Add chacha-4core.asm.
+ * powerpc64/fat/chacha-4core.asm: New file.
+ * powerpc64/p7/chacha-4core.asm: New file.
+ * fat-ppc.c (fat_init): When altivec is available, use
+ _nettle_chacha_crypt_4core and _nettle_chacha_crypt32_4core
+ instead of _2core variants.
+
* chacha-crypt.c (_nettle_chacha_crypt32_3core): Fix bug in
handling of counter; this function should not propagate any carry.
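
The distinction drawn in the last entry carries over to the new functions: chacha_crypt maintains a 64-bit block counter in state[12..13], while chacha_crypt32 uses only state[12], and state[13] belongs to the nonce and must never receive a carry. A minimal C sketch of the two update rules, mirroring the wrapper code in the patch below (not part of the patch itself):

#include <stdint.h>

/* 64-bit counter: a carry out of the low word propagates into state[13]. */
static void
counter_advance (uint32_t *state, uint32_t blocks)
{
  state[12] += blocks;
  state[13] += (state[12] < blocks);
}

/* 32-bit counter: state[13] is nonce material, so no carry is applied. */
static void
counter32_advance (uint32_t *state, uint32_t blocks)
{
  state[12] += blocks;
}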
diff --git a/chacha-crypt.c b/chacha-crypt.c
index a13898f1..d3af5f58 100644
--- a/chacha-crypt.c
+++ b/chacha-crypt.c
@@ -54,17 +54,60 @@
#define CHACHA_ROUNDS 20
-#if HAVE_NATIVE_chacha_3core
+#if HAVE_NATIVE_chacha_4core
+#define _nettle_chacha_crypt_4core chacha_crypt
+#define _nettle_chacha_crypt32_4core chacha_crypt32
+#elif HAVE_NATIVE_chacha_3core
#define _nettle_chacha_crypt_3core chacha_crypt
#define _nettle_chacha_crypt32_3core chacha_crypt32
-#elif HAVE_NATIVE_chacha_2core
-#define _nettle_chacha_crypt_2core chacha_crypt
-#define _nettle_chacha_crypt32_2core chacha_crypt32
-#elif !(HAVE_NATIVE_fat_chacha_3core || HAVE_NATIVE_fat_chacha_2core)
+#elif !(HAVE_NATIVE_fat_chacha_4core || HAVE_NATIVE_fat_chacha_3core)
#define _nettle_chacha_crypt_1core chacha_crypt
#define _nettle_chacha_crypt32_1core chacha_crypt32
#endif
+#if HAVE_NATIVE_chacha_4core || HAVE_NATIVE_fat_chacha_4core
+void
+_nettle_chacha_crypt_4core(struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src)
+{
+ uint32_t x[4*_CHACHA_STATE_LENGTH];
+
+ if (!length)
+ return;
+
+ while (length > 2*CHACHA_BLOCK_SIZE)
+ {
+ _nettle_chacha_4core (x, ctx->state, CHACHA_ROUNDS);
+ ctx->state[12] += 4;
+ ctx->state[13] += (ctx->state[12] < 4);
+ if (length <= 4*CHACHA_BLOCK_SIZE)
+ {
+ memxor3 (dst, src, x, length);
+ return;
+ }
+ memxor3 (dst, src, x, 4*CHACHA_BLOCK_SIZE);
+
+ length -= 4*CHACHA_BLOCK_SIZE;
+ dst += 4*CHACHA_BLOCK_SIZE;
+ src += 4*CHACHA_BLOCK_SIZE;
+ }
+ if (length > CHACHA_BLOCK_SIZE)
+ {
+ _nettle_chacha_2core (x, ctx->state, CHACHA_ROUNDS);
+ ctx->state[12] += 2;
+ ctx->state[13] += (ctx->state[12] < 2);
+ }
+ else
+ {
+ _nettle_chacha_core (x, ctx->state, CHACHA_ROUNDS);
+ ctx->state[13] += (++ctx->state[12] == 0);
+ }
+ memxor3 (dst, src, x, length);
+}
+#endif
+
#if HAVE_NATIVE_chacha_3core || HAVE_NATIVE_fat_chacha_3core
void
_nettle_chacha_crypt_3core(struct chacha_ctx *ctx,
@@ -108,7 +151,7 @@ _nettle_chacha_crypt_3core(struct chacha_ctx *ctx,
}
#endif
-#if HAVE_NATIVE_chacha_2core || HAVE_NATIVE_fat_chacha_2core
+#if 0
void
_nettle_chacha_crypt_2core(struct chacha_ctx *ctx,
size_t length,
@@ -143,7 +186,7 @@ _nettle_chacha_crypt_2core(struct chacha_ctx *ctx,
}
#endif
-#if !(HAVE_NATIVE_chacha_3core || HAVE_NATIVE_chacha_2core)
+#if !(HAVE_NATIVE_chacha_4core || HAVE_NATIVE_chacha_3core)
void
_nettle_chacha_crypt_1core(struct chacha_ctx *ctx,
size_t length,
@@ -177,6 +220,47 @@ _nettle_chacha_crypt_1core(struct chacha_ctx *ctx,
}
#endif
+#if HAVE_NATIVE_chacha_4core || HAVE_NATIVE_fat_chacha_4core
+void
+_nettle_chacha_crypt32_4core(struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src)
+{
+ uint32_t x[4*_CHACHA_STATE_LENGTH];
+
+ if (!length)
+ return;
+
+ while (length > 2*CHACHA_BLOCK_SIZE)
+ {
+ _nettle_chacha_4core32 (x, ctx->state, CHACHA_ROUNDS);
+ ctx->state[12] += 4;
+ if (length <= 4*CHACHA_BLOCK_SIZE)
+ {
+ memxor3 (dst, src, x, length);
+ return;
+ }
+ memxor3 (dst, src, x, 4*CHACHA_BLOCK_SIZE);
+
+ length -= 4*CHACHA_BLOCK_SIZE;
+ dst += 4*CHACHA_BLOCK_SIZE;
+ src += 4*CHACHA_BLOCK_SIZE;
+ }
+ if (length > CHACHA_BLOCK_SIZE)
+ {
+ _nettle_chacha_2core32 (x, ctx->state, CHACHA_ROUNDS);
+ ctx->state[12] += 2;
+ }
+ else
+ {
+ _nettle_chacha_core (x, ctx->state, CHACHA_ROUNDS);
+ ++ctx->state[12];
+ }
+ memxor3 (dst, src, x, length);
+}
+#endif
+
#if HAVE_NATIVE_chacha_3core || HAVE_NATIVE_fat_chacha_3core
void
_nettle_chacha_crypt32_3core(struct chacha_ctx *ctx,
@@ -218,7 +302,7 @@ _nettle_chacha_crypt32_3core(struct chacha_ctx *ctx,
}
#endif
-#if HAVE_NATIVE_chacha_2core || HAVE_NATIVE_fat_chacha_2core
+#if 0
void
_nettle_chacha_crypt32_2core(struct chacha_ctx *ctx,
size_t length,
@@ -252,7 +336,7 @@ _nettle_chacha_crypt32_2core(struct chacha_ctx *ctx,
}
#endif
-#if !(HAVE_NATIVE_chacha_3core || HAVE_NATIVE_chacha_2core)
+#if !(HAVE_NATIVE_chacha_4core || HAVE_NATIVE_chacha_3core)
void
_nettle_chacha_crypt32_1core(struct chacha_ctx *ctx,
size_t length,
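
For orientation, the new wrappers are reached through nettle's ordinary public ChaCha interface; a minimal usage sketch follows (key and nonce bytes are arbitrary placeholders, and whether the 4-block path is actually taken depends on the build and CPU):

#include <nettle/chacha.h>

/* Encrypt four blocks (256 bytes); on a build where chacha_crypt
   resolves to _nettle_chacha_crypt_4core, this exercises the
   4-way code path. */
static void
encrypt_example (uint8_t *dst, const uint8_t *src)
{
  static const uint8_t key[CHACHA_KEY_SIZE] = { 0 };     /* placeholder */
  static const uint8_t nonce[CHACHA_NONCE_SIZE] = { 0 }; /* placeholder */
  struct chacha_ctx ctx;

  chacha_set_key (&ctx, key);
  chacha_set_nonce (&ctx, nonce);
  chacha_crypt (&ctx, 4 * CHACHA_BLOCK_SIZE, dst, src);
}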
diff --git a/chacha-internal.h b/chacha-internal.h
index d92a6779..897fdc16 100644
--- a/chacha-internal.h
+++ b/chacha-internal.h
@@ -56,19 +56,25 @@ void
_nettle_chacha_3core32(uint32_t *dst, const uint32_t *src, unsigned rounds);
void
+_nettle_chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds);
+
+void
+_nettle_chacha_4core32(uint32_t *dst, const uint32_t *src, unsigned rounds);
+
+void
_nettle_chacha_crypt_1core(struct chacha_ctx *ctx,
size_t length,
uint8_t *dst,
const uint8_t *src);
void
-_nettle_chacha_crypt_2core(struct chacha_ctx *ctx,
+_nettle_chacha_crypt_3core(struct chacha_ctx *ctx,
size_t length,
uint8_t *dst,
const uint8_t *src);
void
-_nettle_chacha_crypt_3core(struct chacha_ctx *ctx,
+_nettle_chacha_crypt_4core(struct chacha_ctx *ctx,
size_t length,
uint8_t *dst,
const uint8_t *src);
@@ -80,13 +86,13 @@ _nettle_chacha_crypt32_1core(struct chacha_ctx *ctx,
const uint8_t *src);
void
-_nettle_chacha_crypt32_2core(struct chacha_ctx *ctx,
+_nettle_chacha_crypt32_3core(struct chacha_ctx *ctx,
size_t length,
uint8_t *dst,
const uint8_t *src);
void
-_nettle_chacha_crypt32_3core(struct chacha_ctx *ctx,
+_nettle_chacha_crypt32_4core(struct chacha_ctx *ctx,
size_t length,
uint8_t *dst,
const uint8_t *src);
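
The prototypes imply that dst of _nettle_chacha_4core holds four consecutive 16-word output blocks, for counter values n through n+3; this is how the crypt wrappers above consume it. A hedged consistency-check sketch against the existing single-block core, assuming a build where the 4core symbol is present and that _nettle_chacha_core is declared via chacha-internal.h:

#include <assert.h>
#include <string.h>
#include "chacha-internal.h"

/* Sketch: _nettle_chacha_4core should agree with four calls to
   _nettle_chacha_core on successive counter values. */
static void
check_4core (const uint32_t state[_CHACHA_STATE_LENGTH])
{
  uint32_t x[4*_CHACHA_STATE_LENGTH];
  uint32_t s[_CHACHA_STATE_LENGTH];
  uint32_t ref[_CHACHA_STATE_LENGTH];
  unsigned i;

  _nettle_chacha_4core (x, state, 20);

  memcpy (s, state, sizeof (s));
  for (i = 0; i < 4; i++)
    {
      _nettle_chacha_core (ref, s, 20);
      assert (memcmp (x + i*_CHACHA_STATE_LENGTH, ref, sizeof (ref)) == 0);
      s[13] += (++s[12] == 0);	/* advance the 64-bit block counter */
    }
}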
diff --git a/configure.ac b/configure.ac
index 6fafaa77..776a9a61 100644
--- a/configure.ac
+++ b/configure.ac
@@ -499,8 +499,9 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
# Assembler files which generate additional object files if they are used.
asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \
aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
- chacha-2core.asm chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \
- salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
+ chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \
+ salsa20-2core.asm salsa20-core-internal-2.asm \
+ sha1-compress-2.asm sha256-compress-2.asm \
sha3-permute-2.asm sha512-compress-2.asm \
umac-nh-n-2.asm umac-nh-2.asm"
@@ -609,8 +610,10 @@ AH_VERBATIM([HAVE_NATIVE],
#undef HAVE_NATIVE_chacha_core
#undef HAVE_NATIVE_chacha_2core
#undef HAVE_NATIVE_chacha_3core
+#undef HAVE_NATIVE_chacha_4core
#undef HAVE_NATIVE_fat_chacha_2core
#undef HAVE_NATIVE_fat_chacha_3core
+#undef HAVE_NATIVE_fat_chacha_4core
#undef HAVE_NATIVE_ecc_curve25519_modp
#undef HAVE_NATIVE_ecc_curve448_modp
#undef HAVE_NATIVE_ecc_secp192r1_modp
diff --git a/fat-ppc.c b/fat-ppc.c
index 8d4a703d..847af14f 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -214,8 +214,8 @@ fat_init (void)
if (verbose)
fprintf (stderr, "libnettle: enabling altivec code.\n");
_nettle_chacha_core_vec = _nettle_chacha_core_altivec;
- nettle_chacha_crypt_vec = _nettle_chacha_crypt_2core;
- nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_2core;
+ nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core;
+ nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core;
}
else
{
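
nettle_chacha_crypt_vec and nettle_chacha_crypt32_vec are function pointers that fat_init fills in once after probing the CPU. The sketch below is an illustrative reduction of that pattern; the names are hypothetical and do not reproduce nettle's actual fat macro machinery:

#include <nettle/chacha.h>
#include "chacha-internal.h"

/* Illustrative sketch of fat dispatch: a pointer selected at startup,
   then called through on every encryption request. */
typedef void chacha_crypt_func (struct chacha_ctx *ctx, size_t length,
                                uint8_t *dst, const uint8_t *src);

static chacha_crypt_func *crypt_vec;   /* hypothetical pointer name */

static void
init_sketch (int have_altivec)
{
  crypt_vec = have_altivec
    ? _nettle_chacha_crypt_4core   /* 4-way vector implementation */
    : _nettle_chacha_crypt_1core;  /* portable fallback */
}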
diff --git a/powerpc64/fat/chacha-4core.asm b/powerpc64/fat/chacha-4core.asm
new file mode 100644
index 00000000..bd6be1be
--- /dev/null
+++ b/powerpc64/fat/chacha-4core.asm
@@ -0,0 +1,36 @@
+C powerpc64/fat/chacha-4core.asm
+
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_4core) picked up by configure
+
+include_src(`powerpc64/p7/chacha-4core.asm')
diff --git a/powerpc64/p7/chacha-4core.asm b/powerpc64/p7/chacha-4core.asm
new file mode 100644
index 00000000..49a801be
--- /dev/null
+++ b/powerpc64/p7/chacha-4core.asm
@@ -0,0 +1,231 @@
+C powerpc64/p7/chacha-4core.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+C Arguments
+define(`DST', `r3')
+define(`SRC', `r4')
+define(`ROUNDS', `r5')
+
+C Working state in v0,...,v15
+
+define(`ROT16', v16)
+define(`ROT12', v17)
+define(`ROT8', v18)
+define(`ROT7', v19)
+
+C During the loop, used to save the original values of the last 4 words
+C of each block. Also used as temporaries for the transpose.
+define(`T0', `v20')
+define(`T1', `v21')
+define(`T2', `v22')
+define(`T3', `v23')
+
+C Quarter round, the body of the main round loop
+define(`QR',`
+ vadduwm $1, $1, $2
+ vxor $4, $4, $1
+ vrlw $4, $4, ROT16
+ vadduwm $3, $3, $4
+ vxor $2, $2, $3
+ vrlw $2, $2, ROT12
+ vadduwm $1, $1, $2
+ vxor $4, $4, $1
+ vrlw $4, $4, ROT8
+ vadduwm $3, $3, $4
+ vxor $2, $2, $3
+ vrlw $2, $2, ROT7
+ ')
+
+define(`TRANSPOSE',`
+ vmrghw T0, $1, $3 C A0 A2 B0 B2
+ vmrghw T1, $2, $4 C A1 A3 B1 B3
+ vmrglw T2, $1, $3 C C0 C2 D0 D2
+ vmrglw T3, $2, $4 C C1 C3 D1 D3
+
+ vmrghw $1, T0, T1 C A0 A1 A2 A3
+ vmrglw $2, T0, T1 C B0 B1 B2 B3
+ vmrghw $3, T2, T3 C C0 C1 C2 C3
+ vmrglw $4, T2, T3 C D0 D1 D2 D3
+')
+
+ C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_chacha_4core)
+
+ vspltisw T2, 1 C Apply counter carries
+
+.Lshared_entry:
+
+ li r6, 0x10 C set up some...
+ li r7, 0x20 C ...useful...
+ li r8, 0x30 C ...offsets
+
+ addi SP, SP, -0x40 C Save callee-save registers
+ stvx v20, 0, SP
+ stvx v21, r6, SP
+ stvx v22, r7, SP
+ stvx v23, r8, SP
+
+ vspltisw ROT16, -16 C -16 instead of 16 actually works!
+ vspltisw ROT12, 12
+ vspltisw ROT8, 8
+ vspltisw ROT7, 7
+
+C Load state and splat
+ lxvw4x VSR(v0), 0, SRC C "expa ..."
+ lxvw4x VSR(v4), r6, SRC C key
+ lxvw4x VSR(v8), r7, SRC C key
+ lxvw4x VSR(v12), r8, SRC C cnt and nonce
+
+ vspltw v1, v0, 1
+ vspltw v2, v0, 2
+ vspltw v3, v0, 3
+ vspltw v0, v0, 0
+ vspltw v5, v4, 1
+ vspltw v6, v4, 2
+ vspltw v7, v4, 3
+ vspltw v4, v4, 0
+ vspltw v9, v8, 1
+ vspltw v10, v8, 2
+ vspltw v11, v8, 3
+ vspltw v8, v8, 0
+ vspltw v13, v12, 1
+ vspltw v14, v12, 2
+ vspltw v15, v12, 3
+ vspltw v12, v12, 0
+
+ ld r9, .Lcnts@got(r2)
+ lxvw4x VSR(T0), 0, r9 C increments
+ vaddcuw T1, v12, T0 C compute carry-out
+ vadduwm v12, v12, T0 C low adds
+ vand T1, T1, T2 C discard carries for 32-bit counter variant
+ vadduwm v13, v13, T1 C apply carries
+
+ C Save the original values of the last four state words (counter and nonce) for all four blocks.
+ vor T0, v12, v12
+ vor T1, v13, v13
+ vor T2, v14, v14
+ vor T3, v15, v15
+
+ srdi ROUNDS, ROUNDS, 1
+ mtctr ROUNDS
+.Loop:
+ QR(v0, v4, v8, v12)
+ QR(v1, v5, v9, v13)
+ QR(v2, v6, v10, v14)
+ QR(v3, v7, v11, v15)
+ QR(v0, v5, v10, v15)
+ QR(v1, v6, v11, v12)
+ QR(v2, v7, v8, v13)
+ QR(v3, v4, v9, v14)
+ bdnz .Loop
+
+ C Add in saved original words, including counters, before
+ C transpose.
+ vadduwm v12, v12, T0
+ vadduwm v13, v13, T1
+ vadduwm v14, v14, T2
+ vadduwm v15, v15, T3
+
+ TRANSPOSE(v0, v1, v2, v3)
+ TRANSPOSE(v4, v5, v6, v7)
+ TRANSPOSE(v8, v9, v10, v11)
+ TRANSPOSE(v12, v13, v14, v15)
+
+ lxvw4x VSR(T0), 0, SRC
+ lxvw4x VSR(T1), r6, SRC
+ lxvw4x VSR(T2), r7, SRC
+
+ vadduwm v0, v0, T0
+ vadduwm v1, v1, T0
+ vadduwm v2, v2, T0
+ vadduwm v3, v3, T0
+
+ vadduwm v4, v4, T1
+ vadduwm v5, v5, T1
+ vadduwm v6, v6, T1
+ vadduwm v7, v7, T1
+
+ vadduwm v8, v8, T2
+ vadduwm v9, v9, T2
+ vadduwm v10, v10, T2
+ vadduwm v11, v11, T2
+
+ stxvw4x VSR(v0), 0, DST
+ stxvw4x VSR(v4), r6, DST
+ stxvw4x VSR(v8), r7, DST
+ stxvw4x VSR(v12), r8, DST
+
+ addi DST, DST, 64
+
+ stxvw4x VSR(v1), 0, DST
+ stxvw4x VSR(v5), r6, DST
+ stxvw4x VSR(v9), r7, DST
+ stxvw4x VSR(v13), r8, DST
+
+ addi DST, DST, 64
+
+ stxvw4x VSR(v2), 0, DST
+ stxvw4x VSR(v6), r6, DST
+ stxvw4x VSR(v10), r7, DST
+ stxvw4x VSR(v14), r8, DST
+
+ addi DST, DST, 64
+
+ stxvw4x VSR(v3), 0, DST
+ stxvw4x VSR(v7), r6, DST
+ stxvw4x VSR(v11), r7, DST
+ stxvw4x VSR(v15), r8, DST
+
+ C Restore callee-save registers
+ lvx v20, 0, SP
+ lvx v21, r6, SP
+ lvx v22, r7, SP
+ lvx v23, r8, SP
+ addi SP, SP, 0x40
+
+ blr
+EPILOGUE(_nettle_chacha_4core)
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_chacha_4core32)
+ vspltisw T2, 0 C Ignore counter carries
+ b .Lshared_entry
+EPILOGUE(_nettle_chacha_4core32)
+
+ .section .rodata
+ ALIGN(16)
+.Lcnts: .long 0,1,2,3 C increments
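
For readers not fluent in AltiVec mnemonics: the QR macro above is the standard ChaCha quarter round, applied to four blocks at once with one block per vector lane (vadduwm is a 32-bit add, vxor an exclusive or, vrlw a rotate left). A scalar C reference of the same sequence, for comparison only:

#include <stdint.h>

#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

/* One ChaCha quarter round on the words a, b, c, d; the QR macro
   performs exactly this sequence on whole vectors. */
static void
quarter_round (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
  *a += *b; *d ^= *a; *d = ROTL32 (*d, 16);
  *c += *d; *b ^= *c; *b = ROTL32 (*b, 12);
  *a += *b; *d ^= *a; *d = ROTL32 (*d, 8);
  *c += *d; *b ^= *c; *b = ROTL32 (*b, 7);
}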