diff options
-rw-r--r-- | ChangeLog | 11 | ||||
-rw-r--r-- | chacha-crypt.c | 102 | ||||
-rw-r--r-- | chacha-internal.h | 14 | ||||
-rw-r--r-- | configure.ac | 7 | ||||
-rw-r--r-- | fat-ppc.c | 4 | ||||
-rw-r--r-- | powerpc64/fat/chacha-4core.asm | 36 | ||||
-rw-r--r-- | powerpc64/p7/chacha-4core.asm | 231 |
7 files changed, 388 insertions, 17 deletions
@@ -1,5 +1,16 @@ 2020-11-30 Niels Möller <nisse@lysator.liu.se> + * chacha-crypt.c: (_nettle_chacha_crypt_4core) + (_nettle_chacha_crypt32_4core): New functions. + * chacha-internal.h: Add prototypes for _nettle_chacha_4core and + related functions. + * configure.ac (asm_nettle_optional_list): Add chacha-4core.asm. + * powerpc64/fat/chacha-4core.asm: New file. + * powerpc64/p7/chacha-4core.asm: New file. + * fat-ppc.c (fat_init): When altivec is available, use + _nettle_chacha_crypt_4core and _nettle_chacha_crypt32_4core + instead of _2core variants. + * chacha-crypt.c (_nettle_chacha_crypt32_3core): Fix bug in handling of counter; this function should not propagate any carry. diff --git a/chacha-crypt.c b/chacha-crypt.c index a13898f1..d3af5f58 100644 --- a/chacha-crypt.c +++ b/chacha-crypt.c @@ -54,17 +54,60 @@ #define CHACHA_ROUNDS 20 -#if HAVE_NATIVE_chacha_3core +#if HAVE_NATIVE_chacha_4core +#define _nettle_chacha_crypt_4core chacha_crypt +#define _nettle_chacha_crypt32_4core chacha_crypt32 +#elif HAVE_NATIVE_chacha_3core #define _nettle_chacha_crypt_3core chacha_crypt #define _nettle_chacha_crypt32_3core chacha_crypt32 -#elif HAVE_NATIVE_chacha_2core -#define _nettle_chacha_crypt_2core chacha_crypt -#define _nettle_chacha_crypt32_2core chacha_crypt32 -#elif !(HAVE_NATIVE_fat_chacha_3core || HAVE_NATIVE_fat_chacha_2core) +#elif !(HAVE_NATIVE_fat_chacha_4core || HAVE_NATIVE_fat_chacha_3core) #define _nettle_chacha_crypt_1core chacha_crypt #define _nettle_chacha_crypt32_1core chacha_crypt32 #endif +#if HAVE_NATIVE_chacha_4core || HAVE_NATIVE_fat_chacha_4core +void +_nettle_chacha_crypt_4core(struct chacha_ctx *ctx, + size_t length, + uint8_t *dst, + const uint8_t *src) +{ + uint32_t x[4*_CHACHA_STATE_LENGTH]; + + if (!length) + return; + + while (length > 2*CHACHA_BLOCK_SIZE) + { + _nettle_chacha_4core (x, ctx->state, CHACHA_ROUNDS); + ctx->state[12] += 4; + ctx->state[13] += (ctx->state[12] < 4); + if (length <= 4*CHACHA_BLOCK_SIZE) + { + memxor3 (dst, src, x, 
length); + return; + } + memxor3 (dst, src, x, 4*CHACHA_BLOCK_SIZE); + + length -= 4*CHACHA_BLOCK_SIZE; + dst += 4*CHACHA_BLOCK_SIZE; + src += 4*CHACHA_BLOCK_SIZE; + } + if (length > CHACHA_BLOCK_SIZE) + { + _nettle_chacha_2core (x, ctx->state, CHACHA_ROUNDS); + ctx->state[12] += 2; + ctx->state[13] += (ctx->state[12] < 2); + } + else + { + _nettle_chacha_core (x, ctx->state, CHACHA_ROUNDS); + ctx->state[13] += (++ctx->state[12] == 0); + } + memxor3 (dst, src, x, length); +} +#endif + #if HAVE_NATIVE_chacha_3core || HAVE_NATIVE_fat_chacha_3core void _nettle_chacha_crypt_3core(struct chacha_ctx *ctx, @@ -108,7 +151,7 @@ _nettle_chacha_crypt_3core(struct chacha_ctx *ctx, } #endif -#if HAVE_NATIVE_chacha_2core || HAVE_NATIVE_fat_chacha_2core +#if 0 void _nettle_chacha_crypt_2core(struct chacha_ctx *ctx, size_t length, @@ -143,7 +186,7 @@ _nettle_chacha_crypt_2core(struct chacha_ctx *ctx, } #endif -#if !(HAVE_NATIVE_chacha_3core || HAVE_NATIVE_chacha_2core) +#if !(HAVE_NATIVE_chacha_4core || HAVE_NATIVE_chacha_3core) void _nettle_chacha_crypt_1core(struct chacha_ctx *ctx, size_t length, @@ -177,6 +220,47 @@ _nettle_chacha_crypt_1core(struct chacha_ctx *ctx, } #endif +#if HAVE_NATIVE_chacha_4core || HAVE_NATIVE_fat_chacha_4core +void +_nettle_chacha_crypt32_4core(struct chacha_ctx *ctx, + size_t length, + uint8_t *dst, + const uint8_t *src) +{ + uint32_t x[4*_CHACHA_STATE_LENGTH]; + + if (!length) + return; + + while (length > 2*CHACHA_BLOCK_SIZE) + { + _nettle_chacha_4core32 (x, ctx->state, CHACHA_ROUNDS); + ctx->state[12] += 4; + if (length <= 4*CHACHA_BLOCK_SIZE) + { + memxor3 (dst, src, x, length); + return; + } + memxor3 (dst, src, x, 4*CHACHA_BLOCK_SIZE); + + length -= 4*CHACHA_BLOCK_SIZE; + dst += 4*CHACHA_BLOCK_SIZE; + src += 4*CHACHA_BLOCK_SIZE; + } + if (length > CHACHA_BLOCK_SIZE) + { + _nettle_chacha_2core32 (x, ctx->state, CHACHA_ROUNDS); + ctx->state[12] += 2; + } + else + { + _nettle_chacha_core (x, ctx->state, CHACHA_ROUNDS); + ++ctx->state[12]; + } + 
memxor3 (dst, src, x, length); +} +#endif + #if HAVE_NATIVE_chacha_3core || HAVE_NATIVE_fat_chacha_3core void _nettle_chacha_crypt32_3core(struct chacha_ctx *ctx, @@ -218,7 +302,7 @@ _nettle_chacha_crypt32_3core(struct chacha_ctx *ctx, } #endif -#if HAVE_NATIVE_chacha_2core || HAVE_NATIVE_fat_chacha_2core +#if 0 void _nettle_chacha_crypt32_2core(struct chacha_ctx *ctx, size_t length, @@ -252,7 +336,7 @@ _nettle_chacha_crypt32_2core(struct chacha_ctx *ctx, } #endif -#if !(HAVE_NATIVE_chacha_3core || HAVE_NATIVE_chacha_2core) +#if !(HAVE_NATIVE_chacha_4core || HAVE_NATIVE_chacha_3core) void _nettle_chacha_crypt32_1core(struct chacha_ctx *ctx, size_t length, diff --git a/chacha-internal.h b/chacha-internal.h index d92a6779..897fdc16 100644 --- a/chacha-internal.h +++ b/chacha-internal.h @@ -56,19 +56,25 @@ void _nettle_chacha_3core32(uint32_t *dst, const uint32_t *src, unsigned rounds); void +_nettle_chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds); + +void +_nettle_chacha_4core32(uint32_t *dst, const uint32_t *src, unsigned rounds); + +void _nettle_chacha_crypt_1core(struct chacha_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src); void -_nettle_chacha_crypt_2core(struct chacha_ctx *ctx, +_nettle_chacha_crypt_3core(struct chacha_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src); void -_nettle_chacha_crypt_3core(struct chacha_ctx *ctx, +_nettle_chacha_crypt_4core(struct chacha_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src); @@ -80,13 +86,13 @@ _nettle_chacha_crypt32_1core(struct chacha_ctx *ctx, const uint8_t *src); void -_nettle_chacha_crypt32_2core(struct chacha_ctx *ctx, +_nettle_chacha_crypt32_3core(struct chacha_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src); void -_nettle_chacha_crypt32_3core(struct chacha_ctx *ctx, +_nettle_chacha_crypt32_4core(struct chacha_ctx *ctx, size_t length, uint8_t *dst, const uint8_t *src); diff --git a/configure.ac b/configure.ac index 6fafaa77..776a9a61 100644 --- 
a/configure.ac +++ b/configure.ac @@ -499,8 +499,9 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ # Assembler files which generate additional object files if they are used. asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ - chacha-2core.asm chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \ - salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ + chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \ + salsa20-2core.asm salsa20-core-internal-2.asm \ + sha1-compress-2.asm sha256-compress-2.asm \ sha3-permute-2.asm sha512-compress-2.asm \ umac-nh-n-2.asm umac-nh-2.asm" @@ -609,8 +610,10 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_chacha_core #undef HAVE_NATIVE_chacha_2core #undef HAVE_NATIVE_chacha_3core +#undef HAVE_NATIVE_chacha_4core #undef HAVE_NATIVE_fat_chacha_2core #undef HAVE_NATIVE_fat_chacha_3core +#undef HAVE_NATIVE_fat_chacha_4core #undef HAVE_NATIVE_ecc_curve25519_modp #undef HAVE_NATIVE_ecc_curve448_modp #undef HAVE_NATIVE_ecc_secp192r1_modp @@ -214,8 +214,8 @@ fat_init (void) if (verbose) fprintf (stderr, "libnettle: enabling altivec code.\n"); _nettle_chacha_core_vec = _nettle_chacha_core_altivec; - nettle_chacha_crypt_vec = _nettle_chacha_crypt_2core; - nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_2core; + nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core; + nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core; } else { diff --git a/powerpc64/fat/chacha-4core.asm b/powerpc64/fat/chacha-4core.asm new file mode 100644 index 00000000..bd6be1be --- /dev/null +++ b/powerpc64/fat/chacha-4core.asm @@ -0,0 +1,36 @@ +C powerpc64/fat/chacha-4core.asm + + +ifelse(` + Copyright (C) 2020 Niels Möller + + This file is part of GNU Nettle. 
+ + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl PROLOGUE(_nettle_fat_chacha_4core) picked up by configure + +include_src(`powerpc64/p7/chacha-4core.asm') diff --git a/powerpc64/p7/chacha-4core.asm b/powerpc64/p7/chacha-4core.asm new file mode 100644 index 00000000..49a801be --- /dev/null +++ b/powerpc64/p7/chacha-4core.asm @@ -0,0 +1,231 @@ +C powerpc64/p7/chacha-4core.asm + +ifelse(` + Copyright (C) 2020 Niels Möller and Torbjörn Granlund + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +C Arguments +define(`DST', `r3') +define(`SRC', `r4') +define(`ROUNDS', `r5') + +C Working state in v0,...,v15 + +define(`ROT16', v16) +define(`ROT12', v17) +define(`ROT8', v18) +define(`ROT7', v19) + +C During the loop, used to save the original values for last 4 words +C of each block. Also used as temporaries for transpose. +define(`T0', `v20') +define(`T1', `v21') +define(`T2', `v22') +define(`T3', `v23') + +C Main loop for round +define(`QR',` + vadduwm $1, $1, $2 + vxor $4, $4, $1 + vrlw $4, $4, ROT16 + vadduwm $3, $3, $4 + vxor $2, $2, $3 + vrlw $2, $2, ROT12 + vadduwm $1, $1, $2 + vxor $4, $4, $1 + vrlw $4, $4, ROT8 + vadduwm $3, $3, $4 + vxor $2, $2, $3 + vrlw $2, $2, ROT7 + ') + +define(`TRANSPOSE',` + vmrghw T0, $1, $3 C A0 A2 B0 B2 + vmrghw T1, $2, $4 C A1 A3 B1 B3 + vmrglw T2, $1, $3 C C0 C2 D0 D2 + vmrglw T3, $2, $4 C C1 C3 D1 D3 + + vmrghw $1, T0, T1 C A0 A1 A2 A3 + vmrglw $2, T0, T1 C B0 B1 B2 B3 + vmrghw $3, T2, T3 C C0 C1 C2 C3 + vmrglw $4, T2, T3 C D0 D1 D2 D3 +') + + C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds) +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_chacha_4core) + + vspltisw T2, 1 C Apply counter carries + +.Lshared_entry: + + li r6, 0x10 C set up some... + li r7, 0x20 C ...useful... + li r8, 0x30 C ...offsets + + addi SP, SP, -0x40 C Save callee-save registers + stvx v20, 0, SP + stvx v21, r6, SP + stvx v22, r7, SP + stvx v23, r8, SP + + vspltisw ROT16, -16 C -16 instead of 16 actually works! + vspltisw ROT12, 12 + vspltisw ROT8, 8 + vspltisw ROT7, 7 + +C Load state and splat + lxvw4x VSR(v0), 0, SRC C "expa ..."
+ lxvw4x VSR(v4), r6, SRC C key + lxvw4x VSR(v8), r7, SRC C key + lxvw4x VSR(v12), r8, SRC C cnt and nonce + + vspltw v1, v0, 1 + vspltw v2, v0, 2 + vspltw v3, v0, 3 + vspltw v0, v0, 0 + vspltw v5, v4, 1 + vspltw v6, v4, 2 + vspltw v7, v4, 3 + vspltw v4, v4, 0 + vspltw v9, v8, 1 + vspltw v10, v8, 2 + vspltw v11, v8, 3 + vspltw v8, v8, 0 + vspltw v13, v12, 1 + vspltw v14, v12, 2 + vspltw v15, v12, 3 + vspltw v12, v12, 0 + + ld r9, .Lcnts@got(r2) + lxvw4x VSR(T0), 0, r9 C increments + vaddcuw T1, v12, T0 C compute carry-out + vadduwm v12, v12, T0 C low adds + vand T1, T1, T2 C discard carries for 32-bit counter variant + vadduwm v13, v13, T1 C apply carries + + C Save all 4x4 of the last words. + vor T0, v12, v12 + vor T1, v13, v13 + vor T2, v14, v14 + vor T3, v15, v15 + + srdi ROUNDS, ROUNDS, 1 + mtctr ROUNDS +.Loop: + QR(v0, v4, v8, v12) + QR(v1, v5, v9, v13) + QR(v2, v6, v10, v14) + QR(v3, v7, v11, v15) + QR(v0, v5, v10, v15) + QR(v1, v6, v11, v12) + QR(v2, v7, v8, v13) + QR(v3, v4, v9, v14) + bdnz .Loop + + C Add in saved original words, including counters, before + C transpose. 
+ vadduwm v12, v12, T0 + vadduwm v13, v13, T1 + vadduwm v14, v14, T2 + vadduwm v15, v15, T3 + + TRANSPOSE(v0, v1,v2, v3) + TRANSPOSE(v4, v5, v6, v7) + TRANSPOSE(v8, v9, v10, v11) + TRANSPOSE(v12, v13, v14, v15) + + lxvw4x VSR(T0), 0, SRC + lxvw4x VSR(T1), r6, SRC + lxvw4x VSR(T2), r7, SRC + + vadduwm v0, v0, T0 + vadduwm v1, v1, T0 + vadduwm v2, v2, T0 + vadduwm v3, v3, T0 + + vadduwm v4, v4, T1 + vadduwm v5, v5, T1 + vadduwm v6, v6, T1 + vadduwm v7, v7, T1 + + vadduwm v8, v8, T2 + vadduwm v9, v9, T2 + vadduwm v10, v10, T2 + vadduwm v11, v11, T2 + + stxvw4x VSR(v0), 0, DST + stxvw4x VSR(v4), r6, DST + stxvw4x VSR(v8), r7, DST + stxvw4x VSR(v12), r8, DST + + addi DST, DST, 64 + + stxvw4x VSR(v1), 0, DST + stxvw4x VSR(v5), r6, DST + stxvw4x VSR(v9), r7, DST + stxvw4x VSR(v13), r8, DST + + addi DST, DST, 64 + + stxvw4x VSR(v2), 0, DST + stxvw4x VSR(v6), r6, DST + stxvw4x VSR(v10), r7, DST + stxvw4x VSR(v14), r8, DST + + addi DST, DST, 64 + + stxvw4x VSR(v3), 0, DST + stxvw4x VSR(v7), r6, DST + stxvw4x VSR(v11), r7, DST + stxvw4x VSR(v15), r8, DST + + C Restore callee-save registers + lvx v20, 0, SP + lvx v21, r6, SP + lvx v22, r7, SP + lvx v23, r8, SP + addi SP, SP, 0x40 + + blr +EPILOGUE(_nettle_chacha_4core) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_chacha_4core32) + vspltisw T2, 0 C Ignore counter carries + b .Lshared_entry +EPILOGUE(_nettle_chacha_4core32) + + .section .rodata + ALIGN(16) +.Lcnts: .long 0,1,2,3 C increments |