diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2022-04-15 12:23:31 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2022-04-24 01:44:16 +0300 |
commit | 4e6896eb9fce74908e15e085da00edfed0fa1923 (patch) | |
tree | 76ca66ee88af17ca1d562465db469c32d773e66c | |
parent | 3410d40996d8f7377935192ebecf4cad66688b25 (diff) | |
download | libgcrypt-4e6896eb9fce74908e15e085da00edfed0fa1923.tar.gz |
Add GFNI/AVX2 implementation of Camellia
* cipher/Makefile.am: Add "camellia-gfni-avx2-amd64.S".
* cipher/camellia-aesni-avx2-amd64.h [CAMELLIA_GFNI_BUILD]: Add GFNI
support.
* cipher/camellia-gfni-avx2-amd64.S: New.
* cipher/camellia-glue.c (USE_GFNI_AVX2): New.
(CAMELLIA_context) [USE_AESNI_AVX2]: New member "use_gfni_avx2".
[USE_GFNI_AVX2] (_gcry_camellia_gfni_avx2_ctr_enc)
(_gcry_camellia_gfni_avx2_cbc_dec, _gcry_camellia_gfni_avx2_cfb_dec)
(_gcry_camellia_gfni_avx2_ocb_enc, _gcry_camellia_gfni_avx2_ocb_dec)
(_gcry_camellia_gfni_avx2_ocb_auth): New.
(camellia_setkey) [USE_GFNI_AVX2]: Enable GFNI if supported by HW.
(_gcry_camellia_ctr_enc) [USE_GFNI_AVX2]: Add GFNI support.
(_gcry_camellia_cbc_dec) [USE_GFNI_AVX2]: Add GFNI support.
(_gcry_camellia_cfb_dec) [USE_GFNI_AVX2]: Add GFNI support.
(_gcry_camellia_ocb_crypt) [USE_GFNI_AVX2]: Add GFNI support.
(_gcry_camellia_ocb_auth) [USE_GFNI_AVX2]: Add GFNI support.
* configure.ac: Add "camellia-gfni-avx2-amd64.lo".
--
Benchmark on Intel Core i3-1115G4 (tigerlake):
Before (VAES/AVX2 implementation):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC dec | 0.579 ns/B 1646 MiB/s 2.37 c/B 4090
CFB dec | 0.579 ns/B 1648 MiB/s 2.37 c/B 4089
CTR enc | 0.586 ns/B 1628 MiB/s 2.40 c/B 4090
CTR dec | 0.587 ns/B 1626 MiB/s 2.40 c/B 4090
OCB enc | 0.607 ns/B 1570 MiB/s 2.48 c/B 4089
OCB dec | 0.611 ns/B 1561 MiB/s 2.50 c/B 4089
OCB auth | 0.602 ns/B 1585 MiB/s 2.46 c/B 4089
After (~80% faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC dec | 0.299 ns/B 3186 MiB/s 1.22 c/B 4090
CFB dec | 0.314 ns/B 3039 MiB/s 1.28 c/B 4089
CTR enc | 0.322 ns/B 2962 MiB/s 1.32 c/B 4090
CTR dec | 0.321 ns/B 2970 MiB/s 1.31 c/B 4090
OCB enc | 0.339 ns/B 2817 MiB/s 1.38 c/B 4089
OCB dec | 0.346 ns/B 2756 MiB/s 1.41 c/B 4089
OCB auth | 0.337 ns/B 2831 MiB/s 1.38 c/B 4089
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- | cipher/Makefile.am | 5 | ||||
-rw-r--r-- | cipher/camellia-aesni-avx2-amd64.h | 249 | ||||
-rw-r--r-- | cipher/camellia-gfni-avx2-amd64.S | 34 | ||||
-rw-r--r-- | cipher/camellia-glue.c | 170 | ||||
-rw-r--r-- | configure.ac | 3 |
5 files changed, 398 insertions, 63 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 07e5ba26..7a429e8b 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -139,8 +139,9 @@ EXTRA_libcipher_la_SOURCES = \ twofish-avx2-amd64.S \ rfc2268.c \ camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \ - camellia-aesni-avx2-amd64.h camellia-vaes-avx2-amd64.S \ - camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \ + camellia-aesni-avx2-amd64.h camellia-gfni-avx2-amd64.S \ + camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \ + camellia-arm.S camellia-aarch64.S \ blake2.c \ blake2b-amd64-avx2.S blake2s-amd64-avx.S diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index e93c40b8..8cd4b1cd 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -1,6 +1,6 @@ -/* camellia-aesni-avx2-amd64.h - AES-NI/VAES/AVX2 implementation of Camellia +/* camellia-aesni-avx2-amd64.h - AES-NI/VAES/GFNI/AVX2 implementation of Camellia * - * Copyright (C) 2013-2015,2020-2021 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (C) 2013-2015,2020-2022 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This file is part of Libgcrypt. 
* @@ -36,6 +36,8 @@ /********************************************************************** helper macros **********************************************************************/ + +#ifndef CAMELLIA_GFNI_BUILD #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ vpand x, mask4bit, tmp0; \ vpandn x, mask4bit, x; \ @@ -44,6 +46,7 @@ vpshufb tmp0, lo_t, tmp0; \ vpshufb x, hi_t, x; \ vpxor tmp0, x, x; +#endif #define ymm0_x xmm0 #define ymm1_x xmm1 @@ -71,10 +74,60 @@ #endif /********************************************************************** + GFNI helper macros and constants + **********************************************************************/ + +#ifdef CAMELLIA_GFNI_BUILD + +#define BV8(a0,a1,a2,a3,a4,a5,a6,a7) \ + ( (((a0) & 1) << 0) | \ + (((a1) & 1) << 1) | \ + (((a2) & 1) << 2) | \ + (((a3) & 1) << 3) | \ + (((a4) & 1) << 4) | \ + (((a5) & 1) << 5) | \ + (((a6) & 1) << 6) | \ + (((a7) & 1) << 7) ) + +#define BM8X8(l0,l1,l2,l3,l4,l5,l6,l7) \ + ( ((l7) << (0 * 8)) | \ + ((l6) << (1 * 8)) | \ + ((l5) << (2 * 8)) | \ + ((l4) << (3 * 8)) | \ + ((l3) << (4 * 8)) | \ + ((l2) << (5 * 8)) | \ + ((l1) << (6 * 8)) | \ + ((l0) << (7 * 8)) ) + +/* Pre-filters and post-filters constants for Camellia sboxes s1, s2, s3 and s4. + * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48. + * + * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are + * combination of function "A" (AES SubBytes affine transformation) and + * "ψ₁"/"ψ₂"/"ψ₃". + */ + +/* Constant from "θ₁(x)" and "θ₄(x)" functions. 
*/ +#define pre_filter_constant_s1234 BV8(1, 0, 1, 0, 0, 0, 1, 0) + +/* Constant from "ψ₁(A(x))" function: */ +#define post_filter_constant_s14 BV8(0, 1, 1, 1, 0, 1, 1, 0) + +/* Constant from "ψ₂(A(x))" function: */ +#define post_filter_constant_s2 BV8(0, 0, 1, 1, 1, 0, 1, 1) + +/* Constant from "ψ₃(A(x))" function: */ +#define post_filter_constant_s3 BV8(1, 1, 1, 0, 1, 1, 0, 0) + +#endif /* CAMELLIA_GFNI_BUILD */ + +/********************************************************************** 32-way camellia **********************************************************************/ -/* +#ifdef CAMELLIA_GFNI_BUILD + +/* roundsm32 (GFNI version) * IN: * x0..x7: byte-sliced AB state * mem_cd: register pointer storing CD state @@ -82,7 +135,119 @@ * OUT: * x0..x7: new byte-sliced CD state */ +#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \ + t6, t7, mem_cd, key) \ + /* \ + * S-function with AES subbytes \ + */ \ + vpbroadcastq .Lpre_filter_bitmatrix_s123 rRIP, t5; \ + vpbroadcastq .Lpre_filter_bitmatrix_s4 rRIP, t2; \ + vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \ + vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \ + vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \ + vpxor t7##_x, t7##_x, t7##_x; \ + vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ + \ + /* prefilter sboxes */ \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x7, x7; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t2, x3, x3; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t2, x6, x6; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x2, x2; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x5, x5; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x1, x1; \ + vgf2p8affineqb $(pre_filter_constant_s1234), t5, x4, x4; \ + \ + /* sbox GF8 inverse + postfilter sboxes 1 and 4 */ \ + vgf2p8affineinvqb $(post_filter_constant_s14), t4, x0, x0; \ + vgf2p8affineinvqb 
$(post_filter_constant_s14), t4, x7, x7; \ + vgf2p8affineinvqb $(post_filter_constant_s14), t4, x3, x3; \ + vgf2p8affineinvqb $(post_filter_constant_s14), t4, x6, x6; \ + \ + /* sbox GF8 inverse + postfilter sbox 3 */ \ + vgf2p8affineinvqb $(post_filter_constant_s3), t6, x2, x2; \ + vgf2p8affineinvqb $(post_filter_constant_s3), t6, x5, x5; \ + \ + /* sbox GF8 inverse + postfilter sbox 2 */ \ + vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \ + vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \ + \ + vpsrldq $1, t0, t1; \ + vpsrldq $2, t0, t2; \ + vpshufb t7, t1, t1; \ + vpsrldq $3, t0, t3; \ + \ + /* P-function */ \ + vpxor x5, x0, x0; \ + vpxor x6, x1, x1; \ + vpxor x7, x2, x2; \ + vpxor x4, x3, x3; \ + \ + vpshufb t7, t2, t2; \ + vpsrldq $4, t0, t4; \ + vpshufb t7, t3, t3; \ + vpsrldq $5, t0, t5; \ + vpshufb t7, t4, t4; \ + \ + vpxor x2, x4, x4; \ + vpxor x3, x5, x5; \ + vpxor x0, x6, x6; \ + vpxor x1, x7, x7; \ + \ + vpsrldq $6, t0, t6; \ + vpshufb t7, t5, t5; \ + vpshufb t7, t6, t6; \ + \ + vpxor x7, x0, x0; \ + vpxor x4, x1, x1; \ + vpxor x5, x2, x2; \ + vpxor x6, x3, x3; \ + \ + vpxor x3, x4, x4; \ + vpxor x0, x5, x5; \ + vpxor x1, x6, x6; \ + vpxor x2, x7, x7; /* note: high and low parts swapped */ \ + \ + /* Add key material and result to CD (x becomes new CD) */ \ + \ + vpxor t6, x1, x1; \ + vpxor 5 * 32(mem_cd), x1, x1; \ + \ + vpsrldq $7, t0, t6; \ + vpshufb t7, t0, t0; \ + vpshufb t7, t6, t7; \ + \ + vpxor t7, x0, x0; \ + vpxor 4 * 32(mem_cd), x0, x0; \ + \ + vpxor t5, x2, x2; \ + vpxor 6 * 32(mem_cd), x2, x2; \ + \ + vpxor t4, x3, x3; \ + vpxor 7 * 32(mem_cd), x3, x3; \ + \ + vpxor t3, x4, x4; \ + vpxor 0 * 32(mem_cd), x4, x4; \ + \ + vpxor t2, x5, x5; \ + vpxor 1 * 32(mem_cd), x5, x5; \ + \ + vpxor t1, x6, x6; \ + vpxor 2 * 32(mem_cd), x6, x6; \ + \ + vpxor t0, x7, x7; \ + vpxor 3 * 32(mem_cd), x7, x7; +#else /* CAMELLIA_GFNI_BUILD */ + +/* roundsm32 (AES-NI / VAES version) + * IN: + * x0..x7: byte-sliced AB state + * mem_cd: 
register pointer storing CD state + * key: index for key material + * OUT: + * x0..x7: new byte-sliced CD state + */ #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \ t6, t7, mem_cd, key) \ /* \ @@ -181,7 +346,7 @@ /* postfilter sbox 2 */ \ filter_8bit(x1, t4, t5, t7, t2); \ filter_8bit(x4, t4, t5, t7, t2); \ - vpxor t7, t7, t7; \ + vpxor t7##_x, t7##_x, t7##_x; \ \ vpsrldq $1, t0, t1; \ vpsrldq $2, t0, t2; \ @@ -249,6 +414,8 @@ vpxor t0, x7, x7; \ vpxor 3 * 32(mem_cd), x7, x7; +#endif /* CAMELLIA_GFNI_BUILD */ + /* * IN/OUT: * x0..x7: byte-sliced AB state preloaded @@ -623,6 +790,9 @@ #define SHUFB_BYTES(idx) \ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) +FUNC_NAME(_constants): +ELF(.type FUNC_NAME(_constants),@object;) + .Lshufb_16x16b: .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) @@ -635,6 +805,74 @@ .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +#ifdef CAMELLIA_GFNI_BUILD + +/* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3 + * and s4. + * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48. + * + * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are + * combination of function "A" (AES SubBytes affine transformation) and + * "ψ₁"/"ψ₂"/"ψ₃". 
+ */ + +/* Bit-matrix from "θ₁(x)" function: */ +.Lpre_filter_bitmatrix_s123: + .quad BM8X8(BV8(1, 1, 1, 0, 1, 1, 0, 1), + BV8(0, 0, 1, 1, 0, 0, 1, 0), + BV8(1, 1, 0, 1, 0, 0, 0, 0), + BV8(1, 0, 1, 1, 0, 0, 1, 1), + BV8(0, 0, 0, 0, 1, 1, 0, 0), + BV8(1, 0, 1, 0, 0, 1, 0, 0), + BV8(0, 0, 1, 0, 1, 1, 0, 0), + BV8(1, 0, 0, 0, 0, 1, 1, 0)) + +/* Bit-matrix from "θ₄(x)" function: */ +.Lpre_filter_bitmatrix_s4: + .quad BM8X8(BV8(1, 1, 0, 1, 1, 0, 1, 1), + BV8(0, 1, 1, 0, 0, 1, 0, 0), + BV8(1, 0, 1, 0, 0, 0, 0, 1), + BV8(0, 1, 1, 0, 0, 1, 1, 1), + BV8(0, 0, 0, 1, 1, 0, 0, 0), + BV8(0, 1, 0, 0, 1, 0, 0, 1), + BV8(0, 1, 0, 1, 1, 0, 0, 0), + BV8(0, 0, 0, 0, 1, 1, 0, 1)) + +/* Bit-matrix from "ψ₁(A(x))" function: */ +.Lpost_filter_bitmatrix_s14: + .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1), + BV8(0, 1, 1, 0, 0, 1, 1, 0), + BV8(1, 0, 1, 1, 1, 1, 1, 0), + BV8(0, 0, 0, 1, 1, 0, 1, 1), + BV8(1, 0, 0, 0, 1, 1, 1, 0), + BV8(0, 1, 0, 1, 1, 1, 1, 0), + BV8(0, 1, 1, 1, 1, 1, 1, 1), + BV8(0, 0, 0, 1, 1, 1, 0, 0)) + +/* Bit-matrix from "ψ₂(A(x))" function: */ +.Lpost_filter_bitmatrix_s2: + .quad BM8X8(BV8(0, 0, 0, 1, 1, 1, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 1), + BV8(0, 1, 1, 0, 0, 1, 1, 0), + BV8(1, 0, 1, 1, 1, 1, 1, 0), + BV8(0, 0, 0, 1, 1, 0, 1, 1), + BV8(1, 0, 0, 0, 1, 1, 1, 0), + BV8(0, 1, 0, 1, 1, 1, 1, 0), + BV8(0, 1, 1, 1, 1, 1, 1, 1)) + +/* Bit-matrix from "ψ₃(A(x))" function: */ +.Lpost_filter_bitmatrix_s3: + .quad BM8X8(BV8(0, 1, 1, 0, 0, 1, 1, 0), + BV8(1, 0, 1, 1, 1, 1, 1, 0), + BV8(0, 0, 0, 1, 1, 0, 1, 1), + BV8(1, 0, 0, 0, 1, 1, 1, 0), + BV8(0, 1, 0, 1, 1, 1, 1, 0), + BV8(0, 1, 1, 1, 1, 1, 1, 1), + BV8(0, 0, 0, 1, 1, 1, 0, 0), + BV8(0, 0, 0, 0, 0, 0, 0, 1)) + +#else /* CAMELLIA_GFNI_BUILD */ + /* * pre-SubByte transform * @@ -756,6 +994,9 @@ .L0f0f0f0f: .long 0x0f0f0f0f +#endif /* CAMELLIA_GFNI_BUILD */ + +ELF(.size FUNC_NAME(_constants),.-FUNC_NAME(_constants);) .align 8 ELF(.type __camellia_enc_blk32,@function;) diff --git a/cipher/camellia-gfni-avx2-amd64.S 
b/cipher/camellia-gfni-avx2-amd64.S new file mode 100644 index 00000000..20c9a432 --- /dev/null +++ b/cipher/camellia-gfni-avx2-amd64.S @@ -0,0 +1,34 @@ +/* camellia-gfni-avx2-amd64.S - GFNI/AVX2 implementation of Camellia cipher + * + * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <config.h> + +#ifdef __x86_64 +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ + defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) + +#define CAMELLIA_GFNI_BUILD 1 +#define FUNC_NAME(func) _gcry_camellia_gfni_avx2_ ## func + +#include "camellia-aesni-avx2-amd64.h" + +#endif /* defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) */ +#endif /* __x86_64 */ diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 72c02d77..7f009db4 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -97,6 +97,12 @@ # define USE_VAES_AVX2 1 #endif +/* USE_GFNI_AVX2 indicates whether to compile with Intel GFNI/AVX2 code. 
*/ +#undef USE_GFNI_AVX2 +#if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT) +# define USE_GFNI_AVX2 1 +#endif + typedef struct { KEY_TABLE_TYPE keytable; @@ -107,6 +113,7 @@ typedef struct #ifdef USE_AESNI_AVX2 unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used. */ unsigned int use_vaes_avx2:1; /* VAES/AVX2 implementation shall be used. */ + unsigned int use_gfni_avx2:1; /* GFNI/AVX2 implementation shall be used. */ #endif /*USE_AESNI_AVX2*/ } CAMELLIA_context; @@ -248,6 +255,46 @@ extern void _gcry_camellia_vaes_avx2_ocb_auth(CAMELLIA_context *ctx, const u64 Ls[32]) ASM_FUNC_ABI; #endif +#ifdef USE_GFNI_AVX2 +/* Assembler implementations of Camellia using GFNI and AVX2. Process data + in 32 block same time. + */ +extern void _gcry_camellia_gfni_avx2_ctr_enc(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *ctr) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx2_cbc_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx2_cfb_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx2_ocb_enc(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx2_ocb_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; + +extern void _gcry_camellia_gfni_avx2_ocb_auth(CAMELLIA_context *ctx, + const unsigned char *abuf, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[32]) ASM_FUNC_ABI; +#endif + static const char *selftest(void); static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr, @@ -272,7 +319,8 @@ camellia_setkey(void *c, const byte *key, 
unsigned keylen, CAMELLIA_context *ctx=c; static int initialized=0; static const char *selftest_failed=NULL; -#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || defined(USE_VAES_AVX2) +#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) \ + || defined(USE_VAES_AVX2) || defined(USE_GFNI_AVX2) unsigned int hwf = _gcry_get_hw_features (); #endif @@ -296,10 +344,14 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, #ifdef USE_AESNI_AVX2 ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2); ctx->use_vaes_avx2 = 0; + ctx->use_gfni_avx2 = 0; #endif #ifdef USE_VAES_AVX2 ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2); #endif +#ifdef USE_GFNI_AVX2 + ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2); +#endif ctx->keybitlength=keylen*8; @@ -440,20 +492,22 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; + typeof (&_gcry_camellia_aesni_avx2_ctr_enc) bulk_ctr_fn = + _gcry_camellia_aesni_avx2_ctr_enc; + #ifdef USE_VAES_AVX2 - int use_vaes = ctx->use_vaes_avx2; + if (ctx->use_vaes_avx2) + bulk_ctr_fn =_gcry_camellia_vaes_avx2_ctr_enc; +#endif +#ifdef USE_GFNI_AVX2 + if (ctx->use_gfni_avx2) + bulk_ctr_fn =_gcry_camellia_gfni_avx2_ctr_enc; #endif /* Process data in 32 block chunks. 
*/ while (nblocks >= 32) { -#ifdef USE_VAES_AVX2 - if (use_vaes) - _gcry_camellia_vaes_avx2_ctr_enc(ctx, outbuf, inbuf, ctr); - else -#endif - _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr); - + bulk_ctr_fn (ctx, outbuf, inbuf, ctr); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; inbuf += 32 * CAMELLIA_BLOCK_SIZE; @@ -537,20 +591,22 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; + typeof (&_gcry_camellia_aesni_avx2_cbc_dec) bulk_cbc_fn = + _gcry_camellia_aesni_avx2_cbc_dec; + #ifdef USE_VAES_AVX2 - int use_vaes = ctx->use_vaes_avx2; + if (ctx->use_vaes_avx2) + bulk_cbc_fn =_gcry_camellia_vaes_avx2_cbc_dec; +#endif +#ifdef USE_GFNI_AVX2 + if (ctx->use_gfni_avx2) + bulk_cbc_fn =_gcry_camellia_gfni_avx2_cbc_dec; #endif /* Process data in 32 block chunks. */ while (nblocks >= 32) { -#ifdef USE_VAES_AVX2 - if (use_vaes) - _gcry_camellia_vaes_avx2_cbc_dec(ctx, outbuf, inbuf, iv); - else -#endif - _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv); - + bulk_cbc_fn (ctx, outbuf, inbuf, iv); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; inbuf += 32 * CAMELLIA_BLOCK_SIZE; @@ -631,20 +687,22 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; + typeof (&_gcry_camellia_aesni_avx2_cfb_dec) bulk_cfb_fn = + _gcry_camellia_aesni_avx2_cfb_dec; + #ifdef USE_VAES_AVX2 - int use_vaes = ctx->use_vaes_avx2; + if (ctx->use_vaes_avx2) + bulk_cfb_fn =_gcry_camellia_vaes_avx2_cfb_dec; +#endif +#ifdef USE_GFNI_AVX2 + if (ctx->use_gfni_avx2) + bulk_cfb_fn =_gcry_camellia_gfni_avx2_cfb_dec; #endif /* Process data in 32 block chunks. 
*/ while (nblocks >= 32) { -#ifdef USE_VAES_AVX2 - if (use_vaes) - _gcry_camellia_vaes_avx2_cfb_dec(ctx, outbuf, inbuf, iv); - else -#endif - _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv); - + bulk_cfb_fn (ctx, outbuf, inbuf, iv); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; inbuf += 32 * CAMELLIA_BLOCK_SIZE; @@ -729,10 +787,6 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; -#ifdef USE_VAES_AVX2 - int encrypt_use_vaes = encrypt && ctx->use_vaes_avx2; - int decrypt_use_vaes = !encrypt && ctx->use_vaes_avx2; -#endif u64 Ls[32]; unsigned int n = 32 - (blkn % 32); u64 *l; @@ -740,6 +794,21 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, if (nblocks >= 32) { + typeof (&_gcry_camellia_aesni_avx2_ocb_dec) bulk_ocb_fn = + encrypt ? _gcry_camellia_aesni_avx2_ocb_enc + : _gcry_camellia_aesni_avx2_ocb_dec; + +#ifdef USE_VAES_AVX2 + if (ctx->use_vaes_avx2) + bulk_ocb_fn = encrypt ? _gcry_camellia_vaes_avx2_ocb_enc + : _gcry_camellia_vaes_avx2_ocb_dec; +#endif +#ifdef USE_GFNI_AVX2 + if (ctx->use_gfni_avx2) + bulk_ocb_fn = encrypt ? 
_gcry_camellia_gfni_avx2_ocb_enc + : _gcry_camellia_gfni_avx2_ocb_dec; +#endif + for (i = 0; i < 32; i += 8) { /* Use u64 to store pointers for x32 support (assembly function @@ -764,21 +833,7 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, blkn += 32; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32); - if (0) {} -#ifdef USE_VAES_AVX2 - else if (encrypt_use_vaes) - _gcry_camellia_vaes_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); - else if (decrypt_use_vaes) - _gcry_camellia_vaes_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); -#endif - else if (encrypt) - _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); - else - _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, Ls); + bulk_ocb_fn (ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls); nblocks -= 32; outbuf += 32 * CAMELLIA_BLOCK_SIZE; @@ -891,9 +946,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, if (ctx->use_aesni_avx2) { int did_use_aesni_avx2 = 0; -#ifdef USE_VAES_AVX2 - int use_vaes = ctx->use_vaes_avx2; -#endif u64 Ls[32]; unsigned int n = 32 - (blkn % 32); u64 *l; @@ -901,6 +953,18 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, if (nblocks >= 32) { + typeof (&_gcry_camellia_aesni_avx2_ocb_auth) bulk_auth_fn = + _gcry_camellia_aesni_avx2_ocb_auth; + +#ifdef USE_VAES_AVX2 + if (ctx->use_vaes_avx2) + bulk_auth_fn = _gcry_camellia_vaes_avx2_ocb_auth; +#endif +#ifdef USE_GFNI_AVX2 + if (ctx->use_gfni_avx2) + bulk_auth_fn = _gcry_camellia_gfni_avx2_ocb_auth; +#endif + for (i = 0; i < 32; i += 8) { /* Use u64 to store pointers for x32 support (assembly function @@ -925,16 +989,8 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, blkn += 32; *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32); -#ifdef USE_VAES_AVX2 - if (use_vaes) - _gcry_camellia_vaes_avx2_ocb_auth(ctx, abuf, - c->u_mode.ocb.aad_offset, - 
c->u_mode.ocb.aad_sum, Ls); - else -#endif - _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, - c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, Ls); + bulk_auth_fn (ctx, abuf, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, Ls); nblocks -= 32; abuf += 32 * CAMELLIA_BLOCK_SIZE; diff --git a/configure.ac b/configure.ac index 15c92018..c5d61657 100644 --- a/configure.ac +++ b/configure.ac @@ -2755,6 +2755,9 @@ if test "$found" = "1" ; then # Build with the VAES/AVX2 implementation GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-vaes-avx2-amd64.lo" + + # Build with the GFNI/AVX2 implementation + GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx2-amd64.lo" fi fi fi |