diff options
author | Tianjia Zhang <tianjia.zhang@linux.alibaba.com> | 2022-04-01 17:17:36 +0800 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2022-04-04 18:49:54 +0300 |
commit | fe891ff4a3cdc74957b215db4a9a9e01fefe0cd4 (patch) | |
tree | 550178e80eec6f90641f463fb4b04637b5d638c4 | |
parent | 29bfb3ebbc63d7ed18b916c5c6946790fb3d15df (diff) | |
download | libgcrypt-fe891ff4a3cdc74957b215db4a9a9e01fefe0cd4.tar.gz |
Add SM3 ARMv8/AArch64/CE assembly implementation
* cipher/Makefile.am: Add 'sm3-armv8-aarch64-ce.S'.
* cipher/sm3-armv8-aarch64-ce.S: New.
* cipher/sm3.c (USE_ARM_CE): New.
[USE_ARM_CE] (_gcry_sm3_transform_armv8_ce)
(do_sm3_transform_armv8_ce): New.
(sm3_init) [USE_ARM_CE]: New.
* configure.ac: Add 'sm3-armv8-aarch64-ce.lo'.
--
Benchmark on T-Head Yitian-710 2.75 GHz:
Before:
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
SM3 | 2.84 ns/B 335.3 MiB/s 7.82 c/B 2749
After (~55% faster):
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
SM3 | 1.84 ns/B 518.1 MiB/s 5.06 c/B 2749
Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
-rw-r--r-- | cipher/Makefile.am | 2 | ||||
-rw-r--r-- | cipher/sm3-armv8-aarch64-ce.S | 218 | ||||
-rw-r--r-- | cipher/sm3.c | 28 | ||||
-rw-r--r-- | configure.ac | 1 |
4 files changed, 248 insertions, 1 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 1ac1923b..30be9f98 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -130,7 +130,7 @@ EXTRA_libcipher_la_SOURCES = \ sha512-avx2-bmi2-amd64.S sha512-avx512-amd64.S \ sha512-armv7-neon.S sha512-arm.S \ sha512-ppc.c sha512-ssse3-i386.c \ - sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S \ + sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \ keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \ stribog.c \ tiger.c \ diff --git a/cipher/sm3-armv8-aarch64-ce.S b/cipher/sm3-armv8-aarch64-ce.S new file mode 100644 index 00000000..0900b84f --- /dev/null +++ b/cipher/sm3-armv8-aarch64-ce.S @@ -0,0 +1,218 @@ +/* sm3-armv8-aarch64-ce.S - ARMv8/AArch64/CE accelerated SM3 cipher + * + * Copyright (C) 2022 Alibaba Group. + * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "asm-common-aarch64.h" + +#if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \ + defined(USE_SM3) + +.cpu generic+simd+crypto + +/* Must be consistent with register macros */ +#define vecnum_v0 0 +#define vecnum_v1 1 +#define vecnum_v2 2 +#define vecnum_v3 3 +#define vecnum_v4 4 +#define vecnum_CTX1 16 +#define vecnum_CTX2 17 +#define vecnum_SS1 18 +#define vecnum_WT 19 +#define vecnum_K0 20 +#define vecnum_K1 21 +#define vecnum_K2 22 +#define vecnum_K3 23 +#define vecnum_RTMP0 24 +#define vecnum_RTMP1 25 + +#define sm3partw1(vd, vn, vm) \ + .inst (0xce60c000 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd) + +#define sm3partw2(vd, vn, vm) \ + .inst (0xce60c400 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd) + +#define sm3ss1(vd, vn, vm, va) \ + .inst (0xce400000 | (vecnum_##vm << 16) | (vecnum_##va << 10) \ + | (vecnum_##vn << 5) | vecnum_##vd) + +#define sm3tt1a(vd, vn, vm, imm2) \ + .inst (0xce408000 | (vecnum_##vm << 16) | imm2 << 12 \ + | (vecnum_##vn << 5) | vecnum_##vd) + +#define sm3tt1b(vd, vn, vm, imm2) \ + .inst (0xce408400 | (vecnum_##vm << 16) | imm2 << 12 \ + | (vecnum_##vn << 5) | vecnum_##vd) + +#define sm3tt2a(vd, vn, vm, imm2) \ + .inst (0xce408800 | (vecnum_##vm << 16) | imm2 << 12 \ + | (vecnum_##vn << 5) | vecnum_##vd) + +#define sm3tt2b(vd, vn, vm, imm2) \ + .inst (0xce408c00 | (vecnum_##vm << 16) | imm2 << 12 \ + | (vecnum_##vn << 5) | vecnum_##vd) + +/* Constants */ + +.text +.align 4 +ELF(.type _gcry_sm3_armv8_ce_consts,@object) +_gcry_sm3_armv8_ce_consts: +.Lsm3_Ktable: + .long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb + .long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc + .long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce + .long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6 + .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c + .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce + .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec + .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5 + .long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53 + .long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d + .long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4 + .long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43 + .long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c + .long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce + .long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec + .long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5 +ELF(.size _gcry_sm3_armv8_ce_consts,.-_gcry_sm3_armv8_ce_consts) + +/* Register macros */ + +/* Must be consistent with vecnum_ macros */ +#define CTX1 v16 +#define CTX2 v17 +#define SS1 v18 +#define WT v19 + +#define K0 v20 +#define K1 v21 +#define K2 v22 +#define K3 v23 + +#define RTMP0 v24 +#define RTMP1 v25 + +/* Helper macros. */ + +#define _(...) /*_*/ + +#define SCHED_W_1(s0, s1, s2, s3, s4) ext s4.16b, s1.16b, s2.16b, #12 +#define SCHED_W_2(s0, s1, s2, s3, s4) ext RTMP0.16b, s0.16b, s1.16b, #12 +#define SCHED_W_3(s0, s1, s2, s3, s4) ext RTMP1.16b, s2.16b, s3.16b, #8 +#define SCHED_W_4(s0, s1, s2, s3, s4) sm3partw1(s4, s0, s3) +#define SCHED_W_5(s0, s1, s2, s3, s4) sm3partw2(s4, RTMP1, RTMP0) + +#define SCHED_W(n, s0, s1, s2, s3, s4) SCHED_W_##n(s0, s1, s2, s3, s4) + +#define R(ab, s0, s1, s2, s3, s4, IOP) \ + ld4 {K0.s, K1.s, K2.s, K3.s}[3], [x3], #16; \ + eor WT.16b, s0.16b, s1.16b; \ + \ + sm3ss1(SS1, CTX1, CTX2, K0); \ + IOP(1, s0, s1, s2, s3, s4); \ + sm3tt1##ab(CTX1, SS1, WT, 0); \ + sm3tt2##ab(CTX2, SS1, s0, 0); \ + \ + IOP(2, s0, s1, s2, s3, s4); \ + sm3ss1(SS1, CTX1, CTX2, K1); \ + IOP(3, s0, s1, s2, s3, s4); \ + sm3tt1##ab(CTX1, SS1, WT, 1); \ + sm3tt2##ab(CTX2, SS1, s0, 1); \ + \ + sm3ss1(SS1, CTX1, CTX2, K2); \ + IOP(4, s0, s1, s2, s3, s4); \ + sm3tt1##ab(CTX1, SS1, WT, 2); \ + sm3tt2##ab(CTX2, SS1, s0, 2); \ + \ + sm3ss1(SS1, CTX1, CTX2, K3); \ + IOP(5, s0, s1, s2, s3, s4); \ + sm3tt1##ab(CTX1, SS1, WT, 3); \ + sm3tt2##ab(CTX2, SS1, s0, 3); + +#define R1(s0, s1, s2, s3, s4, IOP) R(a, s0, s1, s2, s3, s4, IOP) +#define R2(s0, s1, s2, s3, s4, IOP) R(b, s0, s1, s2, s3, s4, IOP) + +.align 3 +.global _gcry_sm3_transform_armv8_ce +ELF(.type _gcry_sm3_transform_armv8_ce,%function;) +_gcry_sm3_transform_armv8_ce: + /* input: + * x0: CTX + * x1: data + * x2: nblocks + */ + CFI_STARTPROC(); + + ld1 {CTX1.4s, CTX2.4s}, [x0]; + rev64 CTX1.4s, CTX1.4s; + rev64 CTX2.4s, CTX2.4s; + ext CTX1.16b, CTX1.16b, CTX1.16b, #8; + ext CTX2.16b, CTX2.16b, CTX2.16b, #8; + +.Lloop: + GET_DATA_POINTER(x3, .Lsm3_Ktable); + ld1 {v0.16b-v3.16b}, [x1], #64; + sub x2, x2, #1; + + mov v6.16b, CTX1.16b; + mov v7.16b, CTX2.16b; + + rev32 v0.16b, v0.16b; + rev32 v1.16b, v1.16b; + rev32 v2.16b, v2.16b; + rev32 v3.16b, v3.16b; + + R1(v0, v1, v2, v3, v4, SCHED_W); + R1(v1, v2, v3, v4, v0, SCHED_W); + R1(v2, v3, v4, v0, v1, SCHED_W); + R1(v3, v4, v0, v1, v2, SCHED_W); + R2(v4, v0, v1, v2, v3, SCHED_W); + R2(v0, v1, v2, v3, v4, SCHED_W); + R2(v1, v2, v3, v4, v0, SCHED_W); + R2(v2, v3, v4, v0, v1, SCHED_W); + R2(v3, v4, v0, v1, v2, SCHED_W); + R2(v4, v0, v1, v2, v3, SCHED_W); + R2(v0, v1, v2, v3, v4, SCHED_W); + R2(v1, v2, v3, v4, v0, SCHED_W); + R2(v2, v3, v4, v0, v1, SCHED_W); + R2(v3, v4, v0, v1, v2, _); + R2(v4, v0, v1, v2, v3, _); + R2(v0, v1, v2, v3, v4, _); + + eor CTX1.16b, CTX1.16b, v6.16b; + eor CTX2.16b, CTX2.16b, v7.16b; + + cbnz x2, .Lloop; + + /* save state */ + rev64 CTX1.4s, CTX1.4s; + rev64 CTX2.4s, CTX2.4s; + ext CTX1.16b, CTX1.16b, CTX1.16b, #8; + ext CTX2.16b, CTX2.16b, CTX2.16b, #8; + st1 {CTX1.4s, CTX2.4s}, [x0]; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_sm3_transform_armv8_ce, .-_gcry_sm3_transform_armv8_ce;) + +#endif diff --git a/cipher/sm3.c b/cipher/sm3.c index 0ab5f506..bfe9f4c2 100644 --- a/cipher/sm3.c +++ b/cipher/sm3.c @@ -67,6 +67,16 @@ # endif #endif +/* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension code. */ +#undef USE_ARM_CE +#ifdef ENABLE_ARM_CRYPTO_SUPPORT +# if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) +# define USE_ARM_CE 1 +# endif +#endif + typedef struct { gcry_md_block_ctx_t bctx; @@ -117,6 +127,20 @@ do_sm3_transform_aarch64(void *context, const unsigned char *data, size_t nblks) } #endif /* USE_AARCH64_SIMD */ +#ifdef USE_ARM_CE +void _gcry_sm3_transform_armv8_ce(void *state, const void *input_data, + size_t num_blks); + +static unsigned int +do_sm3_transform_armv8_ce(void *context, const unsigned char *data, + size_t nblks) +{ + SM3_CONTEXT *hd = context; + _gcry_sm3_transform_armv8_ce (hd->h, data, nblks); + return 0; +} +#endif /* USE_ARM_CE */ + static unsigned int transform (void *c, const unsigned char *data, size_t nblks); @@ -153,6 +177,10 @@ sm3_init (void *context, unsigned int flags) if (features & HWF_ARM_NEON) hd->bctx.bwrite = do_sm3_transform_aarch64; #endif +#ifdef USE_ARM_CE + if (features & HWF_ARM_SM3) + hd->bctx.bwrite = do_sm3_transform_armv8_ce; +#endif (void)features; } diff --git a/configure.ac b/configure.ac index e214082b..fc49bb86 100644 --- a/configure.ac +++ b/configure.ac @@ -3049,6 +3049,7 @@ if test "$found" = "1" ; then aarch64-*-*) # Build with the assembly implementation GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-aarch64.lo" + GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-armv8-aarch64-ce.lo" ;; esac fi |