From 0b2da804ee813eee22c386ba7f253415103b34ea Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Wed, 1 Mar 2023 21:03:09 +0200
Subject: Add PowerPC vector implementation of SM4

* cipher/Makefile.am: Add 'sm4-ppc.c'.
* cipher/sm4-ppc.c: New.
* cipher/sm4.c (USE_PPC_CRYPTO): New.
(SM4_context): Add 'use_ppc8le' and 'use_ppc9le'.
[USE_PPC_CRYPTO] (_gcry_sm4_ppc8le_crypt_blk1_16)
(_gcry_sm4_ppc9le_crypt_blk1_16, sm4_ppc8le_crypt_blk1_16)
(sm4_ppc9le_crypt_blk1_16): New.
(sm4_setkey) [USE_PPC_CRYPTO]: Set use_ppc8le and use_ppc9le based on
HW features.
(sm4_get_crypt_blk1_16_fn) [USE_PPC_CRYPTO]: Add PowerPC implementation
selection.
--

Benchmark on POWER9:

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     14.47 ns/B     65.89 MiB/s     33.29 c/B
        ECB dec |     14.47 ns/B     65.89 MiB/s     33.29 c/B
        CBC enc |     35.09 ns/B     27.18 MiB/s     80.71 c/B
        CBC dec |     16.69 ns/B     57.13 MiB/s     38.39 c/B
        CFB enc |     35.09 ns/B     27.18 MiB/s     80.71 c/B
        CFB dec |     16.76 ns/B     56.90 MiB/s     38.55 c/B
        CTR enc |     16.88 ns/B     56.50 MiB/s     38.82 c/B
        CTR dec |     16.88 ns/B     56.50 MiB/s     38.82 c/B

After (ECB ~4.4x faster):
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      3.26 ns/B     292.3 MiB/s      7.50 c/B
        ECB dec |      3.26 ns/B     292.3 MiB/s      7.50 c/B
        CBC enc |     35.10 ns/B     27.17 MiB/s     80.72 c/B
        CBC dec |      3.33 ns/B     286.3 MiB/s      7.66 c/B
        CFB enc |     35.10 ns/B     27.17 MiB/s     80.74 c/B
        CFB dec |      3.36 ns/B     283.8 MiB/s      7.73 c/B
        CTR enc |      3.47 ns/B     275.0 MiB/s      7.98 c/B
        CTR dec |      3.47 ns/B     275.0 MiB/s      7.98 c/B

Signed-off-by: Jussi Kivilinna
---
 cipher/Makefile.am |   7 ++
 cipher/sm4-ppc.c   | 342 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 cipher/sm4.c       |  48 ++++++++
 3 files changed, 397 insertions(+)
 create mode 100644 cipher/sm4-ppc.c

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index dcaa68bb..cf1fbe85 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -123,6 +123,7 @@ EXTRA_libcipher_la_SOURCES = \
 	sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \
 	sm4-gfni-avx2-amd64.S sm4-gfni-avx512-amd64.S \
 	sm4-aarch64.S sm4-armv8-aarch64-ce.S sm4-armv9-aarch64-sve-ce.S \
+	sm4-ppc.c \
 	serpent-avx2-amd64.S serpent-armv7-neon.S \
 	sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
 	sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
@@ -309,3 +310,9 @@ camellia-aarch64-ce.o: $(srcdir)/camellia-aarch64-ce.c Makefile
 
 camellia-aarch64-ce.lo: $(srcdir)/camellia-aarch64-ce.c Makefile
 	`echo $(LTCOMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) `
+
+sm4-ppc.o: $(srcdir)/sm4-ppc.c Makefile
+	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+sm4-ppc.lo: $(srcdir)/sm4-ppc.c Makefile
+	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
diff --git a/cipher/sm4-ppc.c b/cipher/sm4-ppc.c
new file mode 100644
index 00000000..bb2c55e0
--- /dev/null
+++ b/cipher/sm4-ppc.c
@@ -0,0 +1,342 @@
+/* sm4-ppc.c - PowerPC implementation of SM4 cipher
+ *
+ * Copyright (C) 2023 Jussi Kivilinna
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+    defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+    defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+    !defined(WORDS_BIGENDIAN) && (__GNUC__ >= 4)
+
+#include <altivec.h>
+#include "bufhelp.h"
+
+typedef vector unsigned char vector16x_u8;
+typedef vector unsigned int vector4x_u32;
+typedef vector unsigned long long vector2x_u64;
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET)
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("arch=pwr8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("arch=pwr9")))
+# define HAVE_FUNC_ATTR_TARGET 1
+#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET)
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9")))
+# define HAVE_FUNC_ATTR_TARGET 1
+#else
+# define FUNC_ATTR_TARGET_P8
+# define FUNC_ATTR_TARGET_P9
+# undef HAVE_FUNC_ATTR_TARGET
+#endif
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR          NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+#ifdef __clang__
+/* clang has mismatching prototype for vec_sbox_be. */
+static ASM_FUNC_ATTR_INLINE vector16x_u8
+asm_sbox_be(vector16x_u8 b)
+{
+  vector16x_u8 o;
+  __asm__ ("vsbox %0, %1\n\t" : "=v" (o) : "v" (b));
+  return o;
+}
+#undef vec_sbox_be
+#define vec_sbox_be asm_sbox_be
+#endif /* __clang__ */
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+  t2 = (vector4x_u32)vec_mergel((vector4x_u32)x0, (vector4x_u32)x1); \
+  x0 = (vector4x_u32)vec_mergeh((vector4x_u32)x0, (vector4x_u32)x1); \
+  \
+  t1 = (vector4x_u32)vec_mergeh((vector4x_u32)x2, (vector4x_u32)x3); \
+  x2 = (vector4x_u32)vec_mergel((vector4x_u32)x2, (vector4x_u32)x3); \
+  \
+  x1 = (vector4x_u32)vec_mergel((vector2x_u64)x0, (vector2x_u64)t1); \
+  x0 = (vector4x_u32)vec_mergeh((vector2x_u64)x0, (vector2x_u64)t1); \
+  \
+  x3 = (vector4x_u32)vec_mergel((vector2x_u64)t2, (vector2x_u64)x2); \
+  x2 = (vector4x_u32)vec_mergeh((vector2x_u64)t2, (vector2x_u64)x2);
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) ({ \
+  tmp0 = x & mask4bit; \
+  x = (vector4x_u32)((vector16x_u8)x >> 4); \
+  \
+  tmp0 = (vector4x_u32)vec_perm((vector16x_u8)lo_t, (vector16x_u8)lo_t, \
+                                (vector16x_u8)tmp0); \
+  x = (vector4x_u32)vec_perm((vector16x_u8)hi_t, (vector16x_u8)hi_t, \
+                             (vector16x_u8)x); \
+  x = x ^ tmp0; \
+  })
+
+#define GET_RKEY(round) vec_splat(r4keys, round)
+
+#define ROUND4(round, s0, s1, s2, s3) ({ \
+  vector4x_u32 rkey = GET_RKEY(round); \
+  vector4x_u32 rx0 = rkey ^ s1 ^ s2 ^ s3; \
+  filter_8bit(rx0, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \
+  rx0 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx0); \
+  filter_8bit(rx0, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \
+  s0 ^= rx0 ^ vec_rl(rx0, rotate2) ^ vec_rl(rx0, rotate10) ^ \
+        vec_rl(rx0, rotate18) ^ vec_rl(rx0, rotate24); \
+  })
+
+#define ROUND8(round, s0, s1, s2, s3, r0, r1, r2, r3) ({ \
+  vector4x_u32 rkey = GET_RKEY(round); \
+  vector4x_u32 rx0 = rkey ^ s1 ^ s2 ^ s3; \
+  vector4x_u32 rx1 = rkey ^ r1 ^ r2 ^ r3; \
+  filter_8bit(rx0, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \
+  filter_8bit(rx1, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \
+  rx0 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx0); \
+  rx1 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx1); \
+  filter_8bit(rx0, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \
+  filter_8bit(rx1, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \
+  s0 ^= rx0 ^ vec_rl(rx0, rotate2) ^ vec_rl(rx0, rotate10) ^ \
+        vec_rl(rx0, rotate18) ^ vec_rl(rx0, rotate24); \
+  r0 ^= rx1 ^ vec_rl(rx1, rotate2) ^ vec_rl(rx1, rotate10) ^ \
+        vec_rl(rx1, rotate18) ^ vec_rl(rx1, rotate24); \
+  })
+
+static const vector4x_u32 mask_0f =
+  { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f };
+static const vector2x_u64 pre_tf_lo_s =
+  { 0x9096E3E575730600ULL, 0xC6C0B5B323255056ULL };
+static const vector2x_u64 pre_tf_hi_s =
+  { 0xE341AA08EA48A301ULL, 0xF153B81AF85AB113ULL };
+static const vector2x_u64 post_tf_lo_s =
+  { 0x6F53C6FA95A93C00ULL, 0xD9E5704C231F8AB6ULL };
+static const vector2x_u64 post_tf_hi_s =
+  { 0x9A4635E9479BE834ULL, 0x25F98A56F824578BULL };
+static const vector4x_u32 rotate2 = { 2, 2, 2, 2 };
+static const vector4x_u32 rotate10 = { 10, 10, 10, 10 };
+static const vector4x_u32 rotate18 = { 18, 18, 18, 18 };
+static const vector4x_u32 rotate24 = { 24, 24, 24, 24 };
+
+static ASM_FUNC_ATTR_INLINE void
+sm4_ppc_crypt_blk16(u32 *rk, byte *out, const byte *in)
+{
+  vector4x_u32 ra0, ra1, ra2, ra3;
+  vector4x_u32 rb0, rb1, rb2, rb3;
+  vector4x_u32 rc0, rc1, rc2, rc3;
+  vector4x_u32 rd0, rd1, rd2, rd3;
+  vector4x_u32 tmp0, tmp1;
+  u32 *rk_end;
+
+  ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
+  ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
+  ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
+  ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
+  rb0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16));
+  rb1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16));
+  rb2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16));
+  rb3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16));
+  in += 8 * 16;
+  rc0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
+  rc1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
+  rc2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
+  rc3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
+  rd0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16));
+  rd1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16));
+  rd2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16));
+  rd3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16));
+
+  transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1);
+  transpose_4x4(rb0, rb1, rb2, rb3, tmp0, tmp1);
+  transpose_4x4(rc0, rc1, rc2, rc3, tmp0, tmp1);
+  transpose_4x4(rd0, rd1, rd2, rd3, tmp0, tmp1);
+
+  for (rk_end = rk + 32; rk < rk_end; rk += 4)
+    {
+      vector4x_u32 r4keys = vec_xl(0, rk);
+      ROUND8(0, ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3);
+      ROUND8(0, rc0, rc1, rc2, rc3, rd0, rd1, rd2, rd3);
+      ROUND8(1, ra1, ra2, ra3, ra0, rb1, rb2, rb3, rb0);
+      ROUND8(1, rc1, rc2, rc3, rc0, rd1, rd2, rd3, rd0);
+      ROUND8(2, ra2, ra3, ra0, ra1, rb2, rb3, rb0, rb1);
+      ROUND8(2, rc2, rc3, rc0, rc1, rd2, rd3, rd0, rd1);
+      ROUND8(3, ra3, ra0, ra1, ra2, rb3, rb0, rb1, rb2);
+      ROUND8(3, rc3, rc0, rc1, rc2, rd3, rd0, rd1, rd2);
+    }
+
+  transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1);
+  transpose_4x4(rb3, rb2, rb1, rb0, tmp0, tmp1);
+  transpose_4x4(rc3, rc2, rc1, rc0, tmp0, tmp1);
+  transpose_4x4(rd3, rd2, rd1, rd0, tmp0, tmp1);
+
+  vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb3), 0, out + 4 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb2), 0, out + 5 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb1), 0, out + 6 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb0), 0, out + 7 * 16);
+  out += 8 * 16;
+  vec_xst((vector16x_u8)vec_revb(rc3), 0, out + 0 * 16);
+  vec_xst((vector16x_u8)vec_revb(rc2), 0, out + 1 * 16);
+  vec_xst((vector16x_u8)vec_revb(rc1), 0, out + 2 * 16);
+  vec_xst((vector16x_u8)vec_revb(rc0), 0, out + 3 * 16);
+  vec_xst((vector16x_u8)vec_revb(rd3), 0, out + 4 * 16);
+  vec_xst((vector16x_u8)vec_revb(rd2), 0, out + 5 * 16);
+  vec_xst((vector16x_u8)vec_revb(rd1), 0, out + 6 * 16);
+  vec_xst((vector16x_u8)vec_revb(rd0), 0, out + 7 * 16);
+}
+
+static ASM_FUNC_ATTR_INLINE void
+sm4_ppc_crypt_blk8(u32 *rk, byte *out, const byte *in)
+{
+  vector4x_u32 ra0, ra1, ra2, ra3;
+  vector4x_u32 rb0, rb1, rb2, rb3;
+  vector4x_u32 tmp0, tmp1;
+  u32 *rk_end;
+
+  ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
+  ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
+  ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
+  ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
+  rb0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16));
+  rb1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16));
+  rb2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16));
+  rb3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16));
+
+  transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1);
+  transpose_4x4(rb0, rb1, rb2, rb3, tmp0, tmp1);
+
+  for (rk_end = rk + 32; rk < rk_end; rk += 4)
+    {
+      vector4x_u32 r4keys = vec_xl(0, rk);
+      ROUND8(0, ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3);
+      ROUND8(1, ra1, ra2, ra3, ra0, rb1, rb2, rb3, rb0);
+      ROUND8(2, ra2, ra3, ra0, ra1, rb2, rb3, rb0, rb1);
+      ROUND8(3, ra3, ra0, ra1, ra2, rb3, rb0, rb1, rb2);
+    }
+
+  transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1);
+  transpose_4x4(rb3, rb2, rb1, rb0, tmp0, tmp1);
+
+  vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb3), 0, out + 4 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb2), 0, out + 5 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb1), 0, out + 6 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb0), 0, out + 7 * 16);
+}
+
+static ASM_FUNC_ATTR_INLINE void
+sm4_ppc_crypt_blk1_4(u32 *rk, byte *out, const byte *in, size_t nblks)
+{
+  vector4x_u32 ra0, ra1, ra2, ra3;
+  vector4x_u32 tmp0, tmp1;
+  u32 *rk_end;
+
+  ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
+  ra1 = ra0;
+  ra2 = ra0;
+  ra3 = ra0;
+  if (LIKELY(nblks > 1))
+    ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
+  if (LIKELY(nblks > 2))
+    ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
+  if (LIKELY(nblks > 3))
+    ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
+
+  transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1);
+
+  for (rk_end = rk + 32; rk < rk_end; rk += 4)
+    {
+      vector4x_u32 r4keys = vec_xl(0, rk);
+      ROUND4(0, ra0, ra1, ra2, ra3);
+      ROUND4(1, ra1, ra2, ra3, ra0);
+      ROUND4(2, ra2, ra3, ra0, ra1);
+      ROUND4(3, ra3, ra0, ra1, ra2);
+    }
+
+  transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1);
+
+  vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16);
+  if (LIKELY(nblks > 1))
+    vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16);
+  if (LIKELY(nblks > 2))
+    vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16);
+  if (LIKELY(nblks > 3))
+    vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16);
+}
+
+static ASM_FUNC_ATTR_INLINE void
+sm4_ppc_crypt_blk1_16(u32 *rk, byte *out, const byte *in, size_t nblks)
+{
+  if (nblks >= 16)
+    {
+      sm4_ppc_crypt_blk16(rk, out, in);
+      return;
+    }
+
+  while (nblks >= 8)
+    {
+      sm4_ppc_crypt_blk8(rk, out, in);
+      in += 8 * 16;
+      out += 8 * 16;
+      nblks -= 8;
+    }
+
+  while (nblks)
+    {
+      size_t currblks = nblks > 4 ? 4 : nblks;
+      sm4_ppc_crypt_blk1_4(rk, out, in, currblks);
+      in += currblks * 16;
+      out += currblks * 16;
+      nblks -= currblks;
+    }
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P8 void
+_gcry_sm4_ppc8le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                               size_t nblks)
+{
+  sm4_ppc_crypt_blk1_16(rk, out, in, nblks);
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P9 void
+_gcry_sm4_ppc9le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                               size_t nblks)
+{
+#ifdef HAVE_FUNC_ATTR_TARGET
+  /* Inline for POWER9 target optimization. */
+  sm4_ppc_crypt_blk1_16(rk, out, in, nblks);
+#else
+  /* Target selecting not working, just call the other noinline function. */
+  _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, nblks);
+#endif
+}
+
+#endif /* ENABLE_PPC_CRYPTO_SUPPORT */
diff --git a/cipher/sm4.c b/cipher/sm4.c
index b0402b64..06b843f8 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -115,6 +115,14 @@
 # endif
 #endif
 
+#undef USE_PPC_CRYPTO
+#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+    defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+    defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+    !defined(WORDS_BIGENDIAN) && (__GNUC__ >= 4)
+# define USE_PPC_CRYPTO 1
+#endif
+
 static const char *sm4_selftest (void);
 
 static void _gcry_sm4_ctr_enc (void *context, unsigned char *ctr,
@@ -169,6 +177,10 @@ typedef struct
 #ifdef USE_ARM_SVE_CE
   unsigned int use_arm_sve_ce:1;
 #endif
+#ifdef USE_PPC_CRYPTO
+  unsigned int use_ppc8le:1;
+  unsigned int use_ppc9le:1;
+#endif
 } SM4_context;
 
 static const u32 fk[4] =
@@ -598,6 +610,28 @@ sm4_armv9_sve_ce_crypt_blk1_16(void *rk, byte *out, const byte *in,
 extern unsigned int _gcry_sm4_armv9_sve_get_vl(void);
 #endif /* USE_ARM_SVE_CE */
 
+#ifdef USE_PPC_CRYPTO
+extern void _gcry_sm4_ppc8le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                                           size_t num_blks);
+
+extern void _gcry_sm4_ppc9le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                                           size_t num_blks);
+
+static inline unsigned int
+sm4_ppc8le_crypt_blk1_16(void *rk, byte *out, const byte *in, size_t num_blks)
+{
+  _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, num_blks);
+  return 0;
+}
+
+static inline unsigned int
+sm4_ppc9le_crypt_blk1_16(void *rk, byte *out, const byte *in, size_t num_blks)
+{
+  _gcry_sm4_ppc9le_crypt_blk1_16(rk, out, in, num_blks);
+  return 0;
+}
+#endif /* USE_PPC_CRYPTO */
+
 static inline void prefetch_sbox_table(void)
 {
   const volatile byte *vtab = (void *)&sbox_table;
@@ -775,6 +809,10 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
   ctx->use_arm_sve_ce = (hwf & HWF_ARM_SVE2) && (hwf & HWF_ARM_SVESM4)
                         && _gcry_sm4_armv9_sve_get_vl() > 16;
 #endif
+#ifdef USE_PPC_CRYPTO
+  ctx->use_ppc8le = (hwf & HWF_PPC_VCRYPTO) != 0;
+  ctx->use_ppc9le = (hwf & HWF_PPC_VCRYPTO) && (hwf & HWF_PPC_ARCH_3_00);
+#endif
 
 #ifdef USE_GFNI_AVX2
   if (ctx->use_gfni_avx2)
@@ -1008,6 +1046,16 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx)
     {
       return &sm4_aarch64_crypt_blk1_16;
     }
+#endif
+#ifdef USE_PPC_CRYPTO
+  else if (ctx->use_ppc9le)
+    {
+      return &sm4_ppc9le_crypt_blk1_16;
+    }
+  else if (ctx->use_ppc8le)
+    {
+      return &sm4_ppc8le_crypt_blk1_16;
+    }
 #endif
   else
     {
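
For reference, the ROUND4/ROUND8 macros above vectorize the standard SM4
round: three state words are XORed with a broadcast round key, the result is
passed byte-wise through the SM4 S-box (realized here with the AES S-box
instruction vsbox bracketed by the pre_tf_*/post_tf_* affine filters), and the
linear diffusion is the XOR of rotations by 2, 10, 18 and 24 bits, which is
where the rotate2/rotate10/rotate18/rotate24 constants come from. A minimal
scalar sketch of one such round; the 256-byte S-box table (defined in
cipher/sm4.c and in the SM4 standard) is passed in by the caller and not
repeated here:

  #include <stdint.h>

  static uint32_t rol32(uint32_t x, unsigned int n)
  {
    return (x << n) | (x >> (32 - n));
  }

  /* One SM4 round on state words s[0..3] with round key rk.  This is the
   * per-block update that ROUND4 applies to four blocks and ROUND8 to
   * eight blocks per invocation. */
  static void sm4_round(uint32_t s[4], uint32_t rk, const uint8_t sbox[256])
  {
    uint32_t x = s[1] ^ s[2] ^ s[3] ^ rk;
    uint32_t t = ((uint32_t)sbox[(x >> 24) & 0xff] << 24)
               | ((uint32_t)sbox[(x >> 16) & 0xff] << 16)
               | ((uint32_t)sbox[(x >> 8) & 0xff] << 8)
               |  (uint32_t)sbox[x & 0xff];
    s[0] ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^ rol32(t, 24);
  }

The usual SM4 word rotation between rounds is handled by passing the state
registers to the macros in rotated order, e.g. ROUND8(1, ra1, ra2, ra3, ra0,
...), so no extra register moves are needed inside the key loop.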
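
The transpose_4x4 macro exists because the round macros want each vector
register to hold the same 32-bit word position from four different blocks.
After loading four blocks (one per register) and byte-swapping them with
vec_revb, a 4x4 word transpose produces that layout, and a second transpose
after the last round restores block order before the stores. A plain-C view
of the intended word shuffle (the macro performs the equivalent permutation
with vec_mergeh/vec_mergel on vector registers):

  #include <stdint.h>

  /* m[i][j] holds word j of block i on input and word i of block j on
   * output, i.e. a 4x4 transpose of 32-bit words. */
  static void transpose_4x4_words(uint32_t m[4][4])
  {
    for (int i = 0; i < 4; i++)
      for (int j = i + 1; j < 4; j++)
        {
          uint32_t t = m[i][j];
          m[i][j] = m[j][i];
          m[j][i] = t;
        }
  }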
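
Applications do not call _gcry_sm4_ppc8le_crypt_blk1_16 or
_gcry_sm4_ppc9le_crypt_blk1_16 directly: sm4_setkey() records the use_ppc8le
and use_ppc9le flags from HWF_PPC_VCRYPTO (POWER8) and HWF_PPC_ARCH_3_00
(POWER9), sm4_get_crypt_blk1_16_fn() picks the matching wrapper, and the
generic bulk code then feeds it up to 16 blocks per call. A usage sketch
through the public libgcrypt API, using ECB with a multi-block buffer as in
the benchmark above; the all-zero key and buffer are purely illustrative and
error checking is omitted:

  #include <gcrypt.h>
  #include <string.h>

  int main(void)
  {
    gcry_cipher_hd_t hd;
    unsigned char key[16] = { 0 };   /* illustrative all-zero key */
    unsigned char buf[16 * 16];      /* 16 SM4 blocks */

    memset(buf, 0, sizeof(buf));
    gcry_check_version(NULL);
    gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0);

    gcry_cipher_open(&hd, GCRY_CIPHER_SM4, GCRY_CIPHER_MODE_ECB, 0);
    gcry_cipher_setkey(hd, key, sizeof(key));
    /* In-place encryption; passing many blocks per call lets the bulk
     * path hand whole groups of blocks to the selected blk1_16 function
     * when libgcrypt is built with PPC crypto support and runs on
     * suitable hardware. */
    gcry_cipher_encrypt(hd, buf, sizeof(buf), NULL, 0);
    gcry_cipher_close(hd);
    return 0;
  }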