From 0b2da804ee813eee22c386ba7f253415103b34ea Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Wed, 1 Mar 2023 21:03:09 +0200
Subject: Add PowerPC vector implementation of SM4

* cipher/Makefile.am: Add 'sm4-ppc.c'.
* cipher/sm4-ppc.c: New.
* cipher/sm4.c (USE_PPC_CRYPTO): New.
(SM4_context): Add 'use_ppc8le' and 'use_ppc9le'.
[USE_PPC_CRYPTO] (_gcry_sm4_ppc8le_crypt_blk1_16)
(_gcry_sm4_ppc9le_crypt_blk1_16, sm4_ppc8le_crypt_blk1_16)
(sm4_ppc9le_crypt_blk1_16): New.
(sm4_setkey) [USE_PPC_CRYPTO]: Set use_ppc8le and use_ppc9le based on
HW features.
(sm4_get_crypt_blk1_16_fn) [USE_PPC_CRYPTO]: Add PowerPC implementation
selection.
--

Benchmark on POWER9:

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     14.47 ns/B     65.89 MiB/s     33.29 c/B
        ECB dec |     14.47 ns/B     65.89 MiB/s     33.29 c/B
        CBC enc |     35.09 ns/B     27.18 MiB/s     80.71 c/B
        CBC dec |     16.69 ns/B     57.13 MiB/s     38.39 c/B
        CFB enc |     35.09 ns/B     27.18 MiB/s     80.71 c/B
        CFB dec |     16.76 ns/B     56.90 MiB/s     38.55 c/B
        CTR enc |     16.88 ns/B     56.50 MiB/s     38.82 c/B
        CTR dec |     16.88 ns/B     56.50 MiB/s     38.82 c/B

After (ECB ~4.4x faster):
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      3.26 ns/B     292.3 MiB/s      7.50 c/B
        ECB dec |      3.26 ns/B     292.3 MiB/s      7.50 c/B
        CBC enc |     35.10 ns/B     27.17 MiB/s     80.72 c/B
        CBC dec |      3.33 ns/B     286.3 MiB/s      7.66 c/B
        CFB enc |     35.10 ns/B     27.17 MiB/s     80.74 c/B
        CFB dec |      3.36 ns/B     283.8 MiB/s      7.73 c/B
        CTR enc |      3.47 ns/B     275.0 MiB/s      7.98 c/B
        CTR dec |      3.47 ns/B     275.0 MiB/s      7.98 c/B

Signed-off-by: Jussi Kivilinna
---
 cipher/Makefile.am |   7 ++
 cipher/sm4-ppc.c   | 342 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 cipher/sm4.c       |  48 ++++++++
 3 files changed, 397 insertions(+)
 create mode 100644 cipher/sm4-ppc.c

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index dcaa68bb..cf1fbe85 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -123,6 +123,7 @@ EXTRA_libcipher_la_SOURCES = \
 	sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S \
 	sm4-gfni-avx2-amd64.S sm4-gfni-avx512-amd64.S \
 	sm4-aarch64.S sm4-armv8-aarch64-ce.S sm4-armv9-aarch64-sve-ce.S \
+	sm4-ppc.c \
 	serpent-avx2-amd64.S serpent-armv7-neon.S \
 	sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
 	sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
@@ -309,3 +310,9 @@ camellia-aarch64-ce.o: $(srcdir)/camellia-aarch64-ce.c Makefile
 
 camellia-aarch64-ce.lo: $(srcdir)/camellia-aarch64-ce.c Makefile
 	`echo $(LTCOMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) `
+
+sm4-ppc.o: $(srcdir)/sm4-ppc.c Makefile
+	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+sm4-ppc.lo: $(srcdir)/sm4-ppc.c Makefile
+	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
diff --git a/cipher/sm4-ppc.c b/cipher/sm4-ppc.c
new file mode 100644
index 00000000..bb2c55e0
--- /dev/null
+++ b/cipher/sm4-ppc.c
@@ -0,0 +1,342 @@
+/* sm4-ppc.c - PowerPC implementation of SM4 cipher
+ *
+ * Copyright (C) 2023 Jussi Kivilinna
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+    defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+    defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+    !defined(WORDS_BIGENDIAN) && (__GNUC__ >= 4)
+
+#include <altivec.h>
+#include "bufhelp.h"
+
+typedef vector unsigned char vector16x_u8;
+typedef vector unsigned int vector4x_u32;
+typedef vector unsigned long long vector2x_u64;
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET)
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("arch=pwr8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("arch=pwr9")))
+# define HAVE_FUNC_ATTR_TARGET 1
+#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET)
+# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8")))
+# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9")))
+# define HAVE_FUNC_ATTR_TARGET 1
+#else
+# define FUNC_ATTR_TARGET_P8
+# define FUNC_ATTR_TARGET_P9
+# undef HAVE_FUNC_ATTR_TARGET
+#endif
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR          NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+#ifdef __clang__
+/* clang has mismatching prototype for vec_sbox_be. */
+static ASM_FUNC_ATTR_INLINE vector16x_u8
+asm_sbox_be(vector16x_u8 b)
+{
+  vector16x_u8 o;
+  __asm__ ("vsbox %0, %1\n\t" : "=v" (o) : "v" (b));
+  return o;
+}
+#undef vec_sbox_be
+#define vec_sbox_be asm_sbox_be
+#endif /* __clang__ */
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+  t2 = (vector4x_u32)vec_mergel((vector4x_u32)x0, (vector4x_u32)x1); \
+  x0 = (vector4x_u32)vec_mergeh((vector4x_u32)x0, (vector4x_u32)x1); \
+  \
+  t1 = (vector4x_u32)vec_mergeh((vector4x_u32)x2, (vector4x_u32)x3); \
+  x2 = (vector4x_u32)vec_mergel((vector4x_u32)x2, (vector4x_u32)x3); \
+  \
+  x1 = (vector4x_u32)vec_mergel((vector2x_u64)x0, (vector2x_u64)t1); \
+  x0 = (vector4x_u32)vec_mergeh((vector2x_u64)x0, (vector2x_u64)t1); \
+  \
+  x3 = (vector4x_u32)vec_mergel((vector2x_u64)t2, (vector2x_u64)x2); \
+  x2 = (vector4x_u32)vec_mergeh((vector2x_u64)t2, (vector2x_u64)x2);
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) ({ \
+  tmp0 = x & mask4bit; \
+  x = (vector4x_u32)((vector16x_u8)x >> 4); \
+  \
+  tmp0 = (vector4x_u32)vec_perm((vector16x_u8)lo_t, (vector16x_u8)lo_t, \
+                                (vector16x_u8)tmp0); \
+  x = (vector4x_u32)vec_perm((vector16x_u8)hi_t, (vector16x_u8)hi_t, \
+                             (vector16x_u8)x); \
+  x = x ^ tmp0; \
+  })
+
+#define GET_RKEY(round) vec_splat(r4keys, round)
+
+#define ROUND4(round, s0, s1, s2, s3) ({ \
+  vector4x_u32 rkey = GET_RKEY(round); \
+  vector4x_u32 rx0 = rkey ^ s1 ^ s2 ^ s3; \
+  filter_8bit(rx0, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \
+  rx0 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx0); \
+  filter_8bit(rx0, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \
+  s0 ^= rx0 ^ vec_rl(rx0, rotate2) ^ vec_rl(rx0, rotate10) ^ \
+        vec_rl(rx0, rotate18) ^ vec_rl(rx0, rotate24); \
+  })
+
+#define ROUND8(round, s0, s1, s2, s3, r0, r1, r2, r3) ({ \
+  vector4x_u32 rkey = GET_RKEY(round); \
+  vector4x_u32 rx0 = rkey ^ s1 ^ s2 ^ s3; \
+  vector4x_u32 rx1 = rkey ^ r1 ^ r2 ^ r3; \
+  filter_8bit(rx0, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \
+  filter_8bit(rx1, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \
+  rx0 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx0); \
+  rx1 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx1); \
+  filter_8bit(rx0, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \
+  filter_8bit(rx1, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \
+  s0 ^= rx0 ^ vec_rl(rx0, rotate2) ^ vec_rl(rx0, rotate10) ^ \
+        vec_rl(rx0, rotate18) ^ vec_rl(rx0, rotate24); \
+  r0 ^= rx1 ^ vec_rl(rx1, rotate2) ^ vec_rl(rx1, rotate10) ^ \
+        vec_rl(rx1, rotate18) ^ vec_rl(rx1, rotate24); \
+  })
+
+static const vector4x_u32 mask_0f =
+  { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f };
+static const vector2x_u64 pre_tf_lo_s =
+  { 0x9096E3E575730600ULL, 0xC6C0B5B323255056ULL };
+static const vector2x_u64 pre_tf_hi_s =
+  { 0xE341AA08EA48A301ULL, 0xF153B81AF85AB113ULL };
+static const vector2x_u64 post_tf_lo_s =
+  { 0x6F53C6FA95A93C00ULL, 0xD9E5704C231F8AB6ULL };
+static const vector2x_u64 post_tf_hi_s =
+  { 0x9A4635E9479BE834ULL, 0x25F98A56F824578BULL };
+static const vector4x_u32 rotate2 = { 2, 2, 2, 2 };
+static const vector4x_u32 rotate10 = { 10, 10, 10, 10 };
+static const vector4x_u32 rotate18 = { 18, 18, 18, 18 };
+static const vector4x_u32 rotate24 = { 24, 24, 24, 24 };
+
+static ASM_FUNC_ATTR_INLINE void
+sm4_ppc_crypt_blk16(u32 *rk, byte *out, const byte *in)
+{
+  vector4x_u32 ra0, ra1, ra2, ra3;
+  vector4x_u32 rb0, rb1, rb2, rb3;
+  vector4x_u32 rc0, rc1, rc2, rc3;
+  vector4x_u32 rd0, rd1, rd2, rd3;
+  vector4x_u32 tmp0, tmp1;
+  u32 *rk_end;
+
+  ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
+  ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
+  ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
+  ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
+  rb0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16));
+  rb1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16));
+  rb2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16));
+  rb3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16));
+  in += 8 * 16;
+  rc0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
+  rc1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
+  rc2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
+  rc3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
+  rd0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16));
+  rd1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16));
+  rd2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16));
+  rd3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16));
+
+  transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1);
+  transpose_4x4(rb0, rb1, rb2, rb3, tmp0, tmp1);
+  transpose_4x4(rc0, rc1, rc2, rc3, tmp0, tmp1);
+  transpose_4x4(rd0, rd1, rd2, rd3, tmp0, tmp1);
+
+  for (rk_end = rk + 32; rk < rk_end; rk += 4)
+    {
+      vector4x_u32 r4keys = vec_xl(0, rk);
+      ROUND8(0, ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3);
+      ROUND8(0, rc0, rc1, rc2, rc3, rd0, rd1, rd2, rd3);
+      ROUND8(1, ra1, ra2, ra3, ra0, rb1, rb2, rb3, rb0);
+      ROUND8(1, rc1, rc2, rc3, rc0, rd1, rd2, rd3, rd0);
+      ROUND8(2, ra2, ra3, ra0, ra1, rb2, rb3, rb0, rb1);
+      ROUND8(2, rc2, rc3, rc0, rc1, rd2, rd3, rd0, rd1);
+      ROUND8(3, ra3, ra0, ra1, ra2, rb3, rb0, rb1, rb2);
+      ROUND8(3, rc3, rc0, rc1, rc2, rd3, rd0, rd1, rd2);
+    }
+
+  transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1);
+  transpose_4x4(rb3, rb2, rb1, rb0, tmp0, tmp1);
+  transpose_4x4(rc3, rc2, rc1, rc0, tmp0, tmp1);
+  transpose_4x4(rd3, rd2, rd1, rd0, tmp0, tmp1);
+
+  vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb3), 0, out + 4 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb2), 0, out + 5 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb1), 0, out + 6 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb0), 0, out + 7 * 16);
+  out += 8 * 16;
+  vec_xst((vector16x_u8)vec_revb(rc3), 0, out + 0 * 16);
+  vec_xst((vector16x_u8)vec_revb(rc2), 0, out + 1 * 16);
+  vec_xst((vector16x_u8)vec_revb(rc1), 0, out + 2 * 16);
+  vec_xst((vector16x_u8)vec_revb(rc0), 0, out + 3 * 16);
+  vec_xst((vector16x_u8)vec_revb(rd3), 0, out + 4 * 16);
+  vec_xst((vector16x_u8)vec_revb(rd2), 0, out + 5 * 16);
+  vec_xst((vector16x_u8)vec_revb(rd1), 0, out + 6 * 16);
+  vec_xst((vector16x_u8)vec_revb(rd0), 0, out + 7 * 16);
+}
+
+static ASM_FUNC_ATTR_INLINE void
+sm4_ppc_crypt_blk8(u32 *rk, byte *out, const byte *in)
+{
+  vector4x_u32 ra0, ra1, ra2, ra3;
+  vector4x_u32 rb0, rb1, rb2, rb3;
+  vector4x_u32 tmp0, tmp1;
+  u32 *rk_end;
+
+  ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
+  ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
+  ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
+  ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
+  rb0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16));
+  rb1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16));
+  rb2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16));
+  rb3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16));
+
+  transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1);
+  transpose_4x4(rb0, rb1, rb2, rb3, tmp0, tmp1);
+
+  for (rk_end = rk + 32; rk < rk_end; rk += 4)
+    {
+      vector4x_u32 r4keys = vec_xl(0, rk);
+      ROUND8(0, ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3);
+      ROUND8(1, ra1, ra2, ra3, ra0, rb1, rb2, rb3, rb0);
+      ROUND8(2, ra2, ra3, ra0, ra1, rb2, rb3, rb0, rb1);
+      ROUND8(3, ra3, ra0, ra1, ra2, rb3, rb0, rb1, rb2);
+    }
+
+  transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1);
+  transpose_4x4(rb3, rb2, rb1, rb0, tmp0, tmp1);
+
+  vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16);
+  vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb3), 0, out + 4 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb2), 0, out + 5 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb1), 0, out + 6 * 16);
+  vec_xst((vector16x_u8)vec_revb(rb0), 0, out + 7 * 16);
+}
+
+static ASM_FUNC_ATTR_INLINE void
+sm4_ppc_crypt_blk1_4(u32 *rk, byte *out, const byte *in, size_t nblks)
+{
+  vector4x_u32 ra0, ra1, ra2, ra3;
+  vector4x_u32 tmp0, tmp1;
+  u32 *rk_end;
+
+  ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
+  ra1 = ra0;
+  ra2 = ra0;
+  ra3 = ra0;
+  if (LIKELY(nblks > 1))
+    ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
+  if (LIKELY(nblks > 2))
+    ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
+  if (LIKELY(nblks > 3))
+    ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
+
+  transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1);
+
+  for (rk_end = rk + 32; rk < rk_end; rk += 4)
+    {
+      vector4x_u32 r4keys = vec_xl(0, rk);
+      ROUND4(0, ra0, ra1, ra2, ra3);
+      ROUND4(1, ra1, ra2, ra3, ra0);
+      ROUND4(2, ra2, ra3, ra0, ra1);
+      ROUND4(3, ra3, ra0, ra1, ra2);
+    }
+
+  transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1);
+
+  vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16);
+  if (LIKELY(nblks > 1))
+    vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16);
+  if (LIKELY(nblks > 2))
+    vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16);
+  if (LIKELY(nblks > 3))
+    vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16);
+}
+
+static ASM_FUNC_ATTR_INLINE void
+sm4_ppc_crypt_blk1_16(u32 *rk, byte *out, const byte *in, size_t nblks)
+{
+  if (nblks >= 16)
+    {
+      sm4_ppc_crypt_blk16(rk, out, in);
+      return;
+    }
+
+  while (nblks >= 8)
+    {
+      sm4_ppc_crypt_blk8(rk, out, in);
+      in += 8 * 16;
+      out += 8 * 16;
+      nblks -= 8;
+    }
+
+  while (nblks)
+    {
+      size_t currblks = nblks > 4 ? 4 : nblks;
+      sm4_ppc_crypt_blk1_4(rk, out, in, currblks);
+      in += currblks * 16;
+      out += currblks * 16;
+      nblks -= currblks;
+    }
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P8 void
+_gcry_sm4_ppc8le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                               size_t nblks)
+{
+  sm4_ppc_crypt_blk1_16(rk, out, in, nblks);
+}
+
+ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P9 void
+_gcry_sm4_ppc9le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                               size_t nblks)
+{
+#ifdef HAVE_FUNC_ATTR_TARGET
+  /* Inline for POWER9 target optimization. */
+  sm4_ppc_crypt_blk1_16(rk, out, in, nblks);
+#else
+  /* Target selecting not working, just call the other noinline function. */
+  _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, nblks);
+#endif
+}
+
+#endif /* ENABLE_PPC_CRYPTO_SUPPORT */
diff --git a/cipher/sm4.c b/cipher/sm4.c
index b0402b64..06b843f8 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -115,6 +115,14 @@
 # endif
 #endif
 
+#undef USE_PPC_CRYPTO
+#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
+    defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+    defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
+    !defined(WORDS_BIGENDIAN) && (__GNUC__ >= 4)
+# define USE_PPC_CRYPTO 1
+#endif
+
 static const char *sm4_selftest (void);
 
 static void _gcry_sm4_ctr_enc (void *context, unsigned char *ctr,
@@ -169,6 +177,10 @@ typedef struct
 #ifdef USE_ARM_SVE_CE
   unsigned int use_arm_sve_ce:1;
 #endif
+#ifdef USE_PPC_CRYPTO
+  unsigned int use_ppc8le:1;
+  unsigned int use_ppc9le:1;
+#endif
 } SM4_context;
 
 static const u32 fk[4] =
@@ -598,6 +610,28 @@ sm4_armv9_sve_ce_crypt_blk1_16(void *rk, byte *out, const byte *in,
 extern unsigned int _gcry_sm4_armv9_sve_get_vl(void);
 #endif /* USE_ARM_SVE_CE */
 
+#ifdef USE_PPC_CRYPTO
+extern void _gcry_sm4_ppc8le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                                           size_t num_blks);
+
+extern void _gcry_sm4_ppc9le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
+                                           size_t num_blks);
+
+static inline unsigned int
+sm4_ppc8le_crypt_blk1_16(void *rk, byte *out, const byte *in, size_t num_blks)
+{
+  _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, num_blks);
+  return 0;
+}
+
+static inline unsigned int
+sm4_ppc9le_crypt_blk1_16(void *rk, byte *out, const byte *in, size_t num_blks)
+{
+  _gcry_sm4_ppc9le_crypt_blk1_16(rk, out, in, num_blks);
+  return 0;
+}
+#endif /* USE_PPC_CRYPTO */
+
 static inline void prefetch_sbox_table(void)
 {
   const volatile byte *vtab = (void *)&sbox_table;
@@ -775,6 +809,10 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
   ctx->use_arm_sve_ce = (hwf & HWF_ARM_SVE2) && (hwf & HWF_ARM_SVESM4)
                         && _gcry_sm4_armv9_sve_get_vl() > 16;
 #endif
+#ifdef USE_PPC_CRYPTO
+  ctx->use_ppc8le = (hwf & HWF_PPC_VCRYPTO) != 0;
+  ctx->use_ppc9le = (hwf & HWF_PPC_VCRYPTO) && (hwf & HWF_PPC_ARCH_3_00);
+#endif
 
 #ifdef USE_GFNI_AVX2
   if (ctx->use_gfni_avx2)
@@ -1008,6 +1046,16 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx)
     {
       return &sm4_aarch64_crypt_blk1_16;
     }
+#endif
+#ifdef USE_PPC_CRYPTO
+  else if (ctx->use_ppc9le)
+    {
+      return &sm4_ppc9le_crypt_blk1_16;
+    }
+  else if (ctx->use_ppc8le)
+    {
+      return &sm4_ppc8le_crypt_blk1_16;
+    }
 #endif
   else
     {
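
For reference, the ROUND4/ROUND8 macros above vectorize the standard SM4
round: three state words are XORed with a broadcast round key, the result is
passed byte-wise through the SM4 S-box (realized here with the AES S-box
instruction vsbox bracketed by the pre_tf_*/post_tf_* affine filters), and the
linear diffusion is the XOR of rotations by 2, 10, 18 and 24 bits, which is
where the rotate2/rotate10/rotate18/rotate24 constants come from. A minimal
scalar sketch of one such round; the 256-byte S-box table (defined in
cipher/sm4.c and in the SM4 standard) is passed in by the caller and not
repeated here:

  #include <stdint.h>

  static uint32_t rol32(uint32_t x, unsigned int n)
  {
    return (x << n) | (x >> (32 - n));
  }

  /* One SM4 round on state words s[0..3] with round key rk.  This is the
   * per-block update that ROUND4 applies to four blocks and ROUND8 to
   * eight blocks per invocation. */
  static void sm4_round(uint32_t s[4], uint32_t rk, const uint8_t sbox[256])
  {
    uint32_t x = s[1] ^ s[2] ^ s[3] ^ rk;
    uint32_t t = ((uint32_t)sbox[(x >> 24) & 0xff] << 24)
               | ((uint32_t)sbox[(x >> 16) & 0xff] << 16)
               | ((uint32_t)sbox[(x >> 8) & 0xff] << 8)
               |  (uint32_t)sbox[x & 0xff];
    s[0] ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^ rol32(t, 24);
  }

The usual SM4 word rotation between rounds is handled by passing the state
registers to the macros in rotated order, e.g. ROUND8(1, ra1, ra2, ra3, ra0,
...), so no extra register moves are needed inside the key loop.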
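
The transpose_4x4 macro exists because the round macros want each vector
register to hold the same 32-bit word position from four different blocks.
After loading four blocks (one per register) and byte-swapping them with
vec_revb, a 4x4 word transpose produces that layout, and a second transpose
after the last round restores block order before the stores. A plain-C view
of the intended word shuffle (the macro performs the equivalent permutation
with vec_mergeh/vec_mergel on vector registers):

  #include <stdint.h>

  /* m[i][j] holds word j of block i on input and word i of block j on
   * output, i.e. a 4x4 transpose of 32-bit words. */
  static void transpose_4x4_words(uint32_t m[4][4])
  {
    for (int i = 0; i < 4; i++)
      for (int j = i + 1; j < 4; j++)
        {
          uint32_t t = m[i][j];
          m[i][j] = m[j][i];
          m[j][i] = t;
        }
  }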
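
Applications do not call _gcry_sm4_ppc8le_crypt_blk1_16 or
_gcry_sm4_ppc9le_crypt_blk1_16 directly: sm4_setkey() records the use_ppc8le
and use_ppc9le flags from HWF_PPC_VCRYPTO (POWER8) and HWF_PPC_ARCH_3_00
(POWER9), sm4_get_crypt_blk1_16_fn() picks the matching wrapper, and the
generic bulk code then feeds it up to 16 blocks per call. A usage sketch
through the public libgcrypt API, using ECB with a multi-block buffer as in
the benchmark above; the all-zero key and buffer are purely illustrative and
error checking is omitted:

  #include <gcrypt.h>
  #include <string.h>

  int main(void)
  {
    gcry_cipher_hd_t hd;
    unsigned char key[16] = { 0 };   /* illustrative all-zero key */
    unsigned char buf[16 * 16];      /* 16 SM4 blocks */

    memset(buf, 0, sizeof(buf));
    gcry_check_version(NULL);
    gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0);

    gcry_cipher_open(&hd, GCRY_CIPHER_SM4, GCRY_CIPHER_MODE_ECB, 0);
    gcry_cipher_setkey(hd, key, sizeof(key));
    /* In-place encryption; passing many blocks per call lets the bulk
     * path hand whole groups of blocks to the selected blk1_16 function
     * when libgcrypt is built with PPC crypto support and runs on
     * suitable hardware. */
    gcry_cipher_encrypt(hd, buf, sizeof(buf), NULL, 0);
    gcry_cipher_close(hd);
    return 0;
  }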