author     Jussi Kivilinna <jussi.kivilinna@iki.fi>    2013-12-17 15:35:38 +0200
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>    2013-12-18 17:00:04 +0200
commit     a5c2bbfe0db515d739ab683297903c77b1eec124
tree       ef6d9ba8d35b6e621aee58e91431d0fd446e940e /cipher/sha256.c
parent     e4e458465b124e25b6aec7a60174bf1ca32dc5fd
download   libgcrypt-a5c2bbfe0db515d739ab683297903c77b1eec124.tar.gz
Add AVX and AVX2/BMI implementations for SHA-256
* LICENSES: Add 'cipher/sha256-avx-amd64.S' and
'cipher/sha256-avx2-bmi2-amd64.S'.
* cipher/Makefile.am: Add 'sha256-avx-amd64.S' and
'sha256-avx2-bmi2-amd64.S'.
* cipher/sha256-avx-amd64.S: New.
* cipher/sha256-avx2-bmi2-amd64.S: New.
* cipher/sha256-ssse3-amd64.S: Use 'lea' instead of 'add' in a few
places for a tiny speed improvement.
* cipher/sha256.c (USE_AVX, USE_AVX2): New.
(SHA256_CONTEXT) [USE_AVX, USE_AVX2]: Add 'use_avx' and 'use_avx2'.
(sha256_init, sha224_init) [USE_AVX, USE_AVX2]: Initialize above
new context members.
[USE_AVX] (_gcry_sha256_transform_amd64_avx): New.
[USE_AVX2] (_gcry_sha256_transform_amd64_avx2): New.
(transform) [USE_AVX2]: Use AVX2 assembly if enabled.
(transform) [USE_AVX]: Use AVX assembly if enabled.
* configure.ac: Add 'sha256-avx-amd64.lo' and
'sha256-avx2-bmi2-amd64.lo'.
--
This patch adds fast AVX and AVX2/BMI2 implementations of SHA-256 by Intel
Corporation. The assembly source is licensed under the 3-clause BSD license
and is thus compatible with LGPL2.1+. The original source can be accessed at:
http://www.intel.com/p/en_US/embedded/hwsw/technology/packet-processing#docs
The implementation is described in the white paper
"Fast SHA-256 Implementations on Intel® Architecture Processors"
http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/sha-256-implementations-paper.html
Note: The AVX implementation uses the SHLD instruction to emulate ROR, since
SHLD is faster on Intel Sandy Bridge. However, on non-Intel CPUs SHLD is
much slower than ROR, so the AVX implementation is (for now) limited
to Intel CPUs.
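To illustrate the equivalence this note relies on (not part of the patch):
shifting a register left into itself with SHLD rotates it left, so a
right-rotate by n can be expressed as an SHLD by 32 - n. A minimal
standalone sketch, assuming GCC/Clang inline assembly on x86-64; the
function names are made up for illustration:

  #include <assert.h>
  #include <stdint.h>

  /* Portable rotate-right; compilers typically lower this to ROR. */
  static uint32_t ror32 (uint32_t x, unsigned n)
  {
    return (x >> n) | (x << (32 - n));
  }

  /* The same rotation via SHLD: shifting a register left into itself
   * by (32 - n) bits equals rotating it right by n.  Valid for
   * 0 < n < 32.  SHLD modifies the flags, hence the "cc" clobber. */
  static uint32_t ror32_shld (uint32_t x, unsigned n)
  {
    uint32_t r = x;
    __asm__ ("shld %%cl, %0, %0" : "+r" (r) : "c" (32 - n) : "cc");
    return r;
  }

  int main (void)
  {
    assert (ror32_shld (0x12345678u, 13) == ror32 (0x12345678u, 13));
    return 0;
  }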
Note: The AVX2 implementation also uses the BMI2 instruction RORX, so an
additional HWF flag (HWF_INTEL_BMI2) is checked.
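For comparison, a sketch of the RORX form referred to above: a
non-destructive rotate-right by an immediate that leaves the flags
untouched. Illustrative only; it assumes a BMI2-capable CPU (Haswell or
later) and a hypothetical function name:

  #include <assert.h>
  #include <stdint.h>

  /* Reference rotate-right. */
  static uint32_t ror32 (uint32_t x, unsigned n)
  {
    return (x >> n) | (x << (32 - n));
  }

  /* RORX (BMI2): rotate-right by an immediate into a separate
   * destination register.  It sets no flags (so no "cc" clobber) and
   * leaves the source intact.  SHA-256 only needs fixed rotation
   * counts (2, 13, 22, 6, 11, 25), so immediates fit naturally. */
  static uint32_t ror32_13_rorx (uint32_t x)
  {
    uint32_t r;
    __asm__ ("rorxl $13, %1, %0" : "=r" (r) : "r" (x));
    return r;
  }

  int main (void)
  {
    /* Only run on a BMI2-capable CPU. */
    assert (ror32_13_rorx (0xcafebabeu) == ror32 (0xcafebabeu, 13));
    return 0;
  }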
Benchmarks:

  cpu               C-lang      SSSE3       AVX/AVX2    C vs        SSSE3 vs
                                                        AVX/AVX2    AVX/AVX2
  Intel i5-4570     13.86 c/B   10.27 c/B    8.70 c/B   1.59x       1.18x
  Intel i5-2450M    17.25 c/B   12.36 c/B   10.31 c/B   1.67x       1.19x
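(The speedup columns are simply ratios of the cycles-per-byte figures; e.g.
for the i5-4570, 13.86 / 8.70 ≈ 1.59x for C vs. AVX/AVX2 and
10.27 / 8.70 ≈ 1.18x for SSSE3 vs. AVX/AVX2.)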
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/sha256.c')
 cipher/sha256.c | 68
 1 file changed, 66 insertions(+), 2 deletions(-)
diff --git a/cipher/sha256.c b/cipher/sha256.c
index c2045b8f..601e9c0b 100644
--- a/cipher/sha256.c
+++ b/cipher/sha256.c
@@ -55,6 +55,22 @@
 # define USE_SSSE3 1
 #endif
 
+/* USE_AVX indicates whether to compile with Intel AVX code. */
+#undef USE_AVX
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
+# define USE_AVX 1
+#endif
+
+/* USE_AVX2 indicates whether to compile with Intel AVX2/BMI2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
+# define USE_AVX2 1
+#endif
+
 
 typedef struct {
   gcry_md_block_ctx_t bctx;
@@ -62,6 +78,12 @@ typedef struct {
 #ifdef USE_SSSE3
   unsigned int use_ssse3:1;
 #endif
+#ifdef USE_AVX
+  unsigned int use_avx:1;
+#endif
+#ifdef USE_AVX2
+  unsigned int use_avx2:1;
+#endif
 } SHA256_CONTEXT;
 
 
@@ -73,6 +95,7 @@ static void
 sha256_init (void *context)
 {
   SHA256_CONTEXT *hd = context;
+  unsigned int features = _gcry_get_hw_features ();
 
   hd->h0 = 0x6a09e667;
   hd->h1 = 0xbb67ae85;
@@ -90,8 +113,17 @@ sha256_init (void *context)
   hd->bctx.bwrite = transform;
 
 #ifdef USE_SSSE3
-  hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0;
+  hd->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
+#endif
+#ifdef USE_AVX
+  /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs.
+   * Therefore use this implementation on Intel CPUs only. */
+  hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU);
+#endif
+#ifdef USE_AVX2
+  hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2);
 #endif
+
+  (void)features;
 }
 
 
@@ -99,6 +131,7 @@ static void
 sha224_init (void *context)
 {
   SHA256_CONTEXT *hd = context;
+  unsigned int features = _gcry_get_hw_features ();
 
   hd->h0 = 0xc1059ed8;
   hd->h1 = 0x367cd507;
@@ -116,8 +149,17 @@ sha224_init (void *context)
   hd->bctx.bwrite = transform;
 
 #ifdef USE_SSSE3
-  hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0;
+  hd->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
 #endif
+#ifdef USE_AVX
+  /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs.
+   * Therefore use this implementation on Intel CPUs only. */
+  hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU);
+#endif
+#ifdef USE_AVX2
+  hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2);
+#endif
+
+  (void)features;
 }
 
 
@@ -281,6 +323,16 @@
 unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data,
                                                 u32 state[8], size_t num_blks);
 #endif
 
+#ifdef USE_AVX
+unsigned int _gcry_sha256_transform_amd64_avx(const void *input_data,
+                                              u32 state[8], size_t num_blks);
+#endif
+
+#ifdef USE_AVX2
+unsigned int _gcry_sha256_transform_amd64_avx2(const void *input_data,
+                                               u32 state[8], size_t num_blks);
+#endif
+
 static unsigned int
 transform (void *ctx, const unsigned char *data, size_t nblks)
@@ -288,6 +340,18 @@ transform (void *ctx, const unsigned char *data, size_t nblks)
   SHA256_CONTEXT *hd = ctx;
   unsigned int burn;
 
+#ifdef USE_AVX2
+  if (hd->use_avx2)
+    return _gcry_sha256_transform_amd64_avx2 (data, &hd->h0, nblks)
+           + 4 * sizeof(void*);
+#endif
+
+#ifdef USE_AVX
+  if (hd->use_avx)
+    return _gcry_sha256_transform_amd64_avx (data, &hd->h0, nblks)
+           + 4 * sizeof(void*);
+#endif
+
 #ifdef USE_SSSE3
   if (hd->use_ssse3)
     return _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, nblks)
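For context (not part of the commit), a minimal sketch of how a caller
reaches this dispatch through libgcrypt's public message-digest API,
assuming a libgcrypt build that includes this patch: sha256_init() runs
inside gcry_md_open() and consults the HWF flags gathered at library
initialization.

  #include <stdio.h>
  #include <gcrypt.h>

  int main (void)
  {
    gcry_md_hd_t hd;
    const unsigned char *digest;
    unsigned int i, dlen;

    /* Library initialization; hardware feature detection (the HWF_*
     * flags consulted by sha256_init above) happens here. */
    if (!gcry_check_version (GCRYPT_VERSION))
      return 1;
    gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

    /* gcry_md_open() calls sha256_init, which picks the SSSE3, AVX
     * or AVX2 transform based on the detected CPU. */
    if (gcry_md_open (&hd, GCRY_MD_SHA256, 0))
      return 1;
    gcry_md_write (hd, "abc", 3);
    digest = gcry_md_read (hd, GCRY_MD_SHA256);

    /* SHA-256("abc") is the FIPS test vector:
     * ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad */
    dlen = gcry_md_get_algo_dlen (GCRY_MD_SHA256);
    for (i = 0; i < dlen; i++)
      printf ("%02x", digest[i]);
    putchar ('\n');

    gcry_md_close (hd);
    return 0;
  }

Whichever transform runs, the digest is identical; only the cycles/byte
figures in the benchmark table above should differ.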