SHA-512: Add AVX and AVX2 implementations for x86-64

* cipher/Makefile.am: Add 'sha512-avx-amd64.S' and 'sha512-avx2-bmi2-amd64.S'. * cipher/sha512-avx-amd64.S: New. * cipher/sha512-avx2-bmi2-amd64.S: New. * cipher/sha512.c (USE_AVX, USE_AVX2): New. (SHA512_CONTEXT) [USE_AVX]: Add 'use_avx'. (SHA512_CONTEXT) [USE_AVX2]: Add 'use_avx2'. (sha512_init, sha384_init) [USE_AVX]: Initialize 'use_avx'. (sha512_init, sha384_init) [USE_AVX2]: Initialize 'use_avx2'. [USE_AVX] (_gcry_sha512_transform_amd64_avx): New. [USE_AVX2] (_gcry_sha512_transform_amd64_avx2): New. (transform) [USE_AVX2]: Add call for AVX2 implementation. (transform) [USE_AVX]: Add call for AVX implementation. * configure.ac (HAVE_GCC_INLINE_ASM_BMI2): New check. (sha512): Add 'sha512-avx-amd64.lo' and 'sha512-avx2-bmi2-amd64.lo'. * doc/gcrypt.texi: Document 'intel-cpu' and 'intel-bmi2'. * src/g10lib.h (HWF_INTEL_CPU, HWF_INTEL_BMI2): New. * src/hwfeatures.c (hwflist): Add "intel-cpu" and "intel-bmi2". * src/hwf-x86.c (detect_x86_gnuc): Check for HWF_INTEL_CPU and HWF_INTEL_BMI2. -- Patch adds fast AVX and AVX2 implementations of SHA-512 by Intel Corporation. The assembly source is licensed under the 3-clause BSD license and is thus compatible with LGPL2.1+. Original source can be accessed at: http://www.intel.com/p/en_US/embedded/hwsw/technology/packet-processing#docs Implementation is described in the white paper "Fast SHA512 Implementations on Intel® Architecture Processors" http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/fast-sha512-implementat$ Note: The AVX implementation uses the SHLD instruction to emulate RORQ, since it's faster on Intel Sandy-Bridge. However, on non-Intel CPUs SHLD is much slower than RORQ, so the AVX implementation is (for now) limited to Intel CPUs. Note: The AVX2 implementation also uses the BMI2 instruction rorx, thus the additional HWF flag. 
Benchmarks:

cpu              Old         SSSE3       AVX/AVX2    Old vs AVX/AVX2   vs SSSE3
Intel i5-4570    10.11 c/B   7.56 c/B    6.72 c/B    1.50x             1.12x
Intel i5-2450M   14.11 c/B   10.53 c/B   8.88 c/B    1.58x             1.18x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2013-12-12 13:56:13 +0200
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2013-12-13 00:13:12 +0200
commit: 2e4253dc8eb512cd0e807360926dc6ba912c95b4 (patch)
tree: 6ea95c8c04aa2593ba8eb5b83ec3ad6b70121918 /cipher/sha512.c
parent: 69a6d0f9562fcd26112a589318c13de66ce1700e (diff)
download: libgcrypt-2e4253dc8eb512cd0e807360926dc6ba912c95b4.tar.gz
1 files changed, 68 insertions, 4 deletions
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 34b3215c..586c8097 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -74,6 +74,24 @@
 #endif
 
 
+/* USE_AVX indicates whether to compile with Intel AVX code. */
+#undef USE_AVX
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
+# define USE_AVX 1
+#endif
+
+
+/* USE_AVX2 indicates whether to compile with Intel AVX2/rorx code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
+# define USE_AVX2 1
+#endif
+
+
 typedef struct
 {
   u64 h0, h1, h2, h3, h4, h5, h6, h7;
@@ -89,6 +107,12 @@ typedef struct
 #ifdef USE_SSSE3
   unsigned int use_ssse3:1;
 #endif
+#ifdef USE_AVX
+  unsigned int use_avx:1;
+#endif
+#ifdef USE_AVX2
+  unsigned int use_avx2:1;
+#endif
 } SHA512_CONTEXT;
 
 static unsigned int
@@ -99,6 +123,7 @@ sha512_init (void *context)
 {
   SHA512_CONTEXT *ctx = context;
   SHA512_STATE *hd = &ctx->state;
+  unsigned int features = _gcry_get_hw_features ();
 
   hd->h0 = U64_C(0x6a09e667f3bcc908);
   hd->h1 = U64_C(0xbb67ae8584caa73b);
@@ -116,11 +141,19 @@ sha512_init (void *context)
   ctx->bctx.bwrite = transform;
 
 #ifdef USE_ARM_NEON_ASM
-  ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0;
+  ctx->use_neon = (features & HWF_ARM_NEON) != 0;
 #endif
 #ifdef USE_SSSE3
-  ctx->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0;
+  ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
+#endif
+#ifdef USE_AVX
+  ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU);
+#endif
+#ifdef USE_AVX2
+  ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2);
 #endif
+
+  (void)features;
 }
 
 static void
@@ -128,6 +161,7 @@ sha384_init (void *context)
 {
   SHA512_CONTEXT *ctx = context;
   SHA512_STATE *hd = &ctx->state;
+  unsigned int features = _gcry_get_hw_features ();
 
   hd->h0 = U64_C(0xcbbb9d5dc1059ed8);
   hd->h1 = U64_C(0x629a292a367cd507);
@@ -145,11 +179,19 @@ sha384_init (void *context)
   ctx->bctx.bwrite = transform;
 
 #ifdef USE_ARM_NEON_ASM
-  ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0;
+  ctx->use_neon = (features & HWF_ARM_NEON) != 0;
 #endif
 #ifdef USE_SSSE3
-  ctx->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0;
+  ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0;
+#endif
+#ifdef USE_AVX
+  ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU);
 #endif
+#ifdef USE_AVX2
+  ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2);
+#endif
+
+  (void)features;
 }
 
 
@@ -507,12 +549,34 @@ unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data,
 					        void *state, size_t num_blks);
 #endif
 
+#ifdef USE_AVX
+unsigned int _gcry_sha512_transform_amd64_avx(const void *input_data,
+					      void *state, size_t num_blks);
+#endif
+
+#ifdef USE_AVX2
+unsigned int _gcry_sha512_transform_amd64_avx2(const void *input_data,
+					       void *state, size_t num_blks);
+#endif
+
 
 static unsigned int
 transform (void *context, const unsigned char *data)
 {
   SHA512_CONTEXT *ctx = context;
 
+#ifdef USE_AVX2
+  if (ctx->use_avx2)
+    return _gcry_sha512_transform_amd64_avx2 (data, &ctx->state, 1)
+           + 4 * sizeof(void*);
+#endif
+
+#ifdef USE_AVX
+  if (ctx->use_avx)
+    return _gcry_sha512_transform_amd64_avx (data, &ctx->state, 1)
+           + 4 * sizeof(void*);
+#endif
+
 #ifdef USE_SSSE3
   if (ctx->use_ssse3)
     return _gcry_sha512_transform_amd64_ssse3 (data, &ctx->state, 1)
author	Jussi Kivilinna <jussi.kivilinna@iki.fi>	2013-12-12 13:56:13 +0200
committer	Jussi Kivilinna <jussi.kivilinna@iki.fi>	2013-12-13 00:13:12 +0200
commit	2e4253dc8eb512cd0e807360926dc6ba912c95b4 (patch)
tree	6ea95c8c04aa2593ba8eb5b83ec3ad6b70121918 /cipher/sha512.c
parent	69a6d0f9562fcd26112a589318c13de66ce1700e (diff)
download	libgcrypt-2e4253dc8eb512cd0e807360926dc6ba912c95b4.tar.gz