hwf-x86: enable VPGATHER usage for AMD CPUs with AVX512

* src/hwf-x86.c (detect_x86_gnuc): Move model based checks and forced
soft hwfeatures enablement at end; Enable VPGATHER for AMD CPUs with
AVX512.
--

AMD Zen4 is able to benefit from VPGATHER based table-lookup for Twofish.

Benchmark on Ryzen 9 7900X:

Before:
 TWOFISH        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        CTR enc |      1.79 ns/B     532.8 MiB/s     10.07 c/B      5625
        CTR dec |      1.79 ns/B     532.6 MiB/s     10.07 c/B      5625

After (~10% faster):
 TWOFISH        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        CTR enc |      1.61 ns/B     593.5 MiB/s      9.05 c/B    5631±2
        CTR dec |      1.61 ns/B     590.8 MiB/s      9.08 c/B      5625

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2022-10-22 16:54:57 +0300
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2022-10-26 21:43:04 +0300
commit: 4b1cb76e3587a8fdf59673a8368d47e4cd2fe151 (patch)
tree: bcbdad3914935fb5cd38319241bfc6e0826baa68 /src
parent: c0f85e0c8657030eb979a465199a07e2819f81e4 (diff)
download: libgcrypt-4b1cb76e3587a8fdf59673a8368d47e4cd2fe151.tar.gz
1 files changed, 83 insertions, 74 deletions
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index b440827e..c6f493eb 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -187,6 +187,7 @@ detect_x86_gnuc (void)
   unsigned int fms, family, model;
   unsigned int result = 0;
   unsigned int avoid_vpgather = 0;
+  unsigned int is_amd_cpu = 0;
 
   (void)os_supports_avx_avx2_registers;
   (void)os_supports_avx512_registers;
@@ -242,6 +243,7 @@ detect_x86_gnuc (void)
   else if (!strcmp (vendor_id.c, "AuthenticAMD"))
     {
       /* This is an AMD CPU.  */
+      is_amd_cpu = 1;
     }
 
   /* Detect Intel features, that might also be supported by other
@@ -253,77 +255,6 @@ detect_x86_gnuc (void)
   family = ((fms & 0xf00) >> 8) + ((fms & 0xff00000) >> 20);
   model = ((fms & 0xf0) >> 4) + ((fms & 0xf0000) >> 12);
 
-  if ((result & HWF_INTEL_CPU) && family == 6)
-    {
-      /* These Intel Core processor models have SHLD/SHRD instruction that
-       * can do integer rotation faster actual ROL/ROR instructions. */
-      switch (model)
-	{
-	case 0x2A:
-	case 0x2D:
-	case 0x3A:
-	case 0x3C:
-	case 0x3F:
-	case 0x45:
-	case 0x46:
-	case 0x3D:
-	case 0x4F:
-	case 0x56:
-	case 0x47:
-	case 0x4E:
-	case 0x5E:
-	case 0x8E:
-	case 0x9E:
-	case 0x55:
-	case 0x66:
-	  result |= HWF_INTEL_FAST_SHLD;
-	  break;
-	}
-
-      /* These Intel Core processors that have AVX2 have slow VPGATHER and
-       * should be avoided for table-lookup use. */
-      switch (model)
-	{
-	case 0x3C:
-	case 0x3F:
-	case 0x45:
-	case 0x46:
-	  /* Haswell */
-	  avoid_vpgather |= 1;
-	  break;
-	}
-    }
-  else
-    {
-      /* Avoid VPGATHER for non-Intel CPUs as testing is needed to
-       * make sure it is fast enough. */
-
-      avoid_vpgather |= 1;
-    }
-
-#ifdef ENABLE_FORCE_SOFT_HWFEATURES
-  /* Soft HW features mark functionality that is available on all systems
-   * but not feasible to use because of slow HW implementation. */
-
-  /* SHLD is faster at rotating register than actual ROR/ROL instructions
-   * on older Intel systems (~sandy-bridge era). However, SHLD is very
-   * slow on almost anything else and later Intel processors have faster
-   * ROR/ROL. Therefore in regular build HWF_INTEL_FAST_SHLD is enabled
-   * only for those Intel processors that benefit from the SHLD
-   * instruction. Enabled here unconditionally as requested. */
-  result |= HWF_INTEL_FAST_SHLD;
-
-  /* VPGATHER instructions are used for look-up table based
-   * implementations which require VPGATHER to be fast enough to beat
-   * regular parallelized look-up table implementations (see Twofish).
-   * So far, only Intel processors beginning with skylake have had
-   * VPGATHER fast enough to be enabled. AMD Zen3 comes close to
-   * being feasible, but not quite (where twofish-avx2 is few percent
-   * slower than twofish-3way). Enable VPGATHER here unconditionally
-   * as requested. */
-  avoid_vpgather = 0;
-#endif
-
 #ifdef ENABLE_PCLMUL_SUPPORT
   /* Test bit 1 for PCLMUL.  */
   if (features & 0x00000002)
@@ -392,9 +323,6 @@ detect_x86_gnuc (void)
       if (features & 0x00000020)
         if (os_supports_avx_avx2_registers)
           result |= HWF_INTEL_AVX2;
-
-      if ((result & HWF_INTEL_AVX2) && !avoid_vpgather)
-        result |= HWF_INTEL_FAST_VPGATHER;
 #endif /*ENABLE_AVX_SUPPORT*/
 
       /* Test bit 29 for SHA Extensions. */
@@ -446,6 +374,87 @@ detect_x86_gnuc (void)
         result |= HWF_INTEL_GFNI;
     }
 
+  if ((result & HWF_INTEL_CPU) && family == 6)
+    {
+      /* These Intel Core processor models have SHLD/SHRD instruction that
+       * can do integer rotation faster than actual ROL/ROR instructions. */
+      switch (model)
+	{
+	case 0x2A:
+	case 0x2D:
+	case 0x3A:
+	case 0x3C:
+	case 0x3F:
+	case 0x45:
+	case 0x46:
+	case 0x3D:
+	case 0x4F:
+	case 0x56:
+	case 0x47:
+	case 0x4E:
+	case 0x5E:
+	case 0x8E:
+	case 0x9E:
+	case 0x55:
+	case 0x66:
+	  result |= HWF_INTEL_FAST_SHLD;
+	  break;
+	}
+
+      /* These Intel Core processors that have AVX2 have slow VPGATHER and
+       * should be avoided for table-lookup use. */
+      switch (model)
+	{
+	case 0x3C:
+	case 0x3F:
+	case 0x45:
+	case 0x46:
+	  /* Haswell */
+	  avoid_vpgather |= 1;
+	  break;
+	}
+    }
+  else if (is_amd_cpu)
+    {
+      /* Non-AVX512 AMD CPUs (pre-Zen4) have slow VPGATHER and should be
+       * avoided for table-lookup use. */
+      avoid_vpgather |= !(result & HWF_INTEL_AVX512);
+    }
+  else
+    {
+      /* Avoid VPGATHER for non-Intel/non-AMD CPUs as testing is needed to
+       * make sure it is fast enough. */
+      avoid_vpgather |= 1;
+    }
+
+#ifdef ENABLE_FORCE_SOFT_HWFEATURES
+  /* Soft HW features mark functionality that is available on all systems
+   * but not feasible to use because of slow HW implementation. */
+
+  /* Some implementations are disabled for non-Intel CPUs. Mark
+   * current CPU as Intel one to enable those implementations. */
+  result |= HWF_INTEL_CPU;
+
+  /* SHLD is faster at rotating register than actual ROR/ROL instructions
+   * on older Intel systems (~sandy-bridge era). However, SHLD is very
+   * slow on almost anything else and later Intel processors have faster
+   * ROR/ROL. Therefore in regular build HWF_INTEL_FAST_SHLD is enabled
+   * only for those Intel processors that benefit from the SHLD
+   * instruction. Enabled here unconditionally as requested. */
+  result |= HWF_INTEL_FAST_SHLD;
+
+  /* VPGATHER instructions are used for look-up table based
+   * implementations which require VPGATHER to be fast enough to beat
+   * regular parallelized look-up table implementations (see Twofish).
+   * So far, only Intel processors beginning with Skylake and AMD
+   * processors starting with Zen4 have had VPGATHER fast enough to be
+   * enabled. Enable VPGATHER here unconditionally as requested. */
+  avoid_vpgather = 0;
+#endif
+
+  if ((result & HWF_INTEL_AVX2) && !avoid_vpgather)
+    result |= HWF_INTEL_FAST_VPGATHER;
+
   return result;
 }
 #endif /* HAS_X86_CPUID */
author	Jussi Kivilinna <jussi.kivilinna@iki.fi>	2022-10-22 16:54:57 +0300
committer	Jussi Kivilinna <jussi.kivilinna@iki.fi>	2022-10-26 21:43:04 +0300
commit	4b1cb76e3587a8fdf59673a8368d47e4cd2fe151 (patch)
tree	bcbdad3914935fb5cd38319241bfc6e0826baa68 /src
parent	c0f85e0c8657030eb979a465199a07e2819f81e4 (diff)
download	libgcrypt-4b1cb76e3587a8fdf59673a8368d47e4cd2fe151.tar.gz