summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Jussi Kivilinna <jussi.kivilinna@iki.fi> 2022-10-22 16:54:57 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi> 2022-10-26 21:43:04 +0300
commit 4b1cb76e3587a8fdf59673a8368d47e4cd2fe151 (patch)
tree bcbdad3914935fb5cd38319241bfc6e0826baa68 /src
parent c0f85e0c8657030eb979a465199a07e2819f81e4 (diff)
download libgcrypt-4b1cb76e3587a8fdf59673a8368d47e4cd2fe151.tar.gz
hwf-x86: enable VPGATHER usage for AMD CPUs with AVX512
* src/hwf-x86.c (detect_x86_gnuc): Move model based checks and forced
soft hwfeatures enablement at end; Enable VPGATHER for AMD CPUs with
AVX512.
--

AMD Zen4 is able to benefit from VPGATHER based table-lookup for Twofish.

Benchmark on Ryzen 9 7900X:

Before:
 TWOFISH        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        CTR enc |      1.79 ns/B     532.8 MiB/s     10.07 c/B      5625
        CTR dec |      1.79 ns/B     532.6 MiB/s     10.07 c/B      5625

After (~10% faster):
 TWOFISH        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        CTR enc |      1.61 ns/B     593.5 MiB/s      9.05 c/B    5631±2
        CTR dec |      1.61 ns/B     590.8 MiB/s      9.08 c/B      5625

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'src')
-rw-r--r-- src/hwf-x86.c | 157
1 files changed, 83 insertions, 74 deletions
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index b440827e..c6f493eb 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -187,6 +187,7 @@ detect_x86_gnuc (void)
unsigned int fms, family, model;
unsigned int result = 0;
unsigned int avoid_vpgather = 0;
+ unsigned int is_amd_cpu = 0;
(void)os_supports_avx_avx2_registers;
(void)os_supports_avx512_registers;
@@ -242,6 +243,7 @@ detect_x86_gnuc (void)
else if (!strcmp (vendor_id.c, "AuthenticAMD"))
{
/* This is an AMD CPU. */
+ is_amd_cpu = 1;
}
/* Detect Intel features, that might also be supported by other
@@ -253,77 +255,6 @@ detect_x86_gnuc (void)
family = ((fms & 0xf00) >> 8) + ((fms & 0xff00000) >> 20);
model = ((fms & 0xf0) >> 4) + ((fms & 0xf0000) >> 12);
- if ((result & HWF_INTEL_CPU) && family == 6)
- {
- /* These Intel Core processor models have SHLD/SHRD instruction that
- * can do integer rotation faster actual ROL/ROR instructions. */
- switch (model)
- {
- case 0x2A:
- case 0x2D:
- case 0x3A:
- case 0x3C:
- case 0x3F:
- case 0x45:
- case 0x46:
- case 0x3D:
- case 0x4F:
- case 0x56:
- case 0x47:
- case 0x4E:
- case 0x5E:
- case 0x8E:
- case 0x9E:
- case 0x55:
- case 0x66:
- result |= HWF_INTEL_FAST_SHLD;
- break;
- }
-
- /* These Intel Core processors that have AVX2 have slow VPGATHER and
- * should be avoided for table-lookup use. */
- switch (model)
- {
- case 0x3C:
- case 0x3F:
- case 0x45:
- case 0x46:
- /* Haswell */
- avoid_vpgather |= 1;
- break;
- }
- }
- else
- {
- /* Avoid VPGATHER for non-Intel CPUs as testing is needed to
- * make sure it is fast enough. */
-
- avoid_vpgather |= 1;
- }
-
-#ifdef ENABLE_FORCE_SOFT_HWFEATURES
- /* Soft HW features mark functionality that is available on all systems
- * but not feasible to use because of slow HW implementation. */
-
- /* SHLD is faster at rotating register than actual ROR/ROL instructions
- * on older Intel systems (~sandy-bridge era). However, SHLD is very
- * slow on almost anything else and later Intel processors have faster
- * ROR/ROL. Therefore in regular build HWF_INTEL_FAST_SHLD is enabled
- * only for those Intel processors that benefit from the SHLD
- * instruction. Enabled here unconditionally as requested. */
- result |= HWF_INTEL_FAST_SHLD;
-
- /* VPGATHER instructions are used for look-up table based
- * implementations which require VPGATHER to be fast enough to beat
- * regular parallelized look-up table implementations (see Twofish).
- * So far, only Intel processors beginning with skylake have had
- * VPGATHER fast enough to be enabled. AMD Zen3 comes close to
- * being feasible, but not quite (where twofish-avx2 is few percent
- * slower than twofish-3way). Enable VPGATHER here unconditionally
- * as requested. */
- avoid_vpgather = 0;
-#endif
-
#ifdef ENABLE_PCLMUL_SUPPORT
/* Test bit 1 for PCLMUL. */
if (features & 0x00000002)
@@ -392,9 +323,6 @@ detect_x86_gnuc (void)
if (features & 0x00000020)
if (os_supports_avx_avx2_registers)
result |= HWF_INTEL_AVX2;
-
- if ((result & HWF_INTEL_AVX2) && !avoid_vpgather)
- result |= HWF_INTEL_FAST_VPGATHER;
#endif /*ENABLE_AVX_SUPPORT*/
/* Test bit 29 for SHA Extensions. */
@@ -446,6 +374,87 @@ detect_x86_gnuc (void)
result |= HWF_INTEL_GFNI;
}
+ if ((result & HWF_INTEL_CPU) && family == 6)
+ {
+ /* These Intel Core processor models have SHLD/SHRD instruction that
+ * can do integer rotation faster actual ROL/ROR instructions. */
+ switch (model)
+ {
+ case 0x2A:
+ case 0x2D:
+ case 0x3A:
+ case 0x3C:
+ case 0x3F:
+ case 0x45:
+ case 0x46:
+ case 0x3D:
+ case 0x4F:
+ case 0x56:
+ case 0x47:
+ case 0x4E:
+ case 0x5E:
+ case 0x8E:
+ case 0x9E:
+ case 0x55:
+ case 0x66:
+ result |= HWF_INTEL_FAST_SHLD;
+ break;
+ }
+
+ /* These Intel Core processors that have AVX2 have slow VPGATHER and
+ * should be avoided for table-lookup use. */
+ switch (model)
+ {
+ case 0x3C:
+ case 0x3F:
+ case 0x45:
+ case 0x46:
+ /* Haswell */
+ avoid_vpgather |= 1;
+ break;
+ }
+ }
+ else if (is_amd_cpu)
+ {
+ /* Non-AVX512 AMD CPUs (pre-Zen4) have slow VPGATHER and should be
+ * avoided for table-lookup use. */
+ avoid_vpgather |= !(result & HWF_INTEL_AVX512);
+ }
+ else
+ {
+ /* Avoid VPGATHER for non-Intel/non-AMD CPUs as testing is needed to
+ * make sure it is fast enough. */
+ avoid_vpgather |= 1;
+ }
+
+#ifdef ENABLE_FORCE_SOFT_HWFEATURES
+ /* Soft HW features mark functionality that is available on all systems
+ * but not feasible to use because of slow HW implementation. */
+
+ /* Some implementations are disabled for non-Intel CPUs. Mark
+ * current CPU as Intel one to enable those implementations. */
+ result |= HWF_INTEL_CPU;
+
+ /* SHLD is faster at rotating register than actual ROR/ROL instructions
+ * on older Intel systems (~sandy-bridge era). However, SHLD is very
+ * slow on almost anything else and later Intel processors have faster
+ * ROR/ROL. Therefore in regular build HWF_INTEL_FAST_SHLD is enabled
+ * only for those Intel processors that benefit from the SHLD
+ * instruction. Enabled here unconditionally as requested. */
+ result |= HWF_INTEL_FAST_SHLD;
+
+ /* VPGATHER instructions are used for look-up table based
+ * implementations which require VPGATHER to be fast enough to beat
+ * regular parallelized look-up table implementations (see Twofish).
+ * So far, only Intel processors beginning with Skylake and AMD
+ * processors starting with Zen4 have had VPGATHER fast enough to be
+ * enabled. Enable VPGATHER here unconditionally as requested. */
+ avoid_vpgather = 0;
+#endif
+
+ if ((result & HWF_INTEL_AVX2) && !avoid_vpgather)
+ result |= HWF_INTEL_FAST_VPGATHER;
+
return result;
}
#endif /* HAS_X86_CPUID */