diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2022-10-22 16:54:57 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2022-10-26 21:43:04 +0300 |
commit | 4b1cb76e3587a8fdf59673a8368d47e4cd2fe151 (patch) | |
tree | bcbdad3914935fb5cd38319241bfc6e0826baa68 | |
parent | c0f85e0c8657030eb979a465199a07e2819f81e4 (diff) | |
download | libgcrypt-4b1cb76e3587a8fdf59673a8368d47e4cd2fe151.tar.gz |
hwf-x86: enable VPGATHER usage for AMD CPUs with AVX512
* src/hwf-x86.c (detect_x86_gnuc): Move model based checks and
forced soft hwfeatures enablement at end; Enable VPGATHER for
AMD CPUs with AVX512.
--
AMD Zen4 is able to benefit from VPGATHER based table-lookup for
Twofish.
Benchmark on Ryzen 9 7900X:
Before:
TWOFISH | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CTR enc | 1.79 ns/B 532.8 MiB/s 10.07 c/B 5625
CTR dec | 1.79 ns/B 532.6 MiB/s 10.07 c/B 5625
After (~10% faster):
TWOFISH | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CTR enc | 1.61 ns/B 593.5 MiB/s 9.05 c/B 5631±2
CTR dec | 1.61 ns/B 590.8 MiB/s 9.08 c/B 5625
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- | src/hwf-x86.c | 157 |
1 files changed, 83 insertions, 74 deletions
diff --git a/src/hwf-x86.c b/src/hwf-x86.c index b440827e..c6f493eb 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -187,6 +187,7 @@ detect_x86_gnuc (void) unsigned int fms, family, model; unsigned int result = 0; unsigned int avoid_vpgather = 0; + unsigned int is_amd_cpu = 0; (void)os_supports_avx_avx2_registers; (void)os_supports_avx512_registers; @@ -242,6 +243,7 @@ detect_x86_gnuc (void) else if (!strcmp (vendor_id.c, "AuthenticAMD")) { /* This is an AMD CPU. */ + is_amd_cpu = 1; } /* Detect Intel features, that might also be supported by other @@ -253,77 +255,6 @@ detect_x86_gnuc (void) family = ((fms & 0xf00) >> 8) + ((fms & 0xff00000) >> 20); model = ((fms & 0xf0) >> 4) + ((fms & 0xf0000) >> 12); - if ((result & HWF_INTEL_CPU) && family == 6) - { - /* These Intel Core processor models have SHLD/SHRD instruction that - * can do integer rotation faster actual ROL/ROR instructions. */ - switch (model) - { - case 0x2A: - case 0x2D: - case 0x3A: - case 0x3C: - case 0x3F: - case 0x45: - case 0x46: - case 0x3D: - case 0x4F: - case 0x56: - case 0x47: - case 0x4E: - case 0x5E: - case 0x8E: - case 0x9E: - case 0x55: - case 0x66: - result |= HWF_INTEL_FAST_SHLD; - break; - } - - /* These Intel Core processors that have AVX2 have slow VPGATHER and - * should be avoided for table-lookup use. */ - switch (model) - { - case 0x3C: - case 0x3F: - case 0x45: - case 0x46: - /* Haswell */ - avoid_vpgather |= 1; - break; - } - } - else - { - /* Avoid VPGATHER for non-Intel CPUs as testing is needed to - * make sure it is fast enough. */ - - avoid_vpgather |= 1; - } - -#ifdef ENABLE_FORCE_SOFT_HWFEATURES - /* Soft HW features mark functionality that is available on all systems - * but not feasible to use because of slow HW implementation. */ - - /* SHLD is faster at rotating register than actual ROR/ROL instructions - * on older Intel systems (~sandy-bridge era). However, SHLD is very - * slow on almost anything else and later Intel processors have faster - * ROR/ROL. Therefore in regular build HWF_INTEL_FAST_SHLD is enabled - * only for those Intel processors that benefit from the SHLD - * instruction. Enabled here unconditionally as requested. */ - result |= HWF_INTEL_FAST_SHLD; - - /* VPGATHER instructions are used for look-up table based - * implementations which require VPGATHER to be fast enough to beat - * regular parallelized look-up table implementations (see Twofish). - * So far, only Intel processors beginning with skylake have had - * VPGATHER fast enough to be enabled. AMD Zen3 comes close to - * being feasible, but not quite (where twofish-avx2 is few percent - * slower than twofish-3way). Enable VPGATHER here unconditionally - * as requested. */ - avoid_vpgather = 0; -#endif - #ifdef ENABLE_PCLMUL_SUPPORT /* Test bit 1 for PCLMUL. */ if (features & 0x00000002) @@ -392,9 +323,6 @@ detect_x86_gnuc (void) if (features & 0x00000020) if (os_supports_avx_avx2_registers) result |= HWF_INTEL_AVX2; - - if ((result & HWF_INTEL_AVX2) && !avoid_vpgather) - result |= HWF_INTEL_FAST_VPGATHER; #endif /*ENABLE_AVX_SUPPORT*/ /* Test bit 29 for SHA Extensions. */ @@ -446,6 +374,87 @@ detect_x86_gnuc (void) result |= HWF_INTEL_GFNI; } + if ((result & HWF_INTEL_CPU) && family == 6) + { + /* These Intel Core processor models have SHLD/SHRD instruction that + * can do integer rotation faster actual ROL/ROR instructions. */ + switch (model) + { + case 0x2A: + case 0x2D: + case 0x3A: + case 0x3C: + case 0x3F: + case 0x45: + case 0x46: + case 0x3D: + case 0x4F: + case 0x56: + case 0x47: + case 0x4E: + case 0x5E: + case 0x8E: + case 0x9E: + case 0x55: + case 0x66: + result |= HWF_INTEL_FAST_SHLD; + break; + } + + /* These Intel Core processors that have AVX2 have slow VPGATHER and + * should be avoided for table-lookup use. */ + switch (model) + { + case 0x3C: + case 0x3F: + case 0x45: + case 0x46: + /* Haswell */ + avoid_vpgather |= 1; + break; + } + } + else if (is_amd_cpu) + { + /* Non-AVX512 AMD CPUs (pre-Zen4) have slow VPGATHER and should be + * avoided for table-lookup use. */ + avoid_vpgather |= !(result & HWF_INTEL_AVX512); + } + else + { + /* Avoid VPGATHER for non-Intel/non-AMD CPUs as testing is needed to + * make sure it is fast enough. */ + avoid_vpgather |= 1; + } + +#ifdef ENABLE_FORCE_SOFT_HWFEATURES + /* Soft HW features mark functionality that is available on all systems + * but not feasible to use because of slow HW implementation. */ + + /* Some implementations are disabled for non-Intel CPUs. Mark + * current CPU as Intel one to enable those implementations. */ + result |= HWF_INTEL_CPU; + + /* SHLD is faster at rotating register than actual ROR/ROL instructions + * on older Intel systems (~sandy-bridge era). However, SHLD is very + * slow on almost anything else and later Intel processors have faster + * ROR/ROL. Therefore in regular build HWF_INTEL_FAST_SHLD is enabled + * only for those Intel processors that benefit from the SHLD + * instruction. Enabled here unconditionally as requested. */ + result |= HWF_INTEL_FAST_SHLD; + + /* VPGATHER instructions are used for look-up table based + * implementations which require VPGATHER to be fast enough to beat + * regular parallelized look-up table implementations (see Twofish). + * So far, only Intel processors beginning with Skylake and AMD + * processors starting with Zen4 have had VPGATHER fast enough to be + * enabled. Enable VPGATHER here unconditionally as requested. */ + avoid_vpgather = 0; +#endif + + if ((result & HWF_INTEL_AVX2) && !avoid_vpgather) + result |= HWF_INTEL_FAST_VPGATHER; + return result; } #endif /* HAS_X86_CPUID */ |