From a7acf79a8adaeff8fa3e9d6d3f90c4566d54eb8c Mon Sep 17 00:00:00 2001 From: Florian Weimer Date: Thu, 10 May 2018 11:50:00 +0200 Subject: Use custom x86 feature selection in libm --- sysdeps/x86_64/fpu/multiarch/Makefile | 2 + sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h | 13 ++-- sysdeps/x86_64/fpu/multiarch/ifunc-fma.h | 7 +- sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h | 10 +-- sysdeps/x86_64/fpu/multiarch/ifunc-sse4_1.h | 5 +- sysdeps/x86_64/fpu/multiarch/s_fma.c | 5 +- sysdeps/x86_64/fpu/multiarch/s_fmaf.c | 5 +- sysdeps/x86_64/fpu/multiarch/x86-math-features.c | 96 ++++++++++++++++++++++++ sysdeps/x86_64/fpu/multiarch/x86-math-features.h | 18 +++++ 9 files changed, 140 insertions(+), 21 deletions(-) create mode 100644 sysdeps/x86_64/fpu/multiarch/x86-math-features.c create mode 100644 sysdeps/x86_64/fpu/multiarch/x86-math-features.h diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile index 9a89bfc286..9987e1bb2b 100644 --- a/sysdeps/x86_64/fpu/multiarch/Makefile +++ b/sysdeps/x86_64/fpu/multiarch/Makefile @@ -15,6 +15,8 @@ libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \ halfulp-fma mpexp-fma \ mpatan2-fma mpatan-fma mpsqrt-fma mptan-fma +libm-sysdep_routines += x86-math-features + CFLAGS-doasin-fma.c = -mfma -mavx2 CFLAGS-dosincos-fma.c = -mfma -mavx2 CFLAGS-e_asin-fma.c = -mfma -mavx2 diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h index a5f9375afc..c5924309c9 100644 --- a/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-avx-fma4.h @@ -18,6 +18,7 @@ . 
*/ #include +#include /* NOTE(review): the header names after #include are missing in this copy of the patch; presumably the added line pulls in x86-math-features.h -- confirm against the upstream commit. */ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; @@ -27,16 +28,14 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (fma4) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { - const struct cpu_features* cpu_features = __get_cpu_features (); + unsigned int features = __x86_math_features (); /* Mask of x86_math_feature_* bits. The added checks below prefer FMA together with AVX2, then FMA4, then AVX, and fall back to SSE2. */ - if (CPU_FEATURES_ARCH_P (cpu_features, FMA_Usable) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) + if ((features & x86_math_feature_fma) + && (features & x86_math_feature_avx2)) return OPTIMIZE (fma); - - if (CPU_FEATURES_ARCH_P (cpu_features, FMA4_Usable)) + if (features & x86_math_feature_fma4) return OPTIMIZE (fma4); - - if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Usable)) + if (features & x86_math_feature_avx) return OPTIMIZE (avx); return OPTIMIZE (sse2); diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-fma.h b/sysdeps/x86_64/fpu/multiarch/ifunc-fma.h index 63a8cd221f..1b0a95db0d 100644 --- a/sysdeps/x86_64/fpu/multiarch/ifunc-fma.h +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-fma.h @@ -17,6 +17,7 @@ .
*/ #include +#include extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (fma) attribute_hidden; @@ -24,10 +25,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (fma) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { - const struct cpu_features* cpu_features = __get_cpu_features (); + unsigned int features = __x86_math_features (); /* Mask of x86_math_feature_* bits. The fma variant is chosen only when both the FMA and AVX2 bits are set; otherwise sse2 is returned. */ - if (CPU_FEATURES_ARCH_P (cpu_features, FMA_Usable) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) + if ((features & x86_math_feature_fma) + && (features & x86_math_feature_avx2)) return OPTIMIZE (fma); return OPTIMIZE (sse2); diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h index a2526a2ee0..6fb21ee024 100644 --- a/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-fma4.h @@ -18,6 +18,7 @@ . */ #include +#include extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (fma) attribute_hidden; @@ -26,13 +27,12 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (fma4) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { - const struct cpu_features* cpu_features = __get_cpu_features (); + unsigned int features = __x86_math_features (); /* Mask of x86_math_feature_* bits. FMA together with AVX2 is preferred, then FMA4, with sse2 as the fallback. */ - if (CPU_FEATURES_ARCH_P (cpu_features, FMA_Usable) - && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) + if ((features & x86_math_feature_fma) + && (features & x86_math_feature_avx2)) return OPTIMIZE (fma); - - if (CPU_FEATURES_ARCH_P (cpu_features, FMA4_Usable)) + if (features & x86_math_feature_fma4) return OPTIMIZE (fma4); return OPTIMIZE (sse2); diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-sse4_1.h b/sysdeps/x86_64/fpu/multiarch/ifunc-sse4_1.h index a8710ba802..ff136f4dc5 100644 --- a/sysdeps/x86_64/fpu/multiarch/ifunc-sse4_1.h +++ b/sysdeps/x86_64/fpu/multiarch/ifunc-sse4_1.h @@ -17,6 +17,7 @@ .
*/ #include +#include extern __typeof (REDIRECT_NAME) OPTIMIZE (c) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse41) attribute_hidden; @@ -24,9 +25,9 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse41) attribute_hidden; static inline void * IFUNC_SELECTOR (void) { - const struct cpu_features* cpu_features = __get_cpu_features (); + unsigned int features = __x86_math_features (); /* Mask of x86_math_feature_* bits. The sse41 variant is chosen when the SSE4.1 bit is set; otherwise the c variant is returned. */ - if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) + if (features & x86_math_feature_sse41) return OPTIMIZE (sse41); return OPTIMIZE (c); diff --git a/sysdeps/x86_64/fpu/multiarch/s_fma.c b/sysdeps/x86_64/fpu/multiarch/s_fma.c index 875c76d372..66da7ff132 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_fma.c +++ b/sysdeps/x86_64/fpu/multiarch/s_fma.c @@ -20,6 +20,7 @@ #include #include #include +#include #include extern double __fma_sse2 (double x, double y, double z) attribute_hidden; @@ -41,8 +42,8 @@ __fma_fma4 (double x, double y, double z) } -libm_ifunc (__fma, HAS_ARCH_FEATURE (FMA_Usable) - ? __fma_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable) +libm_ifunc (__fma, __x86_math_features () & x86_math_feature_fma /* Selection order is __fma_fma3, then __fma_fma4, then __fma_sse2. Only the FMA bit is tested for the first choice; no AVX2 bit is required here. */ + ? __fma_fma3 : (__x86_math_features () & x86_math_feature_fma4 ? __fma_fma4 : __fma_sse2)); libm_alias_double (__fma, fma) diff --git a/sysdeps/x86_64/fpu/multiarch/s_fmaf.c b/sysdeps/x86_64/fpu/multiarch/s_fmaf.c index 5f4c2ec0be..d65aa9a16f 100644 --- a/sysdeps/x86_64/fpu/multiarch/s_fmaf.c +++ b/sysdeps/x86_64/fpu/multiarch/s_fmaf.c @@ -19,6 +19,7 @@ #include #include #include +#include #include extern float __fmaf_sse2 (float x, float y, float z) attribute_hidden; @@ -40,8 +41,8 @@ __fmaf_fma4 (float x, float y, float z) } -libm_ifunc (__fmaf, HAS_ARCH_FEATURE (FMA_Usable) - ? __fmaf_fma3 : (HAS_ARCH_FEATURE (FMA4_Usable) +libm_ifunc (__fmaf, __x86_math_features () & x86_math_feature_fma /* Selection order is __fmaf_fma3, then __fmaf_fma4, then __fmaf_sse2. Only the FMA bit is tested for the first choice; no AVX2 bit is required here. */ + ? __fmaf_fma3 : (__x86_math_features () & x86_math_feature_fma4 ?
__fmaf_fma4 : __fmaf_sse2)); libm_alias_float (__fma, fma) diff --git a/sysdeps/x86_64/fpu/multiarch/x86-math-features.c b/sysdeps/x86_64/fpu/multiarch/x86-math-features.c new file mode 100644 index 0000000000..e803b73229 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/x86-math-features.c @@ -0,0 +1,96 @@ +/* Initialize CPU features for use by the math library. + This file is part of the GNU C Library. + Copyright (C) 2008-2018 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include +#include +#include +#include /* NOTE(review): the five header names above are missing in this copy of the patch. */ + +static unsigned int features; /* Cached feature mask. It stays zero until a call to __x86_math_features has stored a computed value. */ + +unsigned int +__x86_math_features (void) +{ /* Return the cached mask when it is non-zero. Otherwise probe the CPU with CPUID and XGETBV, store the resulting mask in FEATURES, and return it. */ + unsigned int features_local = atomic_load_relaxed (&features); + if (features_local != 0) + /* At least the initialization bit is set, which means that we + have a proper value. */ + return features_local; + + /* Perform initialization.
*/ + features_local = x86_math_feature_initialized; + + unsigned int eax, ebx, ecx, edx; + unsigned int max_cpuid; + __cpuid (0, max_cpuid, ebx, ecx, edx); + bool cpu_amd = ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65; /* The three constants are the ASCII bytes of AuthenticAMD, split across EBX, EDX and ECX. */ + + if (max_cpuid >= 7) /* NOTE(review): the leaf 1 bits (SSE4.1, AVX, FMA, OSXSAVE) are read only inside this guard, so a CPU or hypervisor reporting a maximum leaf below 7 ends up with no feature bits at all. Only the leaf 7 query for AVX2 appears to need the guard -- confirm whether this is intended. */ + { + __cpuid (1, eax, ebx, ecx, edx); + bool flag_fma = ecx & bit_cpu_FMA; + bool flag_osxsave = ecx & bit_cpu_OSXSAVE; + bool flag_avx = ecx & bit_cpu_AVX; + bool flag_sse41 = ecx & bit_cpu_SSE4_1; + + if (flag_sse41) + features_local |= x86_math_feature_sse41; /* SSE4.1 is recorded without consulting OSXSAVE or XGETBV. */ + + __cpuid_count (7, 0, eax, ebx, ecx, edx); + bool flag_avx2 = ebx & bit_cpu_AVX2; + + if (flag_osxsave) + { + unsigned int xcrlow; + unsigned int xcrhigh; + asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0)); /* XGETBV is executed with ECX set to 0. Only the low half returned in EAX is tested below; xcrhigh is not used. */ + bool ymm_xmm_usable + = (xcrlow & (bit_YMM_state | bit_XMM_state)) + == (bit_YMM_state | bit_XMM_state); + + /* Is YMM and XMM state usable? */ + if (ymm_xmm_usable) + { + if (flag_avx) /* AVX2, FMA and FMA4 are recorded only when the AVX bit itself is set. */ + { + features_local |= x86_math_feature_avx; + if (flag_avx2) + features_local |= x86_math_feature_avx2; + if (flag_fma) + features_local |= x86_math_feature_fma; + + if (cpu_amd) /* FMA4 is probed only for the AMD vendor string, and only if leaf 0x80000000 reports that extended leaf 0x80000001 exists. */ + { + __cpuid (0x80000000, eax, ebx, ecx, edx); + if (eax >= 0x80000001) + { + __cpuid (0x80000001, eax, ebx, ecx, edx); + bool flag_fma4 = ecx & bit_cpu_FMA4; + if (flag_fma4) + features_local |= x86_math_feature_fma4; + } + } + } + } + } + } + + atomic_store_relaxed (&features, features_local); /* Relaxed store with no lock: concurrent first callers may each run the probe above and store their own result. */ + return features_local; +} diff --git a/sysdeps/x86_64/fpu/multiarch/x86-math-features.h b/sysdeps/x86_64/fpu/multiarch/x86-math-features.h new file mode 100644 index 0000000000..c43b66de7c --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/x86-math-features.h @@ -0,0 +1,18 @@ +#ifndef X86_MATH_FEATURES_H +#define X86_MATH_FEATURES_H + +enum + { + x86_math_feature_initialized = 1 << 0, /* Set in every computed mask, so a zero mask means the probe has not run yet. */ + x86_math_feature_avx = 1 << 1, + x86_math_feature_avx2 = 1 << 2, + x86_math_feature_fma = 1 << 3, + x86_math_feature_fma4 = 1 << 4, + x86_math_feature_sse41 = 1 << 5, + }; + +/* Return a combination of
flags x86_math_feature_* above. */ +unsigned int __x86_math_features (void) + __attribute__ ((const)) attribute_hidden; /* NOTE(review): declared const although the definition reads and writes a static cache and executes CPUID; presumably this is so that repeated calls in one resolver can be merged -- confirm that this use of the attribute is intended. */ + +#endif /* X86_MATH_FEATURES_H */ -- cgit v1.2.1