diff options
author | Sayed Adel <seiko@imavr.com> | 2020-01-21 11:56:33 +0200 |
---|---|---|
committer | Sayed Adel <seiko@imavr.com> | 2020-02-05 05:09:21 +0200 |
commit | ad174001a869f42bb89ccff77ac3eec04a9d71e8 (patch) | |
tree | 3dbcad1b3e121d499ec2f072b7e80b32bb9a62ef /numpy/core/src | |
parent | f71d9937d1e8a1e709f325f689f1e971e64c26a7 (diff) | |
download | numpy-ad174001a869f42bb89ccff77ac3eec04a9d71e8.tar.gz |
ENH: improve runtime detection of CPU features
- Put the old CPU detection code to rest
The current CPU detection code only supports x86 and
it's count on compiler built-in functions that not widely supported
by other compilers or platforms.
NOTE: `npy_cpu_supports` is removed rather than deprecated,
use the macro `NPY_CPU_HAVE(FEATURE_NAME_WITHOUT_QUOTES)` instead.
- Initialize the new CPU features runtime detector
Almost similar to GCC built-in functions,
so instead of `__builtin_cpu_init`, `__builtin_cpu_supports`
its provide `npy_cpu_init`, `npy_cpu_have` and `NPY_CPU_HAVE`.
NOTE: `npy_cpu_init` must be called before any use of
`npy_cpu_have` and `NPY_CPU_HAVE`, however `npy_cpu_init`
already called during the load of module `umath`
so there's no reason to call it again in most of the cases.
- Add X86 support
detect almost all x86 features, also provide
CPU feature groups that gather several features.
e.g. `AVX512_KNM` detect Knights Mill's `AVX512` features
- Add IBM/Power support
only supports Linux and count here on `glibc(getauxval)`
to detect VSX support and fail-back to the compiler definitions
for other platforms.
- Add ARM support
Same as IBM/Power but its parse `/proc/self/auxv`
if `glibc(getauxval)` isn't available.
- Update umath generator
- Add testing unit for Linux only
- Add new attribute `__cpu_features__` to umath module
`__cpu_features__` is a dictionary contains all supported
CPU feature names with runtime availability
Diffstat (limited to 'numpy/core/src')
-rw-r--r-- | numpy/core/src/common/npy_config.h | 1 | ||||
-rw-r--r-- | numpy/core/src/common/npy_cpu_features.c.src | 404 | ||||
-rw-r--r-- | numpy/core/src/common/npy_cpu_features.h | 117 | ||||
-rw-r--r-- | numpy/core/src/multiarray/multiarraymodule.c | 15 | ||||
-rw-r--r-- | numpy/core/src/umath/cpuid.c | 97 | ||||
-rw-r--r-- | numpy/core/src/umath/cpuid.h | 9 |
6 files changed, 537 insertions, 106 deletions
diff --git a/numpy/core/src/common/npy_config.h b/numpy/core/src/common/npy_config.h index eedfbe364..aebe241a5 100644 --- a/numpy/core/src/common/npy_config.h +++ b/numpy/core/src/common/npy_config.h @@ -2,6 +2,7 @@ #define _NPY_NPY_CONFIG_H_ #include "config.h" +#include "npy_cpu_features.h" #include "numpy/numpyconfig.h" #include "numpy/npy_cpu.h" #include "numpy/npy_os.h" diff --git a/numpy/core/src/common/npy_cpu_features.c.src b/numpy/core/src/common/npy_cpu_features.c.src new file mode 100644 index 000000000..cbd99827b --- /dev/null +++ b/numpy/core/src/common/npy_cpu_features.c.src @@ -0,0 +1,404 @@ +#include "npy_cpu_features.h" +#include "numpy/npy_common.h" // for NPY_INLINE +#include "numpy/npy_cpu.h" // To guarantee of having CPU definitions in scope. + +/******************** Private Definitions *********************/ + +// Hold all CPU features boolean values +static unsigned char npy__cpu_have[NPY_CPU_FEATURE_MAX]; + +/******************** Private Declarations *********************/ + +// Almost detect all CPU features in runtime +static void +npy__cpu_init_features(void); + +/******************** Public Definitions *********************/ + +NPY_VISIBILITY_HIDDEN int +npy_cpu_have(int feature_id) +{ + if (feature_id <= NPY_CPU_FEATURE_NONE || feature_id >= NPY_CPU_FEATURE_MAX) + return 0; + return npy__cpu_have[feature_id]; +} + +NPY_VISIBILITY_HIDDEN int +npy_cpu_init(void) +{ + npy__cpu_init_features(); + return 0; +} + +NPY_VISIBILITY_HIDDEN PyObject * +npy_cpu_features_dict(void) +{ + PyObject *dict = PyDict_New(); + if (dict) { + /**begin repeat + * #feature = MMX, SSE, SSE2, SSE3, SSSE3, SSE41, POPCNT, SSE42, + * AVX, F16C, XOP, FMA4, FMA3, AVX2, AVX512F, + * AVX512CD, AVX512ER, AVX512PF, AVX5124FMAPS, AVX5124VNNIW, + * AVX512VPOPCNTDQ, AVX512VL, AVX512BW, AVX512DQ, AVX512VNNI, + * AVX512IFMA, AVX512VBMI, AVX512VBMI2, AVX512BITALG, + * AVX512_KNL, AVX512_KNM, AVX512_SKX, AVX512_CLX, AVX512_CNL, AVX512_ICL, + * VSX, VSX2, VSX3, + * NEON, NEON_FP16, NEON_VFPV4, ASIMD, FPHP, ASIMDHP, ASIMDDP, ASIMDFHM# + */ + if (PyDict_SetItemString(dict, "@feature@", + npy__cpu_have[NPY_CPU_FEATURE_@feature@] ? Py_True : Py_False) < 0) { + Py_DECREF(dict); + return NULL; + } + /**end repeat**/ + } + return dict; +} + +/**************************************************************** + * This section is reserved to defining @npy__cpu_init_features + * for each CPU architecture, please try to keep it clean. Ty + ****************************************************************/ + +/***************** X86 ******************/ + +#if defined(NPY_CPU_AMD64) || defined(NPY_CPU_X86) + +#ifdef _MSC_VER + #include <intrin.h> +#elif defined(__INTEL_COMPILER) + #include <immintrin.h> +#endif + +static int +npy__cpu_getxcr0(void) +{ +#if defined(_MSC_VER) || defined (__INTEL_COMPILER) + return _xgetbv(0); +#elif defined(__GNUC__) || defined(__clang__) + unsigned int eax, edx; + __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); + return (eax | (unsigned long long)edx << 32); +#else + // TODO: handle other x86 compilers + return 0; +#endif +} + +static void +npy__cpu_cpuid(int reg[4], int func_id) +{ +#if defined(_MSC_VER) + __cpuidex(reg, func_id, 0); +#elif defined(__INTEL_COMPILER) + __cpuid(reg, func_id); +#elif defined(__GNUC__) || defined(__clang__) + #if defined(NPY_CPU_X86) && defined(__PIC__) + // %ebx may be the PIC register + #define NPY__CPUID_ASM \ + "xchg{l}\t{%%}ebx, %1\n\t" \ + "cpuid\n\t" \ + "xchg{l}\t{%%}ebx, %1\n\t" + #else + #define NPY__CPUID_ASM "cpuid" + #endif + __asm__(NPY__CPUID_ASM : "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) : "a" (func_id), "c" (0) : ); +#else + // TODO: handle other x86 compilers + reg[0] = 0; +#endif +} + +static void +npy__cpu_init_features(void) +{ + memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX); + + // validate platform support + int reg[] = {0, 0, 0, 0}; + npy__cpu_cpuid(reg, 0); + if (reg[0] == 0) + return; + + npy__cpu_cpuid(reg, 1); + npy__cpu_have[NPY_CPU_FEATURE_MMX] = (reg[3] & (1 << 23)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_SSE] = (reg[3] & (1 << 25)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_SSE2] = (reg[3] & (1 << 26)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_SSE3] = (reg[2] & (1 << 0)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_SSSE3] = (reg[2] & (1 << 9)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_SSE41] = (reg[2] & (1 << 19)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_POPCNT] = (reg[2] & (1 << 23)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_SSE42] = (reg[2] & (1 << 20)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_F16C] = (reg[2] & (1 << 29)) != 0; + + // check OSXSAVE + if ((reg[2] & (1 << 27)) == 0) + return; + // check AVX OS support + int xcr = npy__cpu_getxcr0(); + if ((xcr & 6) != 6) + return; + npy__cpu_have[NPY_CPU_FEATURE_AVX] = (reg[2] & (1 << 28)) != 0; + if (!npy__cpu_have[NPY_CPU_FEATURE_AVX]) + return; + npy__cpu_have[NPY_CPU_FEATURE_FMA3] = (reg[2] & (1 << 12)) != 0; + + // second call to the cpuid to get extended AMD feature bits + npy__cpu_cpuid(reg, 0x80000001); + npy__cpu_have[NPY_CPU_FEATURE_XOP] = (reg[2] & (1 << 11)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_FMA4] = (reg[2] & (1 << 16)) != 0; + + // third call to the cpuid to get extended AVX2 & AVX512 feature bits + npy__cpu_cpuid(reg, 7); + npy__cpu_have[NPY_CPU_FEATURE_AVX2] = (reg[1] & (1 << 5)) != 0; + if (!npy__cpu_have[NPY_CPU_FEATURE_AVX2]) + return; + // detect AVX2 & FMA3 + npy__cpu_have[NPY_CPU_FEATURE_FMA] = npy__cpu_have[NPY_CPU_FEATURE_FMA3]; + + // check AVX512 OS support + if ((xcr & 0xe6) != 0xe6) + return; + npy__cpu_have[NPY_CPU_FEATURE_AVX512F] = (reg[1] & (1 << 16)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512CD] = (reg[1] & (1 << 28)) != 0; + if (npy__cpu_have[NPY_CPU_FEATURE_AVX512F] && npy__cpu_have[NPY_CPU_FEATURE_AVX512CD]) { + // Knights Landing + npy__cpu_have[NPY_CPU_FEATURE_AVX512PF] = (reg[1] & (1 << 26)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512ER] = (reg[1] & (1 << 27)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_KNL] = npy__cpu_have[NPY_CPU_FEATURE_AVX512ER] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512PF]; + // Knights Mill + npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ] = (reg[2] & (1 << 14)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX5124VNNIW] = (reg[3] & (1 << 2)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX5124FMAPS] = (reg[3] & (1 << 3)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_KNM] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_KNL] && + npy__cpu_have[NPY_CPU_FEATURE_AVX5124FMAPS] && + npy__cpu_have[NPY_CPU_FEATURE_AVX5124VNNIW] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ]; + + // Skylake-X + npy__cpu_have[NPY_CPU_FEATURE_AVX512DQ] = (reg[1] & (1 << 17)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512BW] = (reg[1] & (1 << 30)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512VL] = (reg[1] & (1 << 31)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_SKX] = npy__cpu_have[NPY_CPU_FEATURE_AVX512BW] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512DQ] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512VL]; + // Cascade Lake + npy__cpu_have[NPY_CPU_FEATURE_AVX512VNNI] = (reg[2] & (1 << 11)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_CLX] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_SKX] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512VNNI]; + + // Cannon Lake + npy__cpu_have[NPY_CPU_FEATURE_AVX512IFMA] = (reg[1] & (1 << 21)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI] = (reg[2] & (1 << 1)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_CNL] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_SKX] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512IFMA] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI]; + // Ice Lake + npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2] = (reg[2] & (1 << 6)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] = (reg[2] & (1 << 12)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_AVX512_ICL] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_CLX] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512_CNL] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512VBMI2] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] && + npy__cpu_have[NPY_CPU_FEATURE_AVX512VPOPCNTDQ]; + } +} + +/***************** POWER ******************/ + +#elif defined(NPY_CPU_PPC64) || defined(NPY_CPU_PPC64LE) + +#ifdef __linux__ + #include <sys/auxv.h> + #ifndef AT_HWCAP2 + #define AT_HWCAP2 26 + #endif + #ifndef PPC_FEATURE2_ARCH_3_00 + #define PPC_FEATURE2_ARCH_3_00 0x00800000 + #endif +#endif + +static void +npy__cpu_init_features(void) +{ + memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX); +#ifdef __linux__ + unsigned int hwcap = getauxval(AT_HWCAP); + if ((hwcap & PPC_FEATURE_HAS_VSX) == 0) + return; + + hwcap = getauxval(AT_HWCAP2); + if (hwcap & PPC_FEATURE2_ARCH_3_00) + { + npy__cpu_have[NPY_CPU_FEATURE_VSX] = + npy__cpu_have[NPY_CPU_FEATURE_VSX2] = + npy__cpu_have[NPY_CPU_FEATURE_VSX3] = 1; + return; + } + npy__cpu_have[NPY_CPU_FEATURE_VSX2] = (hwcap & PPC_FEATURE2_ARCH_2_07) != 0; + npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1; +// TODO: AIX, FreeBSD +#else + npy__cpu_have[NPY_CPU_FEATURE_VSX] = 1; + #if defined(NPY_CPU_PPC64LE) || defined(NPY_HAVE_VSX2) + npy__cpu_have[NPY_CPU_FEATURE_VSX2] = 1; + #endif + #ifdef NPY_HAVE_VSX3 + npy__cpu_have[NPY_CPU_FEATURE_VSX3] = 1; + #endif +#endif +} + +/***************** ARM ******************/ + +#elif defined(__arm__) || defined(__aarch64__) + +static NPY_INLINE void +npy__cpu_init_features_arm8(void) +{ + npy__cpu_have[NPY_CPU_FEATURE_NEON] = + npy__cpu_have[NPY_CPU_FEATURE_NEON_FP16] = + npy__cpu_have[NPY_CPU_FEATURE_NEON_VFPV4] = + npy__cpu_have[NPY_CPU_FEATURE_ASIMD] = 1; +} + +#ifdef __linux__ +/* + * we aren't sure of what kind kernel or clib we deal with + * so we play it safe +*/ +#include <stdio.h> +#include <fcntl.h> + +#define NPY__HWCAP 16 +#define NPY__HWCAP2 26 + +// arch/arm/include/uapi/asm/hwcap.h +#define NPY__HWCAP_HALF (1 << 1) +#define NPY__HWCAP_NEON (1 << 12) +#define NPY__HWCAP_VFPv3 (1 << 13) +#define NPY__HWCAP_VFPv4 (1 << 16) +#define NPY__HWCAP2_AES (1 << 0) +#define NPY__HWCAP2_PMULL (1 << 1) +#define NPY__HWCAP2_SHA1 (1 << 2) +#define NPY__HWCAP2_SHA2 (1 << 3) +#define NPY__HWCAP2_CRC32 (1 << 4) +// arch/arm64/include/uapi/asm/hwcap.h +#define NPY__HWCAP_FP (1 << 0) +#define NPY__HWCAP_ASIMD (1 << 1) +#define NPY__HWCAP_FPHP (1 << 9) +#define NPY__HWCAP_ASIMDHP (1 << 10) +#define NPY__HWCAP_ASIMDDP (1 << 20) +#define NPY__HWCAP_ASIMDFHM (1 << 23) + +__attribute__((weak)) unsigned long getauxval(unsigned long); // linker should handle it +static int +npy__cpu_init_features_linux(void) +{ + unsigned long hwcap = 0, hwcap2 = 0; + if (getauxval != 0) { + hwcap = getauxval(NPY__HWCAP); + #ifdef __arm__ + hwcap2 = getauxval(NPY__HWCAP2); + #endif + } else { + unsigned long auxv[2]; + int fd = open("/proc/self/auxv", O_RDONLY); + if (fd >= 0) { + while (read(fd, &auxv, sizeof(auxv)) == sizeof(auxv)) { + if (auxv[0] == NPY__HWCAP) { + hwcap = auxv[1]; + } + #ifdef __arm__ + else if (auxv[0] == NPY__HWCAP2) { + hwcap2 = auxv[1]; + } + #endif + // detect the end + else if (auxv[0] == 0 && auxv[1] == 0) { + break; + } + } + close(fd); + } + } + if (hwcap == 0 && hwcap2 == 0) { + /* + * FIXME: failback to compiler definitions, + * BTW we can parse /proc/cpuinfo for badly patched kernels + */ + return 0; + } +#ifdef __arm__ + // Detect Arm8 (aarch32 state) + if ((hwcap2 & NPY__HWCAP2_AES) || (hwcap2 & NPY__HWCAP2_SHA1) || + (hwcap2 & NPY__HWCAP2_SHA2) || (hwcap2 & NPY__HWCAP2_PMULL) || + (hwcap2 & NPY__HWCAP2_CRC32)) +#else + if (1) +#endif + { + if (!(hwcap & (NPY__HWCAP_FP | NPY__HWCAP_ASIMD))) { + // Is this could happen? maybe disabled by kernel + // BTW this will break the baseline of AARCH64 + return 1; + } + npy__cpu_have[NPY_CPU_FEATURE_FPHP] = (hwcap & NPY__HWCAP_FPHP) != 0; + npy__cpu_have[NPY_CPU_FEATURE_ASIMDHP] = (hwcap & NPY__HWCAP_ASIMDHP) != 0; + npy__cpu_have[NPY_CPU_FEATURE_ASIMDDP] = (hwcap & NPY__HWCAP_ASIMDDP) != 0; + npy__cpu_have[NPY_CPU_FEATURE_ASIMDFHM] = (hwcap & NPY__HWCAP_ASIMDFHM) != 0; + npy__cpu_init_features_arm8(); + } else { + npy__cpu_have[NPY_CPU_FEATURE_NEON] = (hwcap & NPY__HWCAP_NEON) != 0; + npy__cpu_have[NPY_CPU_FEATURE_NEON_FP16] = (hwcap & (NPY__HWCAP_NEON | NPY__HWCAP_VFPv3 | + NPY__HWCAP_HALF)) != 0; + npy__cpu_have[NPY_CPU_FEATURE_NEON_VFPV4] = (hwcap & (NPY__HWCAP_NEON | NPY__HWCAP_VFPv4)) != 0; + } + return 1; +} +#endif + +static void +npy__cpu_init_features(void) +{ + memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX); +#ifdef __linux__ + if (npy__cpu_init_features_linux()) + return; +#endif + // We have nothing else todo +#if defined(NPY_HAVE_NEON_ARM8) || defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8) + #if defined(NPY_HAVE_FPHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + npy__cpu_have[NPY_CPU_FEATURE_FPHP] = 1; + #endif + #if defined(NPY_HAVE_ASIMDHP) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + npy__cpu_have[NPY_CPU_FEATURE_ASIMDHP] = 1; + #endif + #if defined(NPY_HAVE_ASIMDDP) || defined(__ARM_FEATURE_DOTPROD) + npy__cpu_have[NPY_CPU_FEATURE_ASIMDDP] = 1; + #endif + #if defined(NPY_HAVE_ASIMDFHM) || defined(__ARM_FEATURE_FP16FML) + npy__cpu_have[NPY_CPU_FEATURE_ASIMDFHM] = 1; + #endif + npy__cpu_init_features_arm8(); +#else + #if defined(NPY_HAVE_NEON) || defined(__ARM_NEON__) + npy__cpu_have[NPY_CPU_FEATURE_NEON] = 1; + #endif + #if defined(NPY_HAVE_NEON_FP16) || defined(__ARM_FP16_FORMAT_IEEE) || (defined(__ARM_FP) && (__ARM_FP & 2)) + npy__cpu_have[NPY_CPU_FEATURE_NEON_FP16] = npy__cpu_have[NPY_CPU_FEATURE_NEON]; + #endif + #if defined(NPY_HAVE_NEON_VFPV4) || defined(__ARM_FEATURE_FMA) + npy__cpu_have[NPY_CPU_FEATURE_NEON_VFPV4] = npy__cpu_have[NPY_CPU_FEATURE_NEON]; + #endif +#endif +} + +/*********** Unsupported ARCH ***********/ +#else +static void +npy__cpu_init_features(void) +{ +} +#endif diff --git a/numpy/core/src/common/npy_cpu_features.h b/numpy/core/src/common/npy_cpu_features.h new file mode 100644 index 000000000..0e8901328 --- /dev/null +++ b/numpy/core/src/common/npy_cpu_features.h @@ -0,0 +1,117 @@ +#ifndef _NPY_CPU_FEATURES_H_ +#define _NPY_CPU_FEATURES_H_ + +#include "numpy/numpyconfig.h" // for NPY_VISIBILITY_HIDDEN +#include <Python.h> // for PyObject + +#ifdef __cplusplus +extern "C" { +#endif + +enum npy_cpu_features +{ + NPY_CPU_FEATURE_NONE = 0, + // X86 + NPY_CPU_FEATURE_MMX = 1, + NPY_CPU_FEATURE_SSE = 2, + NPY_CPU_FEATURE_SSE2 = 3, + NPY_CPU_FEATURE_SSE3 = 4, + NPY_CPU_FEATURE_SSSE3 = 5, + NPY_CPU_FEATURE_SSE41 = 6, + NPY_CPU_FEATURE_POPCNT = 7, + NPY_CPU_FEATURE_SSE42 = 8, + NPY_CPU_FEATURE_AVX = 9, + NPY_CPU_FEATURE_F16C = 10, + NPY_CPU_FEATURE_XOP = 11, + NPY_CPU_FEATURE_FMA4 = 12, + NPY_CPU_FEATURE_FMA3 = 13, + NPY_CPU_FEATURE_AVX2 = 14, + NPY_CPU_FEATURE_FMA = 15, // AVX2 & FMA3, provides backward compatibility + + NPY_CPU_FEATURE_AVX512F = 30, + NPY_CPU_FEATURE_AVX512CD = 31, + NPY_CPU_FEATURE_AVX512ER = 32, + NPY_CPU_FEATURE_AVX512PF = 33, + NPY_CPU_FEATURE_AVX5124FMAPS = 34, + NPY_CPU_FEATURE_AVX5124VNNIW = 35, + NPY_CPU_FEATURE_AVX512VPOPCNTDQ = 36, + NPY_CPU_FEATURE_AVX512BW = 37, + NPY_CPU_FEATURE_AVX512DQ = 38, + NPY_CPU_FEATURE_AVX512VL = 39, + NPY_CPU_FEATURE_AVX512IFMA = 40, + NPY_CPU_FEATURE_AVX512VBMI = 41, + NPY_CPU_FEATURE_AVX512VNNI = 42, + NPY_CPU_FEATURE_AVX512VBMI2 = 43, + NPY_CPU_FEATURE_AVX512BITALG = 44, + + // X86 CPU Groups + // Knights Landing (F,CD,ER,PF) + NPY_CPU_FEATURE_AVX512_KNL = 101, + // Knights Mill (F,CD,ER,PF,4FMAPS,4VNNIW,VPOPCNTDQ) + NPY_CPU_FEATURE_AVX512_KNM = 102, + // Skylake-X (F,CD,BW,DQ,VL) + NPY_CPU_FEATURE_AVX512_SKX = 103, + // Cascade Lake (F,CD,BW,DQ,VL,VNNI) + NPY_CPU_FEATURE_AVX512_CLX = 104, + // Cannon Lake (F,CD,BW,DQ,VL,IFMA,VBMI) + NPY_CPU_FEATURE_AVX512_CNL = 105, + // Ice Lake (F,CD,BW,DQ,VL,IFMA,VBMI,VNNI,VBMI2,BITALG,VPOPCNTDQ) + NPY_CPU_FEATURE_AVX512_ICL = 106, + + // IBM/POWER VSX + // POWER7 + NPY_CPU_FEATURE_VSX = 200, + // POWER8 + NPY_CPU_FEATURE_VSX2 = 201, + // POWER9 + NPY_CPU_FEATURE_VSX3 = 202, + + // ARM + NPY_CPU_FEATURE_NEON = 300, + NPY_CPU_FEATURE_NEON_FP16 = 301, + // FMA + NPY_CPU_FEATURE_NEON_VFPV4 = 302, + // Advanced SIMD + NPY_CPU_FEATURE_ASIMD = 303, + // ARMv8.2 half-precision + NPY_CPU_FEATURE_FPHP = 304, + // ARMv8.2 half-precision vector arithm + NPY_CPU_FEATURE_ASIMDHP = 305, + // ARMv8.2 dot product + NPY_CPU_FEATURE_ASIMDDP = 306, + // ARMv8.2 single&half-precision multiply + NPY_CPU_FEATURE_ASIMDFHM = 307, + + NPY_CPU_FEATURE_MAX +}; + +/* + * Initialize CPU features + * return 0 on success otherwise return -1 +*/ +NPY_VISIBILITY_HIDDEN int +npy_cpu_init(void); + +/* + * return 0 if CPU feature isn't available + * note: `npy_cpu_init` must be called first otherwise it will always return 0 +*/ +NPY_VISIBILITY_HIDDEN int +npy_cpu_have(int feature_id); + +#define NPY_CPU_HAVE(FEATURE_NAME) \ +npy_cpu_have(NPY_CPU_FEATURE_##FEATURE_NAME) + +/* + * return a new dictionary contains CPU feature names + * with runtime availability. + * same as npy_cpu_have, `npy_cpu_init` must be called first. + */ +NPY_VISIBILITY_HIDDEN PyObject * +npy_cpu_features_dict(void); + +#ifdef __cplusplus +} +#endif + +#endif // _NPY_CPU_FEATURES_H_ diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index af0cef15b..46d3fc0a2 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -4387,6 +4387,11 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { PyObject *m, *d, *s; PyObject *c_api; + /* Initialize CPU features */ + if (npy_cpu_init() < 0) { + goto err; + } + /* Create the module and add the functions */ m = PyModule_Create(&moduledef); if (!m) { @@ -4512,6 +4517,16 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) { PyDict_SetItemString(d, "__version__", s); Py_DECREF(s); + s = npy_cpu_features_dict(); + if (s == NULL) { + goto err; + } + if (PyDict_SetItemString(d, "__cpu_features__", s) < 0) { + Py_DECREF(s); + goto err; + } + Py_DECREF(s); + s = NpyCapsule_FromVoidPtr((void *)_datetime_strings, NULL); if (s == NULL) { goto err; diff --git a/numpy/core/src/umath/cpuid.c b/numpy/core/src/umath/cpuid.c deleted file mode 100644 index 72c6493e8..000000000 --- a/numpy/core/src/umath/cpuid.c +++ /dev/null @@ -1,97 +0,0 @@ -#define _UMATHMODULE -#define _MULTIARRAYMODULE -#define NPY_NO_DEPRECATED_API NPY_API_VERSION - -#include <Python.h> - -#include "npy_config.h" - -#include "cpuid.h" - -#define XCR_XFEATURE_ENABLED_MASK 0x0 -#define XSTATE_SSE 0x2 -#define XSTATE_YMM 0x4 -#define XSTATE_ZMM 0x70 - -/* - * verify the OS supports avx instructions - * it can be disabled in some OS, e.g. with the nosavex boot option of linux - */ -static NPY_INLINE -int os_avx_support(void) -{ -#if HAVE_XGETBV - /* - * use bytes for xgetbv to avoid issues with compiler not knowing the - * instruction - */ - unsigned int eax, edx; - unsigned int ecx = XCR_XFEATURE_ENABLED_MASK; - __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx)); - return (eax & (XSTATE_SSE | XSTATE_YMM)) == (XSTATE_SSE | XSTATE_YMM); -#else - return 0; -#endif -} - -static NPY_INLINE -int os_avx512_support(void) -{ -#if HAVE_XGETBV - unsigned int eax, edx; - unsigned int ecx = XCR_XFEATURE_ENABLED_MASK; - unsigned int xcr0 = XSTATE_ZMM | XSTATE_YMM | XSTATE_SSE; - __asm__("xgetbv" : "=a" (eax), "=d" (edx) : "c" (ecx)); - return (eax & xcr0) == xcr0; -#else - return 0; -#endif -} - -static NPY_INLINE -int cpu_supports_fma(void) -{ -#ifdef __x86_64__ - unsigned int feature = 0x01; - unsigned int a, b, c, d; - __asm__ volatile ( - "cpuid" "\n\t" - : "=a" (a), "=b" (b), "=c" (c), "=d" (d) - : "a" (feature)); - /* - * FMA is the 12th bit of ECX - */ - return (c >> 12) & 1; -#else - return 0; -#endif -} - -/* - * Primitive cpu feature detect function - * Currently only supports checking for avx on gcc compatible compilers. - */ -NPY_NO_EXPORT int -npy_cpu_supports(const char * feature) -{ -#ifdef HAVE___BUILTIN_CPU_SUPPORTS - if (strcmp(feature, "avx512f") == 0) { -#ifdef HAVE___BUILTIN_CPU_SUPPORTS_AVX512F - return __builtin_cpu_supports("avx512f") && os_avx512_support(); -#else - return 0; -#endif - } - else if (strcmp(feature, "fma") == 0) { - return cpu_supports_fma() && __builtin_cpu_supports("avx2") && os_avx_support(); - } - else if (strcmp(feature, "avx2") == 0) { - return __builtin_cpu_supports("avx2") && os_avx_support(); - } - else if (strcmp(feature, "avx") == 0) { - return __builtin_cpu_supports("avx") && os_avx_support(); - } -#endif - - return 0; -} diff --git a/numpy/core/src/umath/cpuid.h b/numpy/core/src/umath/cpuid.h deleted file mode 100644 index 33702ed41..000000000 --- a/numpy/core/src/umath/cpuid.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _NPY_PRIVATE__CPUID_H_ -#define _NPY_PRIVATE__CPUID_H_ - -#include <numpy/ndarraytypes.h> /* for NPY_NO_EXPORT */ - -NPY_NO_EXPORT int -npy_cpu_supports(const char * feature); - -#endif |