summaryrefslogtreecommitdiff
path: root/doc/source/reference
diff options
context:
space:
mode:
authorPierre de Buyl <pdebuyl@pdebuyl.be>2021-12-21 11:28:03 +0100
committerGitHub <noreply@github.com>2021-12-21 11:28:03 +0100
commit7bc1d5100fc67abdcaf5400223386ea613d7a872 (patch)
tree4d22c2129c0edf2e615e2afe3378761cb3a6cd87 /doc/source/reference
parent831849ea5d1a9ec0ce56f28f682cd06f8da736c5 (diff)
parent7acb0fd4123673dc38aa5634b47f93770e61cfab (diff)
downloadnumpy-7bc1d5100fc67abdcaf5400223386ea613d7a872.tar.gz
Merge branch 'main' into doctest_for_pytest
Diffstat (limited to 'doc/source/reference')
-rw-r--r--doc/source/reference/c-api/array.rst56
-rw-r--r--doc/source/reference/c-api/types-and-structures.rst7
-rw-r--r--doc/source/reference/index.rst2
-rw-r--r--doc/source/reference/simd/build-options.rst376
-rw-r--r--doc/source/reference/simd/gen_features.py196
-rw-r--r--doc/source/reference/simd/generated_tables/compilers-diff.inc33
-rw-r--r--doc/source/reference/simd/generated_tables/cpu_features.inc106
-rw-r--r--doc/source/reference/simd/how-it-works.rst349
-rw-r--r--doc/source/reference/simd/index.rst43
-rw-r--r--doc/source/reference/simd/log_example.txt79
-rw-r--r--doc/source/reference/simd/simd-optimizations-tables-diff.inc37
-rw-r--r--doc/source/reference/simd/simd-optimizations-tables.inc103
-rw-r--r--doc/source/reference/simd/simd-optimizations.py190
-rw-r--r--doc/source/reference/simd/simd-optimizations.rst533
14 files changed, 1208 insertions, 902 deletions
diff --git a/doc/source/reference/c-api/array.rst b/doc/source/reference/c-api/array.rst
index bb4405825..fb8acffe6 100644
--- a/doc/source/reference/c-api/array.rst
+++ b/doc/source/reference/c-api/array.rst
@@ -127,8 +127,7 @@ and its sub-types).
your own memory, you should use the function :c:func:`PyArray_SetBaseObject`
to set the base to an object which owns the memory.
- If the (deprecated) :c:data:`NPY_ARRAY_UPDATEIFCOPY` or the
- :c:data:`NPY_ARRAY_WRITEBACKIFCOPY` flags are set, it has a different
+ If the :c:data:`NPY_ARRAY_WRITEBACKIFCOPY` flag is set, it has a different
meaning, namely base is the array into which the current array will
be copied upon copy resolution. This overloading of the base property
for two functions is likely to change in a future version of NumPy.
@@ -237,8 +236,7 @@ From scratch
If *data* is not ``NULL``, then it is assumed to point to the memory
to be used for the array and the *flags* argument is used as the
new flags for the array (except the state of :c:data:`NPY_ARRAY_OWNDATA`,
- :c:data:`NPY_ARRAY_WRITEBACKIFCOPY` and :c:data:`NPY_ARRAY_UPDATEIFCOPY`
- flags of the new array will be reset).
+ :c:data:`NPY_ARRAY_WRITEBACKIFCOPY` flag of the new array will be reset).
In addition, if *data* is non-NULL, then *strides* can
also be provided. If *strides* is ``NULL``, then the array strides
@@ -487,13 +485,6 @@ From other objects
will be made writeable again. If *op* is not writeable to begin
with, or if it is not already an array, then an error is raised.
- .. c:macro:: NPY_ARRAY_UPDATEIFCOPY
-
- Deprecated. Use :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, which is similar.
- This flag "automatically" copies the data back when the returned
- array is deallocated, which is not supported in all python
- implementations.
-
.. c:macro:: NPY_ARRAY_BEHAVED
:c:data:`NPY_ARRAY_ALIGNED` \| :c:data:`NPY_ARRAY_WRITEABLE`
@@ -550,14 +541,12 @@ From other objects
.. c:macro:: NPY_ARRAY_INOUT_ARRAY
:c:data:`NPY_ARRAY_C_CONTIGUOUS` \| :c:data:`NPY_ARRAY_WRITEABLE` \|
- :c:data:`NPY_ARRAY_ALIGNED` \| :c:data:`NPY_ARRAY_WRITEBACKIFCOPY` \|
- :c:data:`NPY_ARRAY_UPDATEIFCOPY`
+ :c:data:`NPY_ARRAY_ALIGNED` \| :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`
.. c:macro:: NPY_ARRAY_INOUT_FARRAY
:c:data:`NPY_ARRAY_F_CONTIGUOUS` \| :c:data:`NPY_ARRAY_WRITEABLE` \|
- :c:data:`NPY_ARRAY_ALIGNED` \| :c:data:`NPY_ARRAY_WRITEBACKIFCOPY` \|
- :c:data:`NPY_ARRAY_UPDATEIFCOPY`
+ :c:data:`NPY_ARRAY_ALIGNED` \| :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`
.. c:function:: int PyArray_GetArrayParamsFromObject( \
PyObject* op, PyArray_Descr* requested_dtype, npy_bool writeable, \
@@ -773,8 +762,7 @@ From other objects
:c:data:`NPY_ARRAY_C_CONTIGUOUS`, :c:data:`NPY_ARRAY_F_CONTIGUOUS`,
:c:data:`NPY_ARRAY_ALIGNED`, :c:data:`NPY_ARRAY_WRITEABLE`,
:c:data:`NPY_ARRAY_NOTSWAPPED`, :c:data:`NPY_ARRAY_ENSURECOPY`,
- :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, :c:data:`NPY_ARRAY_UPDATEIFCOPY`,
- :c:data:`NPY_ARRAY_FORCECAST`, and
+ :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, :c:data:`NPY_ARRAY_FORCECAST`, and
:c:data:`NPY_ARRAY_ENSUREARRAY`. Standard combinations of flags can also
be used:
@@ -1375,15 +1363,6 @@ Special functions for NPY_OBJECT
decrement all the items in the object array prior to calling this
function.
-.. c:function:: int PyArray_SetUpdateIfCopyBase(PyArrayObject* arr, PyArrayObject* base)
-
- Precondition: ``arr`` is a copy of ``base`` (though possibly with different
- strides, ordering, etc.) Set the UPDATEIFCOPY flag and ``arr->base`` so
- that when ``arr`` is destructed, it will copy any changes back to ``base``.
- DEPRECATED, use :c:func:`PyArray_SetWritebackIfCopyBase`.
-
- Returns 0 for success, -1 for failure.
-
.. c:function:: int PyArray_SetWritebackIfCopyBase(PyArrayObject* arr, PyArrayObject* base)
Precondition: ``arr`` is a copy of ``base`` (though possibly with different
@@ -1496,14 +1475,6 @@ of the constant names is deprecated in 1.7.
would have returned an error because :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`
would not have been possible.
-.. c:macro:: NPY_ARRAY_UPDATEIFCOPY
-
- A deprecated version of :c:data:`NPY_ARRAY_WRITEBACKIFCOPY` which
- depends upon ``dealloc`` to trigger the writeback. For backwards
- compatibility, :c:func:`PyArray_ResolveWritebackIfCopy` is called at
- ``dealloc`` but relying
- on that behavior is deprecated and not supported in PyPy.
-
:c:func:`PyArray_UpdateFlags` (obj, flags) will update the ``obj->flags``
for ``flags`` which can be any of :c:data:`NPY_ARRAY_C_CONTIGUOUS`,
:c:data:`NPY_ARRAY_F_CONTIGUOUS`, :c:data:`NPY_ARRAY_ALIGNED`, or
@@ -1575,8 +1546,7 @@ For all of these macros *arr* must be an instance of a (subclass of)
combinations of the possible flags an array can have:
:c:data:`NPY_ARRAY_C_CONTIGUOUS`, :c:data:`NPY_ARRAY_F_CONTIGUOUS`,
:c:data:`NPY_ARRAY_OWNDATA`, :c:data:`NPY_ARRAY_ALIGNED`,
- :c:data:`NPY_ARRAY_WRITEABLE`, :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`,
- :c:data:`NPY_ARRAY_UPDATEIFCOPY`.
+ :c:data:`NPY_ARRAY_WRITEABLE`, :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`.
.. c:function:: int PyArray_IS_C_CONTIGUOUS(PyObject *arr)
@@ -2765,7 +2735,7 @@ Array mapping is the machinery behind advanced indexing.
has memory overlap with any of the arrays in ``index`` and with
``extra_op``, and make copies as appropriate to avoid problems if the
input is modified during the iteration. ``iter->array`` may contain a
- copied array (UPDATEIFCOPY/WRITEBACKIFCOPY set).
+ copied array (WRITEBACKIFCOPY set).
Array Scalars
-------------
@@ -3377,8 +3347,8 @@ Memory management
.. c:function:: int PyArray_ResolveWritebackIfCopy(PyArrayObject* obj)
- If ``obj.flags`` has :c:data:`NPY_ARRAY_WRITEBACKIFCOPY` or (deprecated)
- :c:data:`NPY_ARRAY_UPDATEIFCOPY`, this function clears the flags, `DECREF` s
+ If ``obj.flags`` has :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, this function
+ clears the flags, `DECREF` s
`obj->base` and makes it writeable, and sets ``obj->base`` to NULL. It then
copies ``obj->data`` to `obj->base->data`, and returns the error state of
the copy operation. This is the opposite of
@@ -3609,8 +3579,8 @@ Miscellaneous Macros
.. c:function:: void PyArray_DiscardWritebackIfCopy(PyObject* obj)
- If ``obj.flags`` has :c:data:`NPY_ARRAY_WRITEBACKIFCOPY` or (deprecated)
- :c:data:`NPY_ARRAY_UPDATEIFCOPY`, this function clears the flags, `DECREF` s
+ If ``obj.flags`` has :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, this function
+ clears the flags, `DECREF` s
`obj->base` and makes it writeable, and sets ``obj->base`` to NULL. In
contrast to :c:func:`PyArray_ResolveWritebackIfCopy` it makes no attempt
to copy the data from `obj->base`. This undoes
@@ -3623,8 +3593,8 @@ Miscellaneous Macros
Deprecated in 1.14, use :c:func:`PyArray_DiscardWritebackIfCopy`
followed by ``Py_XDECREF``
- DECREF's an array object which may have the (deprecated)
- :c:data:`NPY_ARRAY_UPDATEIFCOPY` or :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`
+ DECREF's an array object which may have the
+ :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`
flag set without causing the contents to be copied back into the
original array. Resets the :c:data:`NPY_ARRAY_WRITEABLE` flag on the base
object. This is useful for recovering from an error condition when
diff --git a/doc/source/reference/c-api/types-and-structures.rst b/doc/source/reference/c-api/types-and-structures.rst
index 1ea47b498..34437bd30 100644
--- a/doc/source/reference/c-api/types-and-structures.rst
+++ b/doc/source/reference/c-api/types-and-structures.rst
@@ -144,9 +144,8 @@ PyArray_Type and PyArrayObject
- If this array does not own its own memory, then base points to the
Python object that owns it (perhaps another array object)
- - If this array has the (deprecated) :c:data:`NPY_ARRAY_UPDATEIFCOPY` or
- :c:data:`NPY_ARRAY_WRITEBACKIFCOPY` flag set, then this array is a working
- copy of a "misbehaved" array.
+ - If this array has the :c:data:`NPY_ARRAY_WRITEBACKIFCOPY` flag set,
+ then this array is a working copy of a "misbehaved" array.
When ``PyArray_ResolveWritebackIfCopy`` is called, the array pointed to
by base will be updated with the contents of this array.
@@ -169,7 +168,7 @@ PyArray_Type and PyArrayObject
interpreted. Possible flags are :c:data:`NPY_ARRAY_C_CONTIGUOUS`,
:c:data:`NPY_ARRAY_F_CONTIGUOUS`, :c:data:`NPY_ARRAY_OWNDATA`,
:c:data:`NPY_ARRAY_ALIGNED`, :c:data:`NPY_ARRAY_WRITEABLE`,
- :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`, and :c:data:`NPY_ARRAY_UPDATEIFCOPY`.
+ and :c:data:`NPY_ARRAY_WRITEBACKIFCOPY`.
.. c:member:: PyObject *weakreflist
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
index a18211cca..24bb6665d 100644
--- a/doc/source/reference/index.rst
+++ b/doc/source/reference/index.rst
@@ -26,7 +26,7 @@ For learning how to use NumPy, see the :ref:`complete documentation <numpy_docs_
distutils
distutils_guide
c-api/index
- simd/simd-optimizations
+ simd/index
swig
diff --git a/doc/source/reference/simd/build-options.rst b/doc/source/reference/simd/build-options.rst
new file mode 100644
index 000000000..0a40d3ff5
--- /dev/null
+++ b/doc/source/reference/simd/build-options.rst
@@ -0,0 +1,376 @@
+*****************
+CPU build options
+*****************
+
+Description
+-----------
+
+The following options are mainly used to change the default behavior of optimizations
+that target certain CPU features:
+
+- ``--cpu-baseline``: minimal set of required CPU features.
+ Default value is ``min`` which provides the minimum CPU features that can
+ safely run on a wide range of platforms within the processor family.
+
+ .. note::
+
+ At runtime, NumPy modules will fail to load if any of the specified features
+ are not supported by the target CPU (raises Python runtime error).
+
+- ``--cpu-dispatch``: dispatched set of additional CPU features.
+ Default value is ``max -xop -fma4`` which enables all CPU
+ features, except for AMD legacy features (in case of X86).
+
+ .. note::
+
+ During the runtime, NumPy modules will skip any specified features
+ that are not available in the target CPU.
+
+These options are accessible through :py:mod:`distutils` commands
+`distutils.command.build`, `distutils.command.build_clib` and
+`distutils.command.build_ext`.
+They accept a set of :ref:`CPU features <opt-supported-features>`
+or groups of features that gather several features or
+:ref:`special options <opt-special-options>` that
+perform a series of procedures.
+
+.. note::
+
+ If ``build_clib`` or ``build_ext`` are not specified by the user,
+ the arguments of ``build`` will be used instead, which also holds the default values.
+
+To customize both ``build_ext`` and ``build_clib``::
+
+ cd /path/to/numpy
+ python setup.py build --cpu-baseline="avx2 fma3" install --user
+
+To customize only ``build_ext``::
+
+ cd /path/to/numpy
+ python setup.py build_ext --cpu-baseline="avx2 fma3" install --user
+
+To customize only ``build_clib``::
+
+ cd /path/to/numpy
+ python setup.py build_clib --cpu-baseline="avx2 fma3" install --user
+
+You can also customize CPU/build options through PIP command::
+
+ pip install --no-use-pep517 --global-option=build \
+ --global-option="--cpu-baseline=avx2 fma3" \
+ --global-option="--cpu-dispatch=max" ./
+
+Quick Start
+-----------
+
+In general, the default settings tend to not impose certain CPU features that
+may not be available on some older processors. Raising the ceiling of the
+baseline features will often improve performance and may also reduce
+binary size.
+
+
+The following are the most common scenarios that may require changing
+the default settings:
+
+
+I am building NumPy for my local use
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+And I do not intend to export the build to other users or target a
+different CPU than what the host has.
+
+Set `native` for baseline, or manually specify the CPU features in case option
+`native` isn't supported by your platform::
+
+ python setup.py build --cpu-baseline="native" bdist
+
+Building NumPy with extra CPU features isn't necessary for this case,
+since all supported features are already defined within the baseline features::
+
+ python setup.py build --cpu-baseline=native --cpu-dispatch=none bdist
+
+.. note::
+
+ A fatal error will be raised if `native` isn't supported by the host platform.
+
+I do not want to support the old processors of the `x86` architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since most of the CPUs nowadays support at least `AVX`, `F16C` features, you can use::
+
+ python setup.py build --cpu-baseline="avx f16c" bdist
+
+.. note::
+
+ ``--cpu-baseline`` always combines all implied features, so there's no need
+ to add SSE features.
+
+
+I'm facing the same case above but with `ppc64` architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Then raise the ceiling of the baseline features to Power8::
+
+ python setup.py build --cpu-baseline="vsx2" bdist
+
+Having issues with `AVX512` features?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You may have some reservations about including `AVX512` or
+any other CPU feature and want to exclude it from the dispatched features::
+
+ python setup.py build --cpu-dispatch="max -avx512f -avx512cd \
+ -avx512_knl -avx512_knm -avx512_skx -avx512_clx -avx512_cnl -avx512_icl" \
+ bdist
+
+.. _opt-supported-features:
+
+Supported Features
+------------------
+
+The names of the features can express one feature or a group of features,
+as shown in the following tables, which are sorted from the lowest to the highest interest:
+
+.. note::
+
+ The following features may not be supported by all compilers,
+ also some compilers may produce different set of implied features
+ when it comes to features like ``AVX512``, ``AVX2``, and ``FMA3``.
+ See :ref:`opt-platform-differences` for more details.
+
+.. include:: generated_tables/cpu_features.inc
+
+.. _opt-special-options:
+
+Special Options
+---------------
+
+- ``NONE``: enable no features.
+
+- ``NATIVE``: Enables all CPU features that are supported by the host CPU;
+ this operation is based on the compiler flags (``-march=native``, ``-xHost``, ``/QxHost``).
+
+- ``MIN``: Enables the minimum CPU features that can safely run on a wide range of platforms:
+
+ .. table::
+ :align: left
+
+ ====================================== =======================================
+ For Arch Implies
+ ====================================== =======================================
+ x86 (32-bit mode) ``SSE`` ``SSE2``
+ x86_64 ``SSE`` ``SSE2`` ``SSE3``
+ IBM/POWER (big-endian mode) ``NONE``
+ IBM/POWER (little-endian mode) ``VSX`` ``VSX2``
+ ARMHF ``NONE``
+ ARM64 A.K. AARCH64 ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
+ ``ASIMD``
+ IBM/ZSYSTEM(S390X) ``NONE``
+ ====================================== =======================================
+
+- ``MAX``: Enables all supported CPU features by the compiler and platform.
+
+- ``Operators-/+``: remove or add features, useful with options ``MAX``, ``MIN`` and ``NATIVE``.
+
+Behaviors
+---------
+
+- CPU features and other options are case-insensitive, for example::
+
+ python setup.py build --cpu-dispatch="SSE41 avx2 FMA3"
+
+- The order of the requested optimizations doesn't matter::
+
+ python setup.py build --cpu-dispatch="SSE41 AVX2 FMA3"
+ # equivalent to
+ python setup.py build --cpu-dispatch="FMA3 AVX2 SSE41"
+
+- Either commas or spaces or '+' can be used as a separator,
+ for example::
+
+ python setup.py build --cpu-dispatch="avx2 avx512f"
+ # or
+ python setup.py build --cpu-dispatch=avx2,avx512f
+ # or
+ python setup.py build --cpu-dispatch="avx2+avx512f"
+
+ All of these work, but arguments must be enclosed in quotes or escaped
+ by backslash if any spaces are used.
+
+- ``--cpu-baseline`` combines all implied CPU features, for example::
+
+ python setup.py build --cpu-baseline=sse42
+ # equivalent to
+ python setup.py build --cpu-baseline="sse sse2 sse3 ssse3 sse41 popcnt sse42"
+
+- ``--cpu-baseline`` will be treated as "native" if compiler native flag
+ ``-march=native`` or ``-xHost`` or ``/QxHost`` is enabled through environment variable
+ `CFLAGS`::
+
+ export CFLAGS="-march=native"
+ python setup.py install --user
+ # is equivalent to
+ python setup.py build --cpu-baseline=native install --user
+
+- ``--cpu-baseline`` skips any specified features that aren't supported
+ by the target platform or compiler rather than raising fatal errors.
+
+ .. note::
+
+ Since ``--cpu-baseline`` combines all implied features, the largest
+ supported subset of the implied features will be enabled rather than skipping all of them.
+ For example::
+
+ # Requesting `AVX2,FMA3` but the compiler only supports **SSE** features
+ python setup.py build --cpu-baseline="avx2 fma3"
+ # is equivalent to
+ python setup.py build --cpu-baseline="sse sse2 sse3 ssse3 sse41 popcnt sse42"
+
+- ``--cpu-dispatch`` does not combine any of the implied CPU features,
+ so you must add them unless you want to disable one or all of them::
+
+ # Only dispatches AVX2 and FMA3
+ python setup.py build --cpu-dispatch=avx2,fma3
+ # Dispatches AVX and SSE features
+ python setup.py build --cpu-dispatch=ssse3,sse41,sse42,avx,avx2,fma3
+
+- ``--cpu-dispatch`` skips any specified baseline features and also skips
+ any features not supported by the target platform or compiler without raising
+ fatal errors.
+
+Eventually, you should always check the final report through the build log
+to verify the enabled features. See :ref:`opt-build-report` for more details.
+
+.. _opt-platform-differences:
+
+Platform differences
+--------------------
+
+Some exceptional conditions force us to link some features together when it comes to
+certain compilers or architectures, resulting in the impossibility of building them separately.
+
+These conditions can be divided into two parts, as follows:
+
+**Architectural compatibility**
+
+The need to align certain CPU features that are assured to be supported by
+successive generations of the same architecture, some cases:
+
+- On ppc64le ``VSX(ISA 2.06)`` and ``VSX2(ISA 2.07)`` both imply one another since the
+ first generation that supports little-endian mode is Power-8`(ISA 2.07)`
+- On AArch64 ``NEON NEON_FP16 NEON_VFPV4 ASIMD`` imply each other since they are part of the
+ hardware baseline.
+
+For example::
+
+ # On ARMv8/A64, specifying NEON is going to enable Advanced SIMD
+ # and all predecessor extensions
+ python setup.py build --cpu-baseline=neon
+ # which is equivalent to
+ python setup.py build --cpu-baseline="neon neon_fp16 neon_vfpv4 asimd"
+
+.. note::
+
+ Please take a deep look at :ref:`opt-supported-features`,
+ in order to determine the features that imply one another.
+
+**Compilation compatibility**
+
+Some compilers don't provide independent support for all CPU features. For instance
+**Intel**'s compiler doesn't provide separated flags for ``AVX2`` and ``FMA3``,
+it makes sense since all Intel CPUs that come with ``AVX2`` also support ``FMA3``,
+but this approach is incompatible with other **x86** CPUs from **AMD** or **VIA**.
+
+For example::
+
+ # Specifying AVX2 will force-enable FMA3 on Intel compilers
+ python setup.py build --cpu-baseline=avx2
+ # which is equivalent to
+ python setup.py build --cpu-baseline="avx2 fma3"
+
+
+The following tables only show the differences imposed by some compilers from the
+general context that has been shown in the :ref:`opt-supported-features` tables:
+
+.. note::
+
+ Features names with strikeout represent the unsupported CPU features.
+
+.. raw:: html
+
+ <style>
+ .enabled-feature {color:green; font-weight:bold;}
+ .disabled-feature {color:red; text-decoration: line-through;}
+ </style>
+
+.. role:: enabled
+ :class: enabled-feature
+
+.. role:: disabled
+ :class: disabled-feature
+
+.. include:: generated_tables/compilers-diff.inc
+
+.. _opt-build-report:
+
+Build report
+------------
+
+In most cases, the CPU build options do not produce any fatal errors that lead to halting the build.
+Most of the errors that may appear in the build log serve as heavy warnings due to the lack of some
+expected CPU features by the compiler.
+
+So we strongly recommend checking the final report log, to be aware of what kind of CPU features
+are enabled and what are not.
+
+You can find the final report of CPU optimizations at the end of the build log,
+and here is how it looks on x86_64/gcc:
+
+.. raw:: html
+
+ <style>#build-report .highlight-bash pre{max-height:450px; overflow-y: scroll;}</style>
+
+.. literalinclude:: log_example.txt
+ :language: bash
+
+As you see, there is a separate report for each of ``build_ext`` and ``build_clib``
+that includes several sections, and each section has several values, representing the following:
+
+**Platform**:
+
+- :enabled:`Architecture`: The architecture name of target CPU. It should be one of
+ ``x86``, ``x64``, ``ppc64``, ``ppc64le``, ``armhf``, ``aarch64``, ``s390x`` or ``unknown``.
+
+- :enabled:`Compiler`: The compiler name. It should be one of
+ gcc, clang, msvc, icc, iccw or unix-like.
+
+**CPU baseline**:
+
+- :enabled:`Requested`: The specific features and options to ``--cpu-baseline`` as-is.
+- :enabled:`Enabled`: The final set of enabled CPU features.
+- :enabled:`Flags`: The compiler flags that were used to compile all NumPy `C/C++` sources
+ during the compilation except for temporary sources that have been used for generating
+ the binary objects of dispatched features.
+- :enabled:`Extra checks`: list of internal checks that activate certain functionality
+ or intrinsics related to the enabled features, useful for debugging when it comes
+ to developing SIMD kernels.
+
+**CPU dispatch**:
+
+- :enabled:`Requested`: The specific features and options to ``--cpu-dispatch`` as-is.
+- :enabled:`Enabled`: The final set of enabled CPU features.
+- :enabled:`Generated`: At the beginning of the next row of this property,
+ the features for which optimizations have been generated are shown in the
+ form of several sections with similar properties explained as follows:
+
+ - :enabled:`One or multiple dispatched feature`: The implied CPU features.
+ - :enabled:`Flags`: The compiler flags that have been used for these features.
+ - :enabled:`Extra checks`: Similar to the baseline but for these dispatched features.
+ - :enabled:`Detect`: Set of CPU features that need to be detected at runtime in order to
+ execute the generated optimizations.
+ - The lines that come after the above property and end with a ':' on a separate line,
+ represent the paths of c/c++ sources that define the generated optimizations.
+
+Runtime Trace
+-------------
+To be completed.
diff --git a/doc/source/reference/simd/gen_features.py b/doc/source/reference/simd/gen_features.py
new file mode 100644
index 000000000..9a38ef5c9
--- /dev/null
+++ b/doc/source/reference/simd/gen_features.py
@@ -0,0 +1,196 @@
+"""
+Generate CPU features tables from CCompilerOpt
+"""
+from os import sys, path
+from numpy.distutils.ccompiler_opt import CCompilerOpt
+
+class FakeCCompilerOpt(CCompilerOpt):
+ # disable caching no need for it
+ conf_nocache = True
+
+ def __init__(self, arch, cc, *args, **kwargs):
+ self.fake_info = (arch, cc, '')
+ CCompilerOpt.__init__(self, None, **kwargs)
+
+ def dist_compile(self, sources, flags, **kwargs):
+ return sources
+
+ def dist_info(self):
+ return self.fake_info
+
+ @staticmethod
+ def dist_log(*args, stderr=False):
+ # avoid printing
+ pass
+
+ def feature_test(self, name, force_flags=None, macros=[]):
+ # To speed up
+ return True
+
+class Features:
+ def __init__(self, arch, cc):
+ self.copt = FakeCCompilerOpt(arch, cc, cpu_baseline="max")
+
+ def names(self):
+ return self.copt.cpu_baseline_names()
+
+ def serialize(self, features_names):
+ result = []
+ for f in self.copt.feature_sorted(features_names):
+ gather = self.copt.feature_supported.get(f, {}).get("group", [])
+ implies = self.copt.feature_sorted(self.copt.feature_implies(f))
+ result.append((f, implies, gather))
+ return result
+
+ def table(self, **kwargs):
+ return self.gen_table(self.serialize(self.names()), **kwargs)
+
+ def table_diff(self, vs, **kwargs):
+ fnames = set(self.names())
+ fnames_vs = set(vs.names())
+ common = fnames.intersection(fnames_vs)
+ extra = fnames.difference(fnames_vs)
+ notavl = fnames_vs.difference(fnames)
+ iextra = {}
+ inotavl = {}
+ idiff = set()
+ for f in common:
+ implies = self.copt.feature_implies(f)
+ implies_vs = vs.copt.feature_implies(f)
+ e = implies.difference(implies_vs)
+ i = implies_vs.difference(implies)
+ if not i and not e:
+ continue
+ if e:
+ iextra[f] = e
+ if i:
+ inotavl[f] = e
+ idiff.add(f)
+
+ def fbold(f):
+ if f in extra:
+ return f':enabled:`{f}`'
+ if f in notavl:
+ return f':disabled:`{f}`'
+ return f
+
+ def fbold_implies(f, i):
+ if i in iextra.get(f, {}):
+ return f':enabled:`{i}`'
+ if f in notavl or i in inotavl.get(f, {}):
+ return f':disabled:`{i}`'
+ return i
+
+ diff_all = self.serialize(idiff.union(extra))
+ diff_all += vs.serialize(notavl)
+ content = self.gen_table(
+ diff_all, fstyle=fbold, fstyle_implies=fbold_implies, **kwargs
+ )
+ return content
+
+ def gen_table(self, serialized_features, fstyle=None, fstyle_implies=None,
+ **kwargs):
+
+ if fstyle is None:
+ fstyle = lambda ft: f'``{ft}``'
+ if fstyle_implies is None:
+ fstyle_implies = lambda origin, ft: fstyle(ft)
+
+ rows = []
+ have_gather = False
+ for f, implies, gather in serialized_features:
+ if gather:
+ have_gather = True
+ name = fstyle(f)
+ implies = ' '.join([fstyle_implies(f, i) for i in implies])
+ gather = ' '.join([fstyle_implies(f, i) for i in gather])
+ rows.append((name, implies, gather))
+ if not rows:
+ return ''
+ fields = ["Name", "Implies", "Gathers"]
+ if not have_gather:
+ del fields[2]
+ rows = [(name, implies) for name, implies, _ in rows]
+ return self.gen_rst_table(fields, rows, **kwargs)
+
+ def gen_rst_table(self, field_names, rows, tab_size=4):
+ assert(not rows or len(field_names) == len(rows[0]))
+ rows.append(field_names)
+ fld_len = len(field_names)
+ cls_len = [max(len(c[i]) for c in rows) for i in range(fld_len)]
+ del rows[-1]
+ cformat = ' '.join('{:<%d}' % i for i in cls_len)
+ border = cformat.format(*['='*i for i in cls_len])
+
+ rows = [cformat.format(*row) for row in rows]
+ # header
+ rows = [border, cformat.format(*field_names), border] + rows
+ # footer
+ rows += [border]
+ # add left margin
+ rows = [(' ' * tab_size) + r for r in rows]
+ return '\n'.join(rows)
+
+def wrapper_section(title, content, tab_size=4):
+ tab = ' '*tab_size
+ if content:
+ return (
+ f"{title}\n{'~'*len(title)}"
+ f"\n.. table::\n{tab}:align: left\n\n"
+ f"{content}\n\n"
+ )
+ return ''
+
+def wrapper_tab(title, table, tab_size=4):
+ tab = ' '*tab_size
+ if table:
+ ('\n' + tab).join((
+ '.. tab:: ' + title,
+ tab + '.. table::',
+ tab + 'align: left',
+ table + '\n\n'
+ ))
+ return ''
+
+
+if __name__ == '__main__':
+
+ pretty_names = {
+ "PPC64": "IBM/POWER big-endian",
+ "PPC64LE": "IBM/POWER little-endian",
+ "S390X": "IBM/ZSYSTEM(S390X)",
+ "ARMHF": "ARMv7/A32",
+ "AARCH64": "ARMv8/A64",
+ "ICC": "Intel Compiler",
+ # "ICCW": "Intel Compiler msvc-like",
+ "MSVC": "Microsoft Visual C/C++"
+ }
+ gen_path = path.join(
+ path.dirname(path.realpath(__file__)), "generated_tables"
+ )
+ with open(path.join(gen_path, 'cpu_features.inc'), 'wt') as fd:
+ fd.write(f'.. generated via {__file__}\n\n')
+ for arch in (
+ ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64", "S390X")
+ ):
+ title = "On " + pretty_names.get(arch, arch)
+ table = Features(arch, 'gcc').table()
+ fd.write(wrapper_section(title, table))
+
+ with open(path.join(gen_path, 'compilers-diff.inc'), 'wt') as fd:
+ fd.write(f'.. generated via {__file__}\n\n')
+ for arch, cc_names in (
+ ("x86", ("clang", "ICC", "MSVC")),
+ ("PPC64", ("clang",)),
+ ("PPC64LE", ("clang",)),
+ ("ARMHF", ("clang",)),
+ ("AARCH64", ("clang",)),
+ ("S390X", ("clang",))
+ ):
+ arch_pname = pretty_names.get(arch, arch)
+ for cc in cc_names:
+ title = f"On {arch_pname}::{pretty_names.get(cc, cc)}"
+ table = Features(arch, cc).table_diff(Features(arch, "gcc"))
+ fd.write(wrapper_section(title, table))
+
+
diff --git a/doc/source/reference/simd/generated_tables/compilers-diff.inc b/doc/source/reference/simd/generated_tables/compilers-diff.inc
new file mode 100644
index 000000000..4b9009a68
--- /dev/null
+++ b/doc/source/reference/simd/generated_tables/compilers-diff.inc
@@ -0,0 +1,33 @@
+.. generated via /home/seiko/work/repos/numpy/doc/source/reference/simd/./gen_features.py
+
+On x86::Intel Compiler
+~~~~~~~~~~~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ================ ==========================================================================================================================================
+ Name Implies
+ ================ ==========================================================================================================================================
+ FMA3 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2`
+ AVX2 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3`
+ AVX512F SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD`
+ :disabled:`XOP` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX`
+ :disabled:`FMA4` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX`
+ ================ ==========================================================================================================================================
+
+On x86::Microsoft Visual C/C++
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ====================== ============================================================================================================================================================================================================================================================= =============================================================================
+ Name Implies Gathers
+ ====================== ============================================================================================================================================================================================================================================================= =============================================================================
+ FMA3 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`AVX2`
+ AVX2 SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C :enabled:`FMA3`
+ AVX512F SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 :enabled:`AVX512CD` :enabled:`AVX512_SKX`
+ AVX512CD SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F :enabled:`AVX512_SKX`
+ :disabled:`AVX512_KNL` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512ER` :disabled:`AVX512PF`
+ :disabled:`AVX512_KNM` :disabled:`SSE` :disabled:`SSE2` :disabled:`SSE3` :disabled:`SSSE3` :disabled:`SSE41` :disabled:`POPCNT` :disabled:`SSE42` :disabled:`AVX` :disabled:`F16C` :disabled:`FMA3` :disabled:`AVX2` :disabled:`AVX512F` :disabled:`AVX512CD` :disabled:`AVX512_KNL` :disabled:`AVX5124FMAPS` :disabled:`AVX5124VNNIW` :disabled:`AVX512VPOPCNTDQ`
+ ====================== ============================================================================================================================================================================================================================================================= =============================================================================
+
diff --git a/doc/source/reference/simd/generated_tables/cpu_features.inc b/doc/source/reference/simd/generated_tables/cpu_features.inc
new file mode 100644
index 000000000..17d1b4951
--- /dev/null
+++ b/doc/source/reference/simd/generated_tables/cpu_features.inc
@@ -0,0 +1,106 @@
+.. generated via /home/seiko/work/repos/review/numpy/doc/source/reference/simd/gen_features.py
+
+On x86
+~~~~~~
+.. table::
+ :align: left
+
+ ============== =========================================================================================================================================================================== =====================================================
+ Name Implies Gathers
+ ============== =========================================================================================================================================================================== =====================================================
+ ``SSE`` ``SSE2``
+ ``SSE2`` ``SSE``
+ ``SSE3`` ``SSE`` ``SSE2``
+ ``SSSE3`` ``SSE`` ``SSE2`` ``SSE3``
+ ``SSE41`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3``
+ ``POPCNT`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41``
+ ``SSE42`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT``
+ ``AVX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42``
+ ``XOP`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
+ ``FMA4`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
+ ``F16C`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
+ ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``
+ ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``
+ ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2``
+ ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F``
+ ``AVX512_KNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512ER`` ``AVX512PF``
+ ``AVX512_KNM`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_KNL`` ``AVX5124FMAPS`` ``AVX5124VNNIW`` ``AVX512VPOPCNTDQ``
+ ``AVX512_SKX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512VL`` ``AVX512BW`` ``AVX512DQ``
+ ``AVX512_CLX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512VNNI``
+ ``AVX512_CNL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512IFMA`` ``AVX512VBMI``
+ ``AVX512_ICL`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL`` ``AVX512VBMI2`` ``AVX512BITALG`` ``AVX512VPOPCNTDQ``
+ ============== =========================================================================================================================================================================== =====================================================
+
+On IBM/POWER big-endian
+~~~~~~~~~~~~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ======== ================
+ Name Implies
+ ======== ================
+ ``VSX``
+ ``VSX2`` ``VSX``
+ ``VSX3`` ``VSX`` ``VSX2``
+ ======== ================
+
+On IBM/POWER little-endian
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ======== ================
+ Name Implies
+ ======== ================
+ ``VSX`` ``VSX2``
+ ``VSX2`` ``VSX``
+ ``VSX3`` ``VSX`` ``VSX2``
+ ======== ================
+
+On ARMv7/A32
+~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ============== ===========================================================
+ Name Implies
+ ============== ===========================================================
+ ``NEON``
+ ``NEON_FP16`` ``NEON``
+ ``NEON_VFPV4`` ``NEON`` ``NEON_FP16``
+ ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
+ ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
+ ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
+ ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP``
+ ============== ===========================================================
+
+On ARMv8/A64
+~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ============== ===========================================================
+ Name Implies
+ ============== ===========================================================
+ ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
+ ``NEON_FP16`` ``NEON`` ``NEON_VFPV4`` ``ASIMD``
+ ``NEON_VFPV4`` ``NEON`` ``NEON_FP16`` ``ASIMD``
+ ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
+ ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
+ ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
+ ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP``
+ ============== ===========================================================
+
+On IBM/ZSYSTEM(S390X)
+~~~~~~~~~~~~~~~~~~~~~
+.. table::
+ :align: left
+
+ ======== ==============
+ Name Implies
+ ======== ==============
+ ``VX``
+ ``VXE`` ``VX``
+ ``VXE2`` ``VX`` ``VXE``
+ ======== ==============
+
diff --git a/doc/source/reference/simd/how-it-works.rst b/doc/source/reference/simd/how-it-works.rst
new file mode 100644
index 000000000..a2882f484
--- /dev/null
+++ b/doc/source/reference/simd/how-it-works.rst
@@ -0,0 +1,349 @@
+**********************************
+How does the CPU dispatcher work?
+**********************************
+
+The NumPy dispatcher is based on multi-source compiling, which means taking
+a certain source and compiling it multiple times with different compiler
+flags and also with different **C** definitions that affect the code
+paths. This enables certain instruction-sets for each compiled object
+depending on the required optimizations and ends with linking the
+returned objects together.
+
+.. figure:: ../figures/opt-infra.png
+
+This mechanism should support all compilers and it doesn't require any
+compiler-specific extension, but at the same time it adds a few steps to
+normal compilation that are explained as follows.
+
+1- Configuration
+~~~~~~~~~~~~~~~~
+
+Before building the source files, the user configures the required
+optimizations via the two command arguments explained above:
+
+- ``--cpu-baseline``: minimal set of required optimizations.
+
+- ``--cpu-dispatch``: dispatched set of additional optimizations.
+
+
+2- Discovering the environment
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In this part, we check the compiler and platform architecture
+and cache some of the intermediary results to speed up rebuilding.
+
+3- Validating the requested optimizations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The requested optimizations are tested against the compiler to determine
+which of them the compiler actually supports.
+
+4- Generating the main configuration header
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The generated header ``_cpu_dispatch.h`` contains all the definitions and
+headers of instruction-sets for the required optimizations that have been
+validated during the previous step.
+
+It also contains extra C definitions that are used for defining NumPy's
+Python-level module attributes ``__cpu_baseline__`` and ``__cpu_dispatch__``.
+
+**What is in this header?**
+
+The example header was dynamically generated by gcc on an X86 machine.
+The compiler supports ``--cpu-baseline="sse sse2 sse3"`` and
+``--cpu-dispatch="ssse3 sse41"``, and the result is below.
+
+.. code:: c
+
+ // The header should be located at numpy/numpy/core/src/common/_cpu_dispatch.h
+ /**NOTE
+ ** C definitions prefixed with "NPY_HAVE_" represent
+ ** the required optimizations.
+ **
+ ** C definitions prefixed with 'NPY__CPU_TARGET_' are protected and
+ ** shouldn't be used by any NumPy C sources.
+ */
+ /******* baseline features *******/
+ /** SSE **/
+ #define NPY_HAVE_SSE 1
+ #include <xmmintrin.h>
+ /** SSE2 **/
+ #define NPY_HAVE_SSE2 1
+ #include <emmintrin.h>
+ /** SSE3 **/
+ #define NPY_HAVE_SSE3 1
+ #include <pmmintrin.h>
+
+ /******* dispatch-able features *******/
+ #ifdef NPY__CPU_TARGET_SSSE3
+ /** SSSE3 **/
+ #define NPY_HAVE_SSSE3 1
+ #include <tmmintrin.h>
+ #endif
+ #ifdef NPY__CPU_TARGET_SSE41
+ /** SSE41 **/
+ #define NPY_HAVE_SSE41 1
+ #include <smmintrin.h>
+ #endif
+
+**Baseline features** are the minimal set of required optimizations configured
+via ``--cpu-baseline``. They have no preprocessor guards and they're
+always on, which means they can be used in any source.
+
+Does this mean NumPy's infrastructure passes the compiler's flags of
+baseline features to all sources?
+
+Definitely, yes. But the :ref:`dispatch-able sources <dispatchable-sources>` are
+treated differently.
+
+What if the user specifies certain **baseline features** during the
+build but at runtime the machine doesn't support even these
+features? Will the compiled code be called via one of these definitions, or
+maybe the compiler itself auto-generated/vectorized certain pieces of code
+based on the provided command line compiler flags?
+
+During the loading of the NumPy module, there's a validation step
+which detects this behavior. It will raise a Python runtime error to inform the
+user. This is to prevent the CPU reaching an illegal instruction error causing
+a segfault.
+
+**Dispatch-able features** are our dispatched set of additional optimizations
+that were configured via ``--cpu-dispatch``. They are not activated by
+default and are always guarded by other C definitions prefixed with
+``NPY__CPU_TARGET_``. C definitions ``NPY__CPU_TARGET_`` are only
+enabled within **dispatch-able sources**.
+
+.. _dispatchable-sources:
+
+5- Dispatch-able sources and configuration statements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Dispatch-able sources are special **C** files that can be compiled multiple
+times with different compiler flags and also with different **C**
+definitions. These affect code paths to enable certain
+instruction-sets for each compiled object according to "**the
+configuration statements**" that must be declared between a **C**
+comment\ ``(/**/)`` and start with a special mark **@targets** at the
+top of each dispatch-able source. At the same time, dispatch-able
+sources will be treated as normal **C** sources if the optimization was
+disabled by the command argument ``--disable-optimization`` .
+
+**What are configuration statements?**
+
+Configuration statements are sort of keywords combined together to
+determine the required optimization for the dispatch-able source.
+
+Example:
+
+.. code:: c
+
+ /*@targets avx2 avx512f vsx2 vsx3 asimd asimdhp */
+ // C code
+
+The keywords mainly represent the additional optimizations configured
+through ``--cpu-dispatch``, but they can also represent other options such as:
+
+- Target groups: pre-configured configuration statements used for
+ managing the required optimizations from outside the dispatch-able source.
+
+- Policies: collections of options used for changing the default
+ behaviors or forcing the compilers to perform certain things.
+
+- "baseline": a unique keyword that represents the minimal optimizations
+ configured through ``--cpu-baseline``
+
+**NumPy's infrastructure handles dispatch-able sources in four steps**:
+
+- **(A) Recognition**: Just like source templates and F2PY, the
+ dispatch-able sources require a special extension ``*.dispatch.c``
+ to mark C dispatch-able source files, and for C++
+ ``*.dispatch.cpp`` or ``*.dispatch.cxx``
+ **NOTE**: C++ not supported yet.
+
+- **(B) Parsing and validating**: In this step, the
+ dispatch-able sources that were selected by the previous step
+ are parsed one by one, and their configuration statements are
+ validated in order to determine the required optimizations.
+
+- **(C) Wrapping**: This is the approach taken by NumPy's
+ infrastructure, which has proved to be sufficiently flexible in order
+ to compile a single source multiple times with different **C**
+ definitions and flags that affect the code paths. The process is
+ achieved by creating a temporary **C** source for each required
+ optimization that is related to the additional optimization, which
+ contains the declarations of the **C** definitions and includes the
+ involved source via the **C** directive **#include**. For more
+ clarification take a look at the following code for AVX512F :
+
+ .. code:: c
+
+ /*
+ * this definition is used by NumPy utilities as suffixes for the
+ * exported symbols
+ */
+ #define NPY__CPU_TARGET_CURRENT AVX512F
+ /*
+ * The following definitions enable
+ * definitions of the dispatch-able features that are defined within the main
+ * configuration header. These are definitions for the implied features.
+ */
+ #define NPY__CPU_TARGET_SSE
+ #define NPY__CPU_TARGET_SSE2
+ #define NPY__CPU_TARGET_SSE3
+ #define NPY__CPU_TARGET_SSSE3
+ #define NPY__CPU_TARGET_SSE41
+ #define NPY__CPU_TARGET_POPCNT
+ #define NPY__CPU_TARGET_SSE42
+ #define NPY__CPU_TARGET_AVX
+ #define NPY__CPU_TARGET_F16C
+ #define NPY__CPU_TARGET_FMA3
+ #define NPY__CPU_TARGET_AVX2
+ #define NPY__CPU_TARGET_AVX512F
+ // our dispatch-able source
+ #include "/the/absolute/path/of/hello.dispatch.c"
+
+- **(D) Dispatch-able configuration header**: The infrastructure
+ generates a config header for each dispatch-able source, this header
+ mainly contains two abstract **C** macros used for identifying the
+ generated objects, so they can be used for runtime dispatching
+ certain symbols from the generated objects by any **C** source. It is
+ also used for forward declarations.
+
+ The generated header takes the name of the dispatch-able source after
+ excluding the extension and replacing it with ``.h``, for example
+ assume we have a dispatch-able source called ``hello.dispatch.c`` and
+ contains the following:
+
+ .. code:: c
+
+ // hello.dispatch.c
+ /*@targets baseline sse41 avx512f */
+ #include <stdio.h>
+ #include "numpy/utils.h" // NPY_CAT, NPY_TOSTR
+
+ #ifndef NPY__CPU_TARGET_CURRENT
+ // wrapping the dispatch-able source only happens to the additional optimizations
+ // but if the keyword 'baseline' is provided within the configuration statements,
+ // the infrastructure will add extra compiling for the dispatch-able source by
+ // passing it as-is to the compiler without any changes.
+ #define CURRENT_TARGET(X) X
+ #define NPY__CPU_TARGET_CURRENT baseline // for printing only
+ #else
+ // since we reached this point, it means we're dealing with
+ // the additional optimizations, so it could be SSE41 or AVX512F
+ #define CURRENT_TARGET(X) NPY_CAT(NPY_CAT(X, _), NPY__CPU_TARGET_CURRENT)
+ #endif
+ // Macro 'CURRENT_TARGET' adds the current target as a suffix to the exported symbols,
+ // to avoid linking duplications, NumPy already has a macro called
+ // 'NPY_CPU_DISPATCH_CURFX' similar to it, located at
+ // numpy/numpy/core/src/common/npy_cpu_dispatch.h
+ // NOTE: we tend not to add suffixes to the baseline exported symbols
+ void CURRENT_TARGET(simd_whoami)(const char *extra_info)
+ {
+ printf("I'm " NPY_TOSTR(NPY__CPU_TARGET_CURRENT) ", %s\n", extra_info);
+ }
+
+ Now assume you attached **hello.dispatch.c** to the source tree, then
+ the infrastructure should generate a temporary config header called
+ **hello.dispatch.h** that can be reached by any source in the source
+ tree, and it should contain the following code :
+
+ .. code:: c
+
+ #ifndef NPY__CPU_DISPATCH_EXPAND_
+ // To expand the macro calls in this header
+ #define NPY__CPU_DISPATCH_EXPAND_(X) X
+ #endif
+ // Undefining the following macros, due to the possibility of including config headers
+ // multiple times within the same source and since each config header represents
+ // different required optimizations according to the specified configuration
+ // statements in the dispatch-able source that derived from it.
+ #undef NPY__CPU_DISPATCH_BASELINE_CALL
+ #undef NPY__CPU_DISPATCH_CALL
+ // nothing strange here, just a normal preprocessor callback
+ // enabled only if 'baseline' is specified within the configuration statements
+ #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \
+ NPY__CPU_DISPATCH_EXPAND_(CB(__VA_ARGS__))
+ // 'NPY__CPU_DISPATCH_CALL' is an abstract macro used for dispatching
+ // the required optimizations that are specified within the configuration statements.
+ //
+ // @param CHK, a macro that can be used to detect CPU features
+ // at runtime, which takes a CPU feature name without string quotes and
+ // returns the test result as a boolean value.
+ // NumPy already has a macro called "NPY_CPU_HAVE", which fits this requirement.
+ //
+ // @param CB, a callback macro that is expected to be called multiple times depending
+ // on the required optimizations, the callback should receive the following arguments:
+ // 1- The pending calls of @param CHK filled up with the required CPU features,
+ // that need to be tested first at runtime before executing the call that belongs to
+ // the compiled object.
+ // 2- The required optimization name, same as in 'NPY__CPU_TARGET_CURRENT'
+ // 3- Extra arguments in the macro itself
+ //
+ // By default the callback calls are sorted depending on the highest interest
+ // unless the policy "$keep_sort" was in place within the configuration statements
+ // see "Dive into the CPU dispatcher" for more clarification.
+ #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...) \
+ NPY__CPU_DISPATCH_EXPAND_(CB((CHK(AVX512F)), AVX512F, __VA_ARGS__)) \
+ NPY__CPU_DISPATCH_EXPAND_(CB((CHK(SSE)&&CHK(SSE2)&&CHK(SSE3)&&CHK(SSSE3)&&CHK(SSE41)), SSE41, __VA_ARGS__))
+
+ An example of using the config header in light of the above:
+
+ .. code:: c
+
+ // NOTE: The following macros are defined for demonstration purposes only.
+ // NumPy already has a collection of macros located at
+ // numpy/numpy/core/src/common/npy_cpu_dispatch.h, that covers all dispatching
+ // and declarations scenarios.
+
+ #include "numpy/npy_cpu_features.h" // NPY_CPU_HAVE
+ #include "numpy/utils.h" // NPY_CAT, NPY_EXPAND
+
+ // An example for setting a macro that calls all the exported symbols at once
+ // after checking if they're supported by the running machine.
+ #define DISPATCH_CALL_ALL(FN, ARGS) \
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_ALL_CB, FN, ARGS) \
+ NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_ALL_CB, FN, ARGS)
+ // The preprocessor callbacks.
+ // The same suffixes as we define it in the dispatch-able source.
+ #define DISPATCH_CALL_ALL_CB(CHECK, TARGET_NAME, FN, ARGS) \
+ if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; }
+ #define DISPATCH_CALL_BASELINE_ALL_CB(FN, ARGS) \
+ FN NPY_EXPAND(ARGS);
+
+ // An example for setting a macro that calls the exported symbols of highest
+ // interest optimization, after checking if they're supported by the running machine.
+ #define DISPATCH_CALL_HIGH(FN, ARGS) \
+ if (0) {} \
+ NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_HIGH_CB, FN, ARGS) \
+ NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_HIGH_CB, FN, ARGS)
+ // The preprocessor callbacks
+ // The same suffixes as we define it in the dispatch-able source.
+ #define DISPATCH_CALL_HIGH_CB(CHECK, TARGET_NAME, FN, ARGS) \
+ else if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; }
+ #define DISPATCH_CALL_BASELINE_HIGH_CB(FN, ARGS) \
+ else { FN NPY_EXPAND(ARGS); }
+
+ // NumPy has a macro called 'NPY_CPU_DISPATCH_DECLARE' that can be used
+ // for forward declarations of any kind of prototypes based on
+ // 'NPY__CPU_DISPATCH_CALL' and 'NPY__CPU_DISPATCH_BASELINE_CALL'.
+ // However in this example, we just handle it manually.
+ void simd_whoami(const char *extra_info);
+ void simd_whoami_AVX512F(const char *extra_info);
+ void simd_whoami_SSE41(const char *extra_info);
+
+ void trigger_me(void)
+ {
+ // bring in the auto-generated config header
+ // which contains config macros 'NPY__CPU_DISPATCH_CALL' and
+ // 'NPY__CPU_DISPATCH_BASELINE_CALL'.
+ // it is highly recommended to include the config header before executing
+ // the dispatching macros in case there's another header in the scope.
+ #include "hello.dispatch.h"
+ DISPATCH_CALL_ALL(simd_whoami, ("all"))
+ DISPATCH_CALL_HIGH(simd_whoami, ("the highest interest"))
+ // An example of including multiple config headers in the same source
+ // #include "hello2.dispatch.h"
+ // DISPATCH_CALL_HIGH(another_function, ("the highest interest"))
+ }
diff --git a/doc/source/reference/simd/index.rst b/doc/source/reference/simd/index.rst
new file mode 100644
index 000000000..230e2dc15
--- /dev/null
+++ b/doc/source/reference/simd/index.rst
@@ -0,0 +1,43 @@
+.. _numpysimd:
+.. currentmodule:: numpysimd
+
+***********************
+CPU/SIMD Optimizations
+***********************
+
+NumPy comes with a flexible working mechanism that allows it to harness the SIMD
+features that CPUs own, in order to provide faster and more stable performance
+on all popular platforms. Currently, NumPy supports the X86, IBM/Power, ARMv7 and ARMv8
+architectures.
+
+The optimization process in NumPy is carried out in three layers:
+
+- Code is *written* using the universal intrinsics, which are a set of types, macros and
+ functions that are mapped to each supported instruction-set by using guards that
+ will enable use of them only when the compiler recognizes them.
+ This allows us to generate multiple kernels for the same functionality,
+ in which each generated kernel represents a set of instructions related to one
+ or more CPU features. The first kernel represents the minimum (baseline)
+ CPU features, and the other kernels represent the additional (dispatched) CPU features.
+
+- At *compile* time, CPU build options are used to define the minimum and
+ additional features to support, based on user choice and compiler support. The
+ appropriate intrinsics are overlaid with the platform / architecture intrinsics,
+ and multiple kernels are compiled.
+
+- At *runtime import*, the CPU is probed for the set of supported CPU
+ features. A mechanism is used to grab the pointer to the most appropriate
+ kernel, and this will be the one called for the function.
+
+.. note::
+
+ The NumPy community had a deep discussion before implementing this work,
+ please check `NEP-38`_ for more clarification.
+
+.. toctree::
+
+ build-options
+ how-it-works
+
+.. _`NEP-38`: https://numpy.org/neps/nep-0038-SIMD-optimizations.html
+
diff --git a/doc/source/reference/simd/log_example.txt b/doc/source/reference/simd/log_example.txt
new file mode 100644
index 000000000..b0c732433
--- /dev/null
+++ b/doc/source/reference/simd/log_example.txt
@@ -0,0 +1,79 @@
+########### EXT COMPILER OPTIMIZATION ###########
+Platform :
+ Architecture: x64
+ Compiler : gcc
+
+CPU baseline :
+ Requested : 'min'
+ Enabled : SSE SSE2 SSE3
+ Flags : -msse -msse2 -msse3
+ Extra checks: none
+
+CPU dispatch :
+ Requested : 'max -xop -fma4'
+ Enabled : SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD AVX512_KNL AVX512_KNM AVX512_SKX AVX512_CLX AVX512_CNL AVX512_ICL
+ Generated :
+ :
+ SSE41 : SSE SSE2 SSE3 SSSE3
+ Flags : -msse -msse2 -msse3 -mssse3 -msse4.1
+ Extra checks: none
+ Detect : SSE SSE2 SSE3 SSSE3 SSE41
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c
+ : numpy/core/src/umath/_umath_tests.dispatch.c
+ :
+ SSE42 : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT
+ Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2
+ Extra checks: none
+ Detect : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42
+ : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c
+ :
+ AVX2 : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C
+ Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mavx2
+ Extra checks: none
+ Detect : AVX F16C AVX2
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithm_fp.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c
+ : numpy/core/src/umath/_umath_tests.dispatch.c
+ :
+ (FMA3 AVX2) : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C
+ Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mfma -mavx2
+ Extra checks: none
+ Detect : AVX F16C FMA3 AVX2
+ : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_exponent_log.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_trigonometric.dispatch.c
+ :
+ AVX512F : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2
+ Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mfma -mavx2 -mavx512f
+ Extra checks: AVX512F_REDUCE
+ Detect : AVX512F
+ : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithm_fp.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_exponent_log.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_trigonometric.dispatch.c
+ :
+ AVX512_SKX : SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD
+ Flags : -msse -msse2 -msse3 -mssse3 -msse4.1 -mpopcnt -msse4.2 -mavx -mf16c -mfma -mavx2 -mavx512f -mavx512cd -mavx512vl -mavx512bw -mavx512dq
+ Extra checks: AVX512BW_MASK AVX512DQ_MASK
+ Detect : AVX512_SKX
+ : build/src.linux-x86_64-3.9/numpy/core/src/_simd/_simd.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_arithmetic.dispatch.c
+ : build/src.linux-x86_64-3.9/numpy/core/src/umath/loops_exponent_log.dispatch.c
+CCompilerOpt.cache_flush[804] : write cache to path -> /home/seiko/work/repos/numpy/build/temp.linux-x86_64-3.9/ccompiler_opt_cache_ext.py
+
+########### CLIB COMPILER OPTIMIZATION ###########
+Platform :
+ Architecture: x64
+ Compiler : gcc
+
+CPU baseline :
+ Requested : 'min'
+ Enabled : SSE SSE2 SSE3
+ Flags : -msse -msse2 -msse3
+ Extra checks: none
+
+CPU dispatch :
+ Requested : 'max -xop -fma4'
+ Enabled : SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD AVX512_KNL AVX512_KNM AVX512_SKX AVX512_CLX AVX512_CNL AVX512_ICL
+ Generated : none
diff --git a/doc/source/reference/simd/simd-optimizations-tables-diff.inc b/doc/source/reference/simd/simd-optimizations-tables-diff.inc
deleted file mode 100644
index 41fa96703..000000000
--- a/doc/source/reference/simd/simd-optimizations-tables-diff.inc
+++ /dev/null
@@ -1,37 +0,0 @@
-.. generated via source/reference/simd/simd-optimizations.py
-
-x86::Intel Compiler - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- =========== ==================================================================================================================
- Name Implies
- =========== ==================================================================================================================
- ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **AVX2**
- ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **FMA3**
- ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` **AVX512CD**
- =========== ==================================================================================================================
-
-.. note::
- The following features aren't supported by x86::Intel Compiler:
- **XOP FMA4**
-
-x86::Microsoft Visual C/C++ - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ============ =================================================================================================================================
- Name Implies
- ============ =================================================================================================================================
- ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **AVX2**
- ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` **FMA3**
- ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` **AVX512CD** **AVX512_SKX**
- ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` **AVX512_SKX**
- ============ =================================================================================================================================
-
-.. note::
- The following features aren't supported by x86::Microsoft Visual C/C++:
- **AVX512_KNL AVX512_KNM**
-
diff --git a/doc/source/reference/simd/simd-optimizations-tables.inc b/doc/source/reference/simd/simd-optimizations-tables.inc
deleted file mode 100644
index f038a91e1..000000000
--- a/doc/source/reference/simd/simd-optimizations-tables.inc
+++ /dev/null
@@ -1,103 +0,0 @@
-.. generated via source/reference/simd/simd-optimizations.py
-
-x86 - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ============ =================================================================================================================
- Name Implies
- ============ =================================================================================================================
- ``SSE`` ``SSE2``
- ``SSE2`` ``SSE``
- ``SSE3`` ``SSE`` ``SSE2``
- ``SSSE3`` ``SSE`` ``SSE2`` ``SSE3``
- ``SSE41`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3``
- ``POPCNT`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41``
- ``SSE42`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT``
- ``AVX`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42``
- ``XOP`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
- ``FMA4`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
- ``F16C`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX``
- ``FMA3`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``
- ``AVX2`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C``
- ``AVX512F`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2``
- ``AVX512CD`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F``
- ============ =================================================================================================================
-
-x86 - Group names
-~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ============== ===================================================== ===========================================================================================================================================================================
- Name Gather Implies
- ============== ===================================================== ===========================================================================================================================================================================
- ``AVX512_KNL`` ``AVX512ER`` ``AVX512PF`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD``
- ``AVX512_KNM`` ``AVX5124FMAPS`` ``AVX5124VNNIW`` ``AVX512VPOPCNTDQ`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_KNL``
- ``AVX512_SKX`` ``AVX512VL`` ``AVX512BW`` ``AVX512DQ`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD``
- ``AVX512_CLX`` ``AVX512VNNI`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX``
- ``AVX512_CNL`` ``AVX512IFMA`` ``AVX512VBMI`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX``
- ``AVX512_ICL`` ``AVX512VBMI2`` ``AVX512BITALG`` ``AVX512VPOPCNTDQ`` ``SSE`` ``SSE2`` ``SSE3`` ``SSSE3`` ``SSE41`` ``POPCNT`` ``SSE42`` ``AVX`` ``F16C`` ``FMA3`` ``AVX2`` ``AVX512F`` ``AVX512CD`` ``AVX512_SKX`` ``AVX512_CLX`` ``AVX512_CNL``
- ============== ===================================================== ===========================================================================================================================================================================
-
-IBM/POWER big-endian - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ======== ================
- Name Implies
- ======== ================
- ``VSX``
- ``VSX2`` ``VSX``
- ``VSX3`` ``VSX`` ``VSX2``
- ======== ================
-
-IBM/POWER little-endian - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ======== ================
- Name Implies
- ======== ================
- ``VSX`` ``VSX2``
- ``VSX2`` ``VSX``
- ``VSX3`` ``VSX`` ``VSX2``
- ======== ================
-
-ARMv7/A32 - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ============== ===========================================================
- Name Implies
- ============== ===========================================================
- ``NEON``
- ``NEON_FP16`` ``NEON``
- ``NEON_VFPV4`` ``NEON`` ``NEON_FP16``
- ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
- ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
- ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
- ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP``
- ============== ===========================================================
-
-ARMv8/A64 - CPU feature names
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. table::
- :align: left
-
- ============== ===========================================================
- Name Implies
- ============== ===========================================================
- ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
- ``NEON_FP16`` ``NEON`` ``NEON_VFPV4`` ``ASIMD``
- ``NEON_VFPV4`` ``NEON`` ``NEON_FP16`` ``ASIMD``
- ``ASIMD`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
- ``ASIMDHP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
- ``ASIMDDP`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD``
- ``ASIMDFHM`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4`` ``ASIMD`` ``ASIMDHP``
- ============== ===========================================================
-
diff --git a/doc/source/reference/simd/simd-optimizations.py b/doc/source/reference/simd/simd-optimizations.py
deleted file mode 100644
index a78302db5..000000000
--- a/doc/source/reference/simd/simd-optimizations.py
+++ /dev/null
@@ -1,190 +0,0 @@
-"""
-Generate CPU features tables from CCompilerOpt
-"""
-from os import sys, path
-gen_path = path.dirname(path.realpath(__file__))
-#sys.path.append(path.abspath(path.join(gen_path, *([".."]*4), "numpy", "distutils")))
-#from ccompiler_opt import CCompilerOpt
-from numpy.distutils.ccompiler_opt import CCompilerOpt
-
-class FakeCCompilerOpt(CCompilerOpt):
- fake_info = ("arch", "compiler", "extra_args")
- # disable caching no need for it
- conf_nocache = True
- def __init__(self, *args, **kwargs):
- no_cc = None
- CCompilerOpt.__init__(self, no_cc, **kwargs)
- def dist_compile(self, sources, flags, **kwargs):
- return sources
- def dist_info(self):
- return FakeCCompilerOpt.fake_info
- @staticmethod
- def dist_log(*args, stderr=False):
- # avoid printing
- pass
- def feature_test(self, name, force_flags=None):
- # To speed up
- return True
-
- def gen_features_table(self, features, ignore_groups=True,
- field_names=["Name", "Implies"],
- fstyle=None, fstyle_implies=None, **kwargs):
- rows = []
- if fstyle is None:
- fstyle = lambda ft: f'``{ft}``'
- if fstyle_implies is None:
- fstyle_implies = lambda origin, ft: fstyle(ft)
- for f in self.feature_sorted(features):
- is_group = "group" in self.feature_supported.get(f, {})
- if ignore_groups and is_group:
- continue
- implies = self.feature_sorted(self.feature_implies(f))
- implies = ' '.join([fstyle_implies(f, i) for i in implies])
- rows.append([fstyle(f), implies])
- if rows:
- return self.gen_rst_table(field_names, rows, **kwargs)
-
- def gen_gfeatures_table(self, features,
- field_names=["Name", "Gather", "Implies"],
- fstyle=None, fstyle_implies=None, **kwargs):
- rows = []
- if fstyle is None:
- fstyle = lambda ft: f'``{ft}``'
- if fstyle_implies is None:
- fstyle_implies = lambda origin, ft: fstyle(ft)
- for f in self.feature_sorted(features):
- gather = self.feature_supported.get(f, {}).get("group", None)
- if not gather:
- continue
- implies = self.feature_sorted(self.feature_implies(f))
- implies = ' '.join([fstyle_implies(f, i) for i in implies])
- gather = ' '.join([fstyle_implies(f, i) for i in gather])
- rows.append([fstyle(f), gather, implies])
- if rows:
- return self.gen_rst_table(field_names, rows, **kwargs)
-
- def gen_rst_table(self, field_names, rows, tab_size=4):
- assert(not rows or len(field_names) == len(rows[0]))
- rows.append(field_names)
- fld_len = len(field_names)
- cls_len = [max(len(c[i]) for c in rows) for i in range(fld_len)]
- del rows[-1]
- cformat = ' '.join('{:<%d}' % i for i in cls_len)
- border = cformat.format(*['='*i for i in cls_len])
-
- rows = [cformat.format(*row) for row in rows]
- # header
- rows = [border, cformat.format(*field_names), border] + rows
- # footer
- rows += [border]
- # add left margin
- rows = [(' ' * tab_size) + r for r in rows]
- return '\n'.join(rows)
-
-def features_table_sections(name, ftable=None, gtable=None, tab_size=4):
- tab = ' '*tab_size
- content = ''
- if ftable:
- title = f"{name} - CPU feature names"
- content = (
- f"{title}\n{'~'*len(title)}"
- f"\n.. table::\n{tab}:align: left\n\n"
- f"{ftable}\n\n"
- )
- if gtable:
- title = f"{name} - Group names"
- content += (
- f"{title}\n{'~'*len(title)}"
- f"\n.. table::\n{tab}:align: left\n\n"
- f"{gtable}\n\n"
- )
- return content
-
-def features_table(arch, cc="gcc", pretty_name=None, **kwargs):
- FakeCCompilerOpt.fake_info = (arch, cc, '')
- ccopt = FakeCCompilerOpt(cpu_baseline="max")
- features = ccopt.cpu_baseline_names()
- ftable = ccopt.gen_features_table(features, **kwargs)
- gtable = ccopt.gen_gfeatures_table(features, **kwargs)
-
- if not pretty_name:
- pretty_name = arch + '/' + cc
- return features_table_sections(pretty_name, ftable, gtable, **kwargs)
-
-def features_table_diff(arch, cc, cc_vs="gcc", pretty_name=None, **kwargs):
- FakeCCompilerOpt.fake_info = (arch, cc, '')
- ccopt = FakeCCompilerOpt(cpu_baseline="max")
- fnames = ccopt.cpu_baseline_names()
- features = {f:ccopt.feature_implies(f) for f in fnames}
-
- FakeCCompilerOpt.fake_info = (arch, cc_vs, '')
- ccopt_vs = FakeCCompilerOpt(cpu_baseline="max")
- fnames_vs = ccopt_vs.cpu_baseline_names()
- features_vs = {f:ccopt_vs.feature_implies(f) for f in fnames_vs}
-
- common = set(fnames).intersection(fnames_vs)
- extra_avl = set(fnames).difference(fnames_vs)
- not_avl = set(fnames_vs).difference(fnames)
- diff_impl_f = {f:features[f].difference(features_vs[f]) for f in common}
- diff_impl = {k for k, v in diff_impl_f.items() if v}
-
- fbold = lambda ft: f'**{ft}**' if ft in extra_avl else f'``{ft}``'
- fbold_implies = lambda origin, ft: (
- f'**{ft}**' if ft in diff_impl_f.get(origin, {}) else f'``{ft}``'
- )
- diff_all = diff_impl.union(extra_avl)
- ftable = ccopt.gen_features_table(
- diff_all, fstyle=fbold, fstyle_implies=fbold_implies, **kwargs
- )
- gtable = ccopt.gen_gfeatures_table(
- diff_all, fstyle=fbold, fstyle_implies=fbold_implies, **kwargs
- )
- if not pretty_name:
- pretty_name = arch + '/' + cc
- content = features_table_sections(pretty_name, ftable, gtable, **kwargs)
-
- if not_avl:
- not_avl = ccopt_vs.feature_sorted(not_avl)
- not_avl = ' '.join(not_avl)
- content += (
- ".. note::\n"
- f" The following features aren't supported by {pretty_name}:\n"
- f" **{not_avl}**\n\n"
- )
- return content
-
-if __name__ == '__main__':
- pretty_names = {
- "PPC64": "IBM/POWER big-endian",
- "PPC64LE": "IBM/POWER little-endian",
- "ARMHF": "ARMv7/A32",
- "AARCH64": "ARMv8/A64",
- "ICC": "Intel Compiler",
- # "ICCW": "Intel Compiler msvc-like",
- "MSVC": "Microsoft Visual C/C++"
- }
- with open(path.join(gen_path, 'simd-optimizations-tables.inc'), 'wt') as fd:
- fd.write(f'.. generated via {__file__}\n\n')
- for arch in (
- ("x86", "PPC64", "PPC64LE", "ARMHF", "AARCH64")
- ):
- pretty_name = pretty_names.get(arch, arch)
- table = features_table(arch=arch, pretty_name=pretty_name)
- assert(table)
- fd.write(table)
-
- with open(path.join(gen_path, 'simd-optimizations-tables-diff.inc'), 'wt') as fd:
- fd.write(f'.. generated via {__file__}\n\n')
- for arch, cc_names in (
- ("x86", ("clang", "ICC", "MSVC")),
- ("PPC64", ("clang",)),
- ("PPC64LE", ("clang",)),
- ("ARMHF", ("clang",)),
- ("AARCH64", ("clang",))
- ):
- arch_pname = pretty_names.get(arch, arch)
- for cc in cc_names:
- pretty_name = f"{arch_pname}::{pretty_names.get(cc, cc)}"
- table = features_table_diff(arch=arch, cc=cc, pretty_name=pretty_name)
- if table:
- fd.write(table)
diff --git a/doc/source/reference/simd/simd-optimizations.rst b/doc/source/reference/simd/simd-optimizations.rst
index 9de6d1734..a18108266 100644
--- a/doc/source/reference/simd/simd-optimizations.rst
+++ b/doc/source/reference/simd/simd-optimizations.rst
@@ -1,527 +1,12 @@
-******************
-SIMD Optimizations
-******************
+:orphan:
-NumPy provides a set of macros that define `Universal Intrinsics`_ to
-abstract out typical platform-specific intrinsics so SIMD code needs to be
-written only once. There are three layers:
+.. raw:: html
-- Code is *written* using the universal intrinsic macros, with guards that
- will enable use of the macros only when the compiler recognizes them.
- In NumPy, these are used to construct multiple ufunc loops. Current policy is
- to create three loops: One loop is the default and uses no intrinsics. One
- uses the minimum intrinsics required on the architecture. And the third is
- written using the maximum set of intrinsics possible.
-- At *compile* time, a distutils command is used to define the minimum and
- maximum features to support, based on user choice and compiler support. The
- appropriate macros are overlaid with the platform / architecture intrinsics,
- and the three loops are compiled.
-- At *runtime import*, the CPU is probed for the set of supported intrinsic
- features. A mechanism is used to grab the pointer to the most appropriate
- function, and this will be the one called for the function.
+ <html>
+ <head>
+ <meta http-equiv="refresh" content="0; url=index.html"/>
+ </head>
+ </html>
-
-Build options for compilation
-=============================
-
-- ``--cpu-baseline``: minimal set of required optimizations. Default
- value is ``min`` which provides the minimum CPU features that can
- safely run on a wide range of platforms within the processor family.
-
-- ``--cpu-dispatch``: dispatched set of additional optimizations.
- The default value is ``max -xop -fma4`` which enables all CPU
- features, except for AMD legacy features (in case of x86).
-
-The command arguments are available in ``build``, ``build_clib``, and
-``build_ext``.
-If ``build_clib`` or ``build_ext`` are not specified by the user, the arguments of
-``build`` will be used instead, which also holds the default values.
-
-Optimization names can be CPU features or groups of features that gather
-several features or :ref:`special options <special-options>` to perform a series of procedures.
-
-
-The following tables show the currently supported optimizations, sorted from the lowest to the highest interest.
-
-.. include:: simd-optimizations-tables.inc
-
-----
-
-.. _tables-diff:
-
-While the above tables are based on the GCC compiler, the following tables show the differences for the
-other compilers:
-
-.. include:: simd-optimizations-tables-diff.inc
-
-.. _special-options:
-
-Special options
-~~~~~~~~~~~~~~~
-
-- ``NONE``: enable no features
-
-- ``NATIVE``: Enables all CPU features that are supported by the current
- machine; this operation is based on the compiler flags (``-march=native, -xHost, /QxHost``)
-
-- ``MIN``: Enables the minimum CPU features that can safely run on a wide range of platforms:
-
- .. table::
- :align: left
-
- ====================================== =======================================
- For Arch Returns
- ====================================== =======================================
- ``x86`` ``SSE`` ``SSE2``
- ``x86`` ``64-bit mode`` ``SSE`` ``SSE2`` ``SSE3``
- ``IBM/POWER`` ``big-endian mode`` ``NONE``
- ``IBM/POWER`` ``little-endian mode`` ``VSX`` ``VSX2``
- ``ARMHF`` ``NONE``
- ``ARM64`` ``AARCH64`` ``NEON`` ``NEON_FP16`` ``NEON_VFPV4``
- ``ASIMD``
- ====================================== =======================================
-
-- ``MAX``: Enables all CPU features supported by the compiler and platform.
-
-- ``Operators-/+``: remove or add features, useful with options ``MAX``, ``MIN`` and ``NATIVE``.
-
-NOTES
-~~~~~~~~~~~~~
-- CPU features and other options are case-insensitive.
-
-- The order of the requested optimizations doesn't matter.
-
-- Either commas or spaces can be used as a separator, e.g. ``--cpu-dispatch``\ =
- "avx2 avx512f" or ``--cpu-dispatch``\ = "avx2, avx512f" both work, but the
- arguments must be enclosed in quotes.
-
-- The operator ``+`` is only added for nominal reasons. For example:
- ``--cpu-baseline="min avx2"`` is equivalent to ``--cpu-baseline="min + avx2"``.
- ``--cpu-baseline="min,avx2"`` is equivalent to ``--cpu-baseline="min,+avx2"``
-
-- If the CPU feature is not supported by the user platform or
- compiler, it will be skipped rather than raising a fatal error.
-
-- Any specified CPU feature to ``--cpu-dispatch`` will be skipped if
- it's part of CPU baseline features
-
-- The ``--cpu-baseline`` argument force-enables implied features,
- e.g. ``--cpu-baseline``\ ="sse42" is equivalent to
- ``--cpu-baseline``\ ="sse sse2 sse3 ssse3 sse41 popcnt sse42"
-
-- The value of ``--cpu-baseline`` will be treated as "native" if
- compiler native flag ``-march=native`` or ``-xHost`` or ``/QxHost`` is
- enabled through environment variable ``CFLAGS``
-
-- The validation process for the requested optimizations when it comes to
- ``--cpu-baseline`` isn't strict. For example, if the user requested
- ``AVX2`` but the compiler doesn't support it then we just skip it and return
- the maximum optimization that the compiler can handle depending on the
- implied features of ``AVX2``, let us assume ``AVX``.
-
-- The user should always check the final report through the build log
- to verify the enabled features.
-
-Special cases
-~~~~~~~~~~~~~
-
-**Interrelated CPU features**: Some exceptional conditions force us to link some features together when it comes to certain compilers or architectures, resulting in the impossibility of building them separately.
-These conditions can be divided into two parts, as follows:
-
-- **Architectural compatibility**: The need to align certain CPU features that are assured
- to be supported by successive generations of the same architecture, for example:
-
- - On ppc64le `VSX(ISA 2.06)` and `VSX2(ISA 2.07)` both imply one another since the
- first generation that supports little-endian mode is Power-8`(ISA 2.07)`
- - On AArch64 `NEON` `FP16` `VFPV4` `ASIMD` imply each other since they are part of the
- hardware baseline.
-
-- **Compilation compatibility**: Not all **C/C++** compilers provide independent support for all CPU
- features. For example, **Intel**'s compiler doesn't provide separate flags for `AVX2` and `FMA3`.
- This makes sense since all Intel CPUs that come with `AVX2` also support `FMA3` and vice versa,
- but this approach is incompatible with other **x86** CPUs from **AMD** or **VIA**.
- Therefore, there are differences in the depiction of CPU features between the C/C++ compilers,
- as shown in the :ref:`tables above <tables-diff>`.
-
-
-Behaviors and Errors
-~~~~~~~~~~~~~~~~~~~~
-
-
-
-Usage and Examples
-~~~~~~~~~~~~~~~~~~
-
-Report and Trace
-~~~~~~~~~~~~~~~~
-
-Understanding CPU Dispatching, How the NumPy dispatcher works?
-==============================================================
-
-NumPy dispatcher is based on multi-source compiling, which means taking
-a certain source and compiling it multiple times with different compiler
-flags and also with different **C** definitions that affect the code
-paths to enable certain instruction-sets for each compiled object
-depending on the required optimizations, then combining the returned
-objects together.
-
-.. figure:: ../figures/opt-infra.png
-
-This mechanism should support all compilers and it doesn't require any
-compiler-specific extension, but at the same time it adds a few steps to
-normal compilation that are explained as follows:
-
-1- Configuration
-~~~~~~~~~~~~~~~~
-
-Configuring the required optimization by the user before starting to build the
-source files via the two command arguments as explained above:
-
-- ``--cpu-baseline``: minimal set of required optimizations.
-
-- ``--cpu-dispatch``: dispatched set of additional optimizations.
-
-
-2- Discovering the environment
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In this part, we check the compiler and platform architecture
-and cache some of the intermediary results to speed up rebuilding.
-
-3- Validating the requested optimizations
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-By testing them against the compiler, and seeing what the compiler can
-support according to the requested optimizations.
-
-4- Generating the main configuration header
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The generated header ``_cpu_dispatch.h`` contains all the definitions and
-headers of instruction-sets for the required optimizations that have been
-validated during the previous step.
-
-It also contains extra C definitions that are used for defining NumPy's
-Python-level module attributes ``__cpu_baseline__`` and ``__cpu_dispatch__``.
-
-**What is in this header?**
-
-The example header was dynamically generated by gcc on an X86 machine.
-The compiler supports ``--cpu-baseline="sse sse2 sse3"`` and
-``--cpu-dispatch="ssse3 sse41"``, and the result is below.
-
-.. code:: c
-
- // The header should be located at numpy/numpy/core/src/common/_cpu_dispatch.h
- /**NOTE
- ** C definitions prefixed with "NPY_HAVE_" represent
- ** the required optimizations.
- **
- ** C definitions prefixed with 'NPY__CPU_TARGET_' are protected and
- ** shouldn't be used by any NumPy C sources.
- */
- /******* baseline features *******/
- /** SSE **/
- #define NPY_HAVE_SSE 1
- #include <xmmintrin.h>
- /** SSE2 **/
- #define NPY_HAVE_SSE2 1
- #include <emmintrin.h>
- /** SSE3 **/
- #define NPY_HAVE_SSE3 1
- #include <pmmintrin.h>
-
- /******* dispatch-able features *******/
- #ifdef NPY__CPU_TARGET_SSSE3
- /** SSSE3 **/
- #define NPY_HAVE_SSSE3 1
- #include <tmmintrin.h>
- #endif
- #ifdef NPY__CPU_TARGET_SSE41
- /** SSE41 **/
- #define NPY_HAVE_SSE41 1
- #include <smmintrin.h>
- #endif
-
-**Baseline features** are the minimal set of required optimizations configured
-via ``--cpu-baseline``. They have no preprocessor guards and they're
-always on, which means they can be used in any source.
-
-Does this mean NumPy's infrastructure passes the compiler's flags of
-baseline features to all sources?
-
-Definitely, yes. But the :ref:`dispatch-able sources <dispatchable-sources>` are
-treated differently.
-
-What if the user specifies certain **baseline features** during the
-build but at runtime the machine doesn't support even these
-features? Will the compiled code be called via one of these definitions, or
-maybe the compiler itself auto-generated/vectorized certain pieces of code
-based on the provided command line compiler flags?
-
-During the loading of the NumPy module, there's a validation step
-which detects this behavior. It will raise a Python runtime error to inform the
-user. This is to prevent the CPU reaching an illegal instruction error causing
-a segfault.
-
-**Dispatch-able features** are our dispatched set of additional optimizations
-that were configured via ``--cpu-dispatch``. They are not activated by
-default and are always guarded by other C definitions prefixed with
-``NPY__CPU_TARGET_``. C definitions ``NPY__CPU_TARGET_`` are only
-enabled within **dispatch-able sources**.
-
-.. _dispatchable-sources:
-
-5- Dispatch-able sources and configuration statements
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Dispatch-able sources are special **C** files that can be compiled multiple
-times with different compiler flags and also with different **C**
-definitions. These affect code paths to enable certain
-instruction-sets for each compiled object according to "**the
-configuration statements**" that must be declared between a **C**
-comment\ ``(/**/)`` and start with a special mark **@targets** at the
-top of each dispatch-able source. At the same time, dispatch-able
-sources will be treated as normal **C** sources if the optimization was
-disabled by the command argument ``--disable-optimization`` .
-
-**What are configuration statements?**
-
-Configuration statements are sort of keywords combined together to
-determine the required optimization for the dispatch-able source.
-
-Example:
-
-.. code:: c
-
- /*@targets avx2 avx512f vsx2 vsx3 asimd asimdhp */
- // C code
-
-The keywords mainly represent the additional optimizations configured
-through ``--cpu-dispatch``, but it can also represent other options such as:
-
-- Target groups: pre-configured configuration statements used for
- managing the required optimizations from outside the dispatch-able source.
-
-- Policies: collections of options used for changing the default
- behaviors or forcing the compilers to perform certain things.
-
-- "baseline": a unique keyword that represents the minimal optimizations
- configured through ``--cpu-baseline``
-
-**Numpy's infrastructure handles dispatch-able sources in four steps**:
-
-- **(A) Recognition**: Just like source templates and F2PY, the
- dispatch-able sources require a special extension ``*.dispatch.c``
- to mark C dispatch-able source files, and for C++
- ``*.dispatch.cpp`` or ``*.dispatch.cxx``.
- **NOTE**: C++ is not supported yet.
-
-- **(B) Parsing and validating**: In this step, the
- dispatch-able sources that had been filtered by the previous step
- are parsed and validated by the configuration statements for each one
- of them one by one in order to determine the required optimizations.
-
-- **(C) Wrapping**: This is the approach taken by NumPy's
- infrastructure, which has proved to be sufficiently flexible in order
- to compile a single source multiple times with different **C**
- definitions and flags that affect the code paths. The process is
- achieved by creating a temporary **C** source for each required
- optimization that is related to the additional optimization, which
- contains the declarations of the **C** definitions and includes the
- involved source via the **C** directive **#include**. For more
- clarification take a look at the following code for AVX512F :
-
- .. code:: c
-
- /*
- * this definition is used by NumPy utilities as suffixes for the
- * exported symbols
- */
- #define NPY__CPU_TARGET_CURRENT AVX512F
- /*
- * The following definitions enable
- * definitions of the dispatch-able features that are defined within the main
- * configuration header. These are definitions for the implied features.
- */
- #define NPY__CPU_TARGET_SSE
- #define NPY__CPU_TARGET_SSE2
- #define NPY__CPU_TARGET_SSE3
- #define NPY__CPU_TARGET_SSSE3
- #define NPY__CPU_TARGET_SSE41
- #define NPY__CPU_TARGET_POPCNT
- #define NPY__CPU_TARGET_SSE42
- #define NPY__CPU_TARGET_AVX
- #define NPY__CPU_TARGET_F16C
- #define NPY__CPU_TARGET_FMA3
- #define NPY__CPU_TARGET_AVX2
- #define NPY__CPU_TARGET_AVX512F
- // our dispatch-able source
- #include "/the/absolute/path/of/hello.dispatch.c"
-
-- **(D) Dispatch-able configuration header**: The infrastructure
- generates a config header for each dispatch-able source, this header
- mainly contains two abstract **C** macros used for identifying the
- generated objects, so they can be used for runtime dispatching
- certain symbols from the generated objects by any **C** source. It is
- also used for forward declarations.
-
- The generated header takes the name of the dispatch-able source after
- excluding the extension and replaces it with '**.h**', for example
- assume we have a dispatch-able source called **hello.dispatch.c** and
- contains the following:
-
- .. code:: c
-
- // hello.dispatch.c
- /*@targets baseline sse42 avx512f */
- #include <stdio.h>
- #include "numpy/utils.h" // NPY_CAT, NPY_TOSTR
-
- #ifndef NPY__CPU_TARGET_CURRENT
- // wrapping the dispatch-able source only happens to the additional optimizations
- // but if the keyword 'baseline' provided within the configuration statements,
- // the infrastructure will add extra compiling for the dispatch-able source by
- // passing it as-is to the compiler without any changes.
- #define CURRENT_TARGET(X) X
- #define NPY__CPU_TARGET_CURRENT baseline // for printing only
- #else
- // since we reached this point, it means we're dealing with
- // the additional optimizations, so it could be SSE42 or AVX512F
- #define CURRENT_TARGET(X) NPY_CAT(NPY_CAT(X, _), NPY__CPU_TARGET_CURRENT)
- #endif
- // Macro 'CURRENT_TARGET' adds the current target as a suffix to the exported symbols,
- // to avoid linking duplications, NumPy already has a macro called
- // 'NPY_CPU_DISPATCH_CURFX' similar to it, located at
- // numpy/numpy/core/src/common/npy_cpu_dispatch.h
- // NOTE: we tend not to add suffixes to the baseline exported symbols
- void CURRENT_TARGET(simd_whoami)(const char *extra_info)
- {
- printf("I'm " NPY_TOSTR(NPY__CPU_TARGET_CURRENT) ", %s\n", extra_info);
- }
-
- Now assume you attached **hello.dispatch.c** to the source tree, then
- the infrastructure should generate a temporary config header called
- **hello.dispatch.h** that can be reached by any source in the source
- tree, and it should contain the following code :
-
- .. code:: c
-
- #ifndef NPY__CPU_DISPATCH_EXPAND_
- // To expand the macro calls in this header
- #define NPY__CPU_DISPATCH_EXPAND_(X) X
- #endif
- // Undefining the following macros, due to the possibility of including config headers
- // multiple times within the same source and since each config header represents
- // different required optimizations according to the specified configuration
- // statements in the dispatch-able source that derived from it.
- #undef NPY__CPU_DISPATCH_BASELINE_CALL
- #undef NPY__CPU_DISPATCH_CALL
- // nothing strange here, just a normal preprocessor callback
- // enabled only if 'baseline' specified within the configuration statements
- #define NPY__CPU_DISPATCH_BASELINE_CALL(CB, ...) \
- NPY__CPU_DISPATCH_EXPAND_(CB(__VA_ARGS__))
- // 'NPY__CPU_DISPATCH_CALL' is an abstract macro used for dispatching
- // the required optimizations that are specified within the configuration statements.
- //
- // @param CHK, Expected a macro that can be used to detect CPU features
- // in runtime, which takes a CPU feature name without string quotes and
- // returns the testing result in a shape of boolean value.
- // NumPy already has macro called "NPY_CPU_HAVE", which fits this requirement.
- //
- // @param CB, a callback macro that expected to be called multiple times depending
- // on the required optimizations, the callback should receive the following arguments:
- // 1- The pending calls of @param CHK filled up with the required CPU features,
- // that need to be tested first in runtime before executing call belong to
- // the compiled object.
- // 2- The required optimization name, same as in 'NPY__CPU_TARGET_CURRENT'
- // 3- Extra arguments in the macro itself
- //
- // By default the callback calls are sorted depending on the highest interest
- // unless the policy "$keep_sort" was in place within the configuration statements
- // see "Dive into the CPU dispatcher" for more clarification.
- #define NPY__CPU_DISPATCH_CALL(CHK, CB, ...) \
- NPY__CPU_DISPATCH_EXPAND_(CB((CHK(AVX512F)), AVX512F, __VA_ARGS__)) \
- NPY__CPU_DISPATCH_EXPAND_(CB((CHK(SSE)&&CHK(SSE2)&&CHK(SSE3)&&CHK(SSSE3)&&CHK(SSE41)), SSE41, __VA_ARGS__))
-
- An example of using the config header in light of the above:
-
- .. code:: c
-
- // NOTE: The following macros are defined for demonstration purposes only.
- // NumPy already has a collection of macros located at
- // numpy/numpy/core/src/common/npy_cpu_dispatch.h, that covers all dispatching
- // and declarations scenarios.
-
- #include "numpy/npy_cpu_features.h" // NPY_CPU_HAVE
- #include "numpy/utils.h" // NPY_CAT, NPY_EXPAND
-
- // An example for setting a macro that calls all the exported symbols at once
- // after checking if they're supported by the running machine.
- #define DISPATCH_CALL_ALL(FN, ARGS) \
- NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_ALL_CB, FN, ARGS) \
- NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_ALL_CB, FN, ARGS)
- // The preprocessor callbacks.
- // The same suffixes as we define it in the dispatch-able source.
- #define DISPATCH_CALL_ALL_CB(CHECK, TARGET_NAME, FN, ARGS) \
- if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; }
- #define DISPATCH_CALL_BASELINE_ALL_CB(FN, ARGS) \
- FN NPY_EXPAND(ARGS);
-
- // An example for setting a macro that calls the exported symbols of highest
- // interest optimization, after checking if they're supported by the running machine.
- #define DISPATCH_CALL_HIGH(FN, ARGS) \
- if (0) {} \
- NPY__CPU_DISPATCH_CALL(NPY_CPU_HAVE, DISPATCH_CALL_HIGH_CB, FN, ARGS) \
- NPY__CPU_DISPATCH_BASELINE_CALL(DISPATCH_CALL_BASELINE_HIGH_CB, FN, ARGS)
- // The preprocessor callbacks
- // The same suffixes as we define it in the dispatch-able source.
- #define DISPATCH_CALL_HIGH_CB(CHECK, TARGET_NAME, FN, ARGS) \
- else if (CHECK) { NPY_CAT(NPY_CAT(FN, _), TARGET_NAME) ARGS; }
- #define DISPATCH_CALL_BASELINE_HIGH_CB(FN, ARGS) \
- else { FN NPY_EXPAND(ARGS); }
-
- // NumPy has a macro called 'NPY_CPU_DISPATCH_DECLARE' that can be used
- // for forward declarations of any kind of prototypes based on
- // 'NPY__CPU_DISPATCH_CALL' and 'NPY__CPU_DISPATCH_BASELINE_CALL'.
- // However, in this example, we just handle it manually.
- void simd_whoami(const char *extra_info);
- void simd_whoami_AVX512F(const char *extra_info);
- void simd_whoami_SSE41(const char *extra_info);
-
- void trigger_me(void)
- {
- // bring in the auto-generated config header
- // which contains the config macros 'NPY__CPU_DISPATCH_CALL' and
- // 'NPY__CPU_DISPATCH_BASELINE_CALL'.
- // it is highly recommended to include the config header before executing
- // the dispatching macros in case there's another header in the scope.
- #include "hello.dispatch.h"
- DISPATCH_CALL_ALL(simd_whoami, ("all"))
- DISPATCH_CALL_HIGH(simd_whoami, ("the highest interest"))
- // An example of including multiple config headers in the same source
- // #include "hello2.dispatch.h"
- // DISPATCH_CALL_HIGH(another_function, ("the highest interest"))
- }
-
-
-Dive into the CPU dispatcher
-============================
-
-The baseline
-~~~~~~~~~~~~
-
-Dispatcher
-~~~~~~~~~~
-
-Groups and Policies
-~~~~~~~~~~~~~~~~~~~
-
-Examples
-~~~~~~~~
-
-Report and Trace
-~~~~~~~~~~~~~~~~
-
-
-.. _`Universal Intrinsics`: https://numpy.org/neps/nep-0038-SIMD-optimizations.html
+The location of this document has changed. If you are not
+redirected in a few seconds, `click here <index.html>`_.