summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSebastian Berg <sebastian@sipsolutions.net>2020-08-21 08:21:12 -0500
committerGitHub <noreply@github.com>2020-08-21 08:21:12 -0500
commit4cba2d91e1546872d29af6b25ad35947f27e03ac (patch)
tree4dfa46e8428889df6f7bde185f851e33fbf9434f
parent0bd548e287b9e2fd0126f64d8be01a812e3c6a48 (diff)
parentf7df11c52b48a7dd22cb4fc968d8af7cce4a2c5f (diff)
downloadnumpy-4cba2d91e1546872d29af6b25ad35947f27e03ac.tar.gz
Merge pull request #17109 from eric-wieser/split-einsum
MAINT: Split einsum into multiple files
-rw-r--r--numpy/core/setup.py3
-rw-r--r--numpy/core/src/multiarray/einsum.c.src1895
-rw-r--r--numpy/core/src/multiarray/einsum_debug.h28
-rw-r--r--numpy/core/src/multiarray/einsum_sumprod.c.src1897
-rw-r--r--numpy/core/src/multiarray/einsum_sumprod.h12
5 files changed, 1942 insertions, 1893 deletions
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index aede12080..a4a84397d 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -790,6 +790,8 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'descriptor.h'),
join('src', 'multiarray', 'dtypemeta.h'),
join('src', 'multiarray', 'dragon4.h'),
+ join('src', 'multiarray', 'einsum_debug.h'),
+ join('src', 'multiarray', 'einsum_sumprod.h'),
join('src', 'multiarray', 'getset.h'),
join('src', 'multiarray', 'hashdescr.h'),
join('src', 'multiarray', 'iterators.h'),
@@ -853,6 +855,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'multiarray', 'dragon4.c'),
join('src', 'multiarray', 'dtype_transfer.c'),
join('src', 'multiarray', 'einsum.c.src'),
+ join('src', 'multiarray', 'einsum_sumprod.c.src'),
join('src', 'multiarray', 'flagsobject.c'),
join('src', 'multiarray', 'getset.c'),
join('src', 'multiarray', 'hashdescr.c'),
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 2538e05c6..6ad375f67 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -16,7 +16,6 @@
#define _MULTIARRAYMODULE
#include <numpy/npy_common.h>
#include <numpy/arrayobject.h>
-#include <numpy/halffloat.h>
#include <npy_pycompat.h>
#include <ctype.h>
@@ -25,1898 +24,8 @@
#include "common.h"
#include "ctors.h"
-#ifdef NPY_HAVE_SSE_INTRINSICS
-#define EINSUM_USE_SSE1 1
-#else
-#define EINSUM_USE_SSE1 0
-#endif
-
-#ifdef NPY_HAVE_SSE2_INTRINSICS
-#define EINSUM_USE_SSE2 1
-#else
-#define EINSUM_USE_SSE2 0
-#endif
-
-#if EINSUM_USE_SSE1
-#include <xmmintrin.h>
-#endif
-
-#if EINSUM_USE_SSE2
-#include <emmintrin.h>
-#endif
-
-#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
-
-/********** PRINTF DEBUG TRACING **************/
-#define NPY_EINSUM_DBG_TRACING 0
-
-#if NPY_EINSUM_DBG_TRACING
-#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s);
-#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1);
-#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2);
-#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s);
-#else
-#define NPY_EINSUM_DBG_PRINT(s)
-#define NPY_EINSUM_DBG_PRINT1(s, p1)
-#define NPY_EINSUM_DBG_PRINT2(s, p1, p2)
-#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3)
-#endif
-/**********************************************/
-
-/**begin repeat
- * #name = byte, short, int, long, longlong,
- * ubyte, ushort, uint, ulong, ulonglong,
- * half, float, double, longdouble,
- * cfloat, cdouble, clongdouble#
- * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
- * npy_half, npy_float, npy_double, npy_longdouble,
- * npy_cfloat, npy_cdouble, npy_clongdouble#
- * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
- * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
- * npy_float, npy_float, npy_double, npy_longdouble,
- * npy_float, npy_double, npy_longdouble#
- * #to = ,,,,,
- * ,,,,,
- * npy_float_to_half,,,,
- * ,,#
- * #from = ,,,,,
- * ,,,,,
- * npy_half_to_float,,,,
- * ,,#
- * #complex = 0*5,
- * 0*5,
- * 0*4,
- * 1*3#
- * #float32 = 0*5,
- * 0*5,
- * 0,1,0,0,
- * 0*3#
- * #float64 = 0*5,
- * 0*5,
- * 0,0,1,0,
- * 0*3#
- */
-
-/**begin repeat1
- * #nop = 1, 2, 3, 1000#
- * #noplabel = one, two, three, any#
- */
-static void
-@name@_sum_of_products_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
- char *data0 = dataptr[0];
- npy_intp stride0 = strides[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3) && !@complex@
- char *data1 = dataptr[1];
- npy_intp stride1 = strides[1];
-#endif
-#if (@nop@ == 3) && !@complex@
- char *data2 = dataptr[2];
- npy_intp stride2 = strides[2];
-#endif
-#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
- char *data_out = dataptr[@nop@];
- npy_intp stride_out = strides[@nop@];
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count);
-
- while (count--) {
-#if !@complex@
-# if @nop@ == 1
- *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) +
- @from@(*(@type@ *)data_out));
- data0 += stride0;
- data_out += stride_out;
-# elif @nop@ == 2
- *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
- @from@(*(@type@ *)data1) +
- @from@(*(@type@ *)data_out));
- data0 += stride0;
- data1 += stride1;
- data_out += stride_out;
-# elif @nop@ == 3
- *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
- @from@(*(@type@ *)data1) *
- @from@(*(@type@ *)data2) +
- @from@(*(@type@ *)data_out));
- data0 += stride0;
- data1 += stride1;
- data2 += stride2;
- data_out += stride_out;
-# else
- @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
- int i;
- for (i = 1; i < nop; ++i) {
- temp *= @from@(*(@type@ *)dataptr[i]);
- }
- *(@type@ *)dataptr[nop] = @to@(temp +
- @from@(*(@type@ *)dataptr[i]));
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += strides[i];
- }
-# endif
-#else /* complex */
-# if @nop@ == 1
- ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] +
- ((@temptype@ *)data_out)[0];
- ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] +
- ((@temptype@ *)data_out)[1];
- data0 += stride0;
- data_out += stride_out;
-# else
-# if @nop@ <= 3
-#define _SUMPROD_NOP @nop@
-# else
-#define _SUMPROD_NOP nop
-# endif
- @temptype@ re, im, tmp;
- int i;
- re = ((@temptype@ *)dataptr[0])[0];
- im = ((@temptype@ *)dataptr[0])[1];
- for (i = 1; i < _SUMPROD_NOP; ++i) {
- tmp = re * ((@temptype@ *)dataptr[i])[0] -
- im * ((@temptype@ *)dataptr[i])[1];
- im = re * ((@temptype@ *)dataptr[i])[1] +
- im * ((@temptype@ *)dataptr[i])[0];
- re = tmp;
- }
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
-
- for (i = 0; i <= _SUMPROD_NOP; ++i) {
- dataptr[i] += strides[i];
- }
-#undef _SUMPROD_NOP
-# endif
-#endif
- }
-}
-
-#if @nop@ == 1
-
-static void
-@name@_sum_of_products_contig_one(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @type@ *data_out = (@type@ *)dataptr[1];
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
-#if !@complex@
- data_out[@i@] = @to@(@from@(data0[@i@]) +
- @from@(data_out[@i@]));
-#else
- ((@temptype@ *)data_out + 2*@i@)[0] =
- ((@temptype@ *)data0 + 2*@i@)[0] +
- ((@temptype@ *)data_out + 2*@i@)[0];
- ((@temptype@ *)data_out + 2*@i@)[1] =
- ((@temptype@ *)data0 + 2*@i@)[1] +
- ((@temptype@ *)data_out + 2*@i@)[1];
-#endif
-/**end repeat2**/
- case 0:
- return;
- }
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-#if !@complex@
- data_out[@i@] = @to@(@from@(data0[@i@]) +
- @from@(data_out[@i@]));
-#else /* complex */
- ((@temptype@ *)data_out + 2*@i@)[0] =
- ((@temptype@ *)data0 + 2*@i@)[0] +
- ((@temptype@ *)data_out + 2*@i@)[0];
- ((@temptype@ *)data_out + 2*@i@)[1] =
- ((@temptype@ *)data0 + 2*@i@)[1] +
- ((@temptype@ *)data_out + 2*@i@)[1];
-#endif
-/**end repeat2**/
- data0 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-#elif @nop@ == 2 && !@complex@
-
-static void
-@name@_sum_of_products_contig_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @type@ *data1 = (@type@ *)dataptr[1];
- @type@ *data_out = (@type@ *)dataptr[2];
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, b;
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, b;
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- data_out[@i@] = @to@(@from@(data0[@i@]) *
- @from@(data1[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
- case 0:
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
- EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
- b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
- _mm_store_ps(data_out+@i@, b);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
- EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
- b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
- _mm_store_pd(data_out+@i@, b);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
- b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
- _mm_storeu_ps(data_out+@i@, b);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
- b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
- _mm_storeu_pd(data_out+@i@, b);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- data_out[@i@] = @to@(@from@(data0[@i@]) *
- @from@(data1[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
-#endif
- data0 += 8;
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-/* Some extra specializations for the two operand case */
-static void
-@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
- @type@ *data1 = (@type@ *)dataptr[1];
- @type@ *data_out = (@type@ *)dataptr[2];
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, b, value0_sse;
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, b, value0_sse;
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- data_out[@i@] = @to@(value0 *
- @from@(data1[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
- case 0:
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- value0_sse = _mm_set_ps1(value0);
-
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@));
- b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
- _mm_store_ps(data_out+@i@, b);
-/**end repeat2**/
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- if (count > 0) {
- goto finish_after_unrolled_loop;
- }
- else {
- return;
- }
- }
-#elif EINSUM_USE_SSE2 && @float64@
- value0_sse = _mm_set1_pd(value0);
-
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@));
- b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
- _mm_store_pd(data_out+@i@, b);
-/**end repeat2**/
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- if (count > 0) {
- goto finish_after_unrolled_loop;
- }
- else {
- return;
- }
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@));
- b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
- _mm_storeu_ps(data_out+@i@, b);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@));
- b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
- _mm_storeu_pd(data_out+@i@, b);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- data_out[@i@] = @to@(value0 *
- @from@(data1[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
-#endif
- data1 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- if (count > 0) {
- goto finish_after_unrolled_loop;
- }
-}
-
-static void
-@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
- @type@ *data_out = (@type@ *)dataptr[2];
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, b, value1_sse;
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, b, value1_sse;
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- data_out[@i@] = @to@(@from@(data0[@i@])*
- value1 +
- @from@(data_out[@i@]));
-/**end repeat2**/
- case 0:
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- value1_sse = _mm_set_ps1(value1);
-
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse);
- b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
- _mm_store_ps(data_out+@i@, b);
-/**end repeat2**/
- data0 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- value1_sse = _mm_set1_pd(value1);
-
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse);
- b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
- _mm_store_pd(data_out+@i@, b);
-/**end repeat2**/
- data0 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse);
- b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
- _mm_storeu_ps(data_out+@i@, b);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse);
- b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
- _mm_storeu_pd(data_out+@i@, b);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- data_out[@i@] = @to@(@from@(data0[@i@])*
- value1 +
- @from@(data_out[@i@]));
-/**end repeat2**/
-#endif
- data0 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-static void
-@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @type@ *data1 = (@type@ *)dataptr[1];
- @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- accum += @from@(data0[@i@]) * @from@(data1[@i@]);
-/**end repeat2**/
- case 0:
- *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
- accum_sse = _mm_add_ps(accum_sse, a);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- }
-
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
- accum_sse = _mm_add_pd(accum_sse, a);
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- }
-
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
- accum_sse = _mm_add_ps(accum_sse, a);
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
- _mm_prefetch(data1 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
- accum_sse = _mm_add_pd(accum_sse, a);
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- accum += @from@(data0[@i@]) * @from@(data1[@i@]);
-/**end repeat2**/
-#endif
- data0 += 8;
- data1 += 8;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-static void
-@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
- @type@ *data1 = (@type@ *)dataptr[1];
- @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- accum += @from@(data1[@i@]);
-/**end repeat2**/
- case 0:
- *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
-/**end repeat2**/
- data1 += 8;
- }
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data1)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@));
-/**end repeat2**/
- data1 += 8;
- }
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- accum += @from@(data1[@i@]);
-/**end repeat2**/
-#endif
- data1 += 8;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-static void
-@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
- @temptype@ accum = 0;
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
- accum += @from@(data0[@i@]);
-/**end repeat2**/
- case 0:
- *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- accum += @from@(data0[@i@]);
-/**end repeat2**/
-#endif
- data0 += 8;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-#elif @nop@ == 3 && !@complex@
-
-static void
-@name@_sum_of_products_contig_three(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- @type@ *data0 = (@type@ *)dataptr[0];
- @type@ *data1 = (@type@ *)dataptr[1];
- @type@ *data2 = (@type@ *)dataptr[2];
- @type@ *data_out = (@type@ *)dataptr[3];
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- data_out[@i@] = @to@(@from@(data0[@i@]) *
- @from@(data1[@i@]) *
- @from@(data2[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
- data0 += 8;
- data1 += 8;
- data2 += 8;
- data_out += 8;
- }
-
- /* Finish off the loop */
-
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- if (count-- == 0) {
- return;
- }
- data_out[@i@] = @to@(@from@(data0[@i@]) *
- @from@(data1[@i@]) *
- @from@(data2[@i@]) +
- @from@(data_out[@i@]));
-/**end repeat2**/
-}
-
-#else /* @nop@ > 3 || @complex */
-
-static void
-@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
- npy_intp const *NPY_UNUSED(strides), npy_intp count)
-{
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n",
- (int)count);
-
- while (count--) {
-#if !@complex@
- @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
- int i;
- for (i = 1; i < nop; ++i) {
- temp *= @from@(*(@type@ *)dataptr[i]);
- }
- *(@type@ *)dataptr[nop] = @to@(temp +
- @from@(*(@type@ *)dataptr[i]));
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += sizeof(@type@);
- }
-#else /* complex */
-# if @nop@ <= 3
-# define _SUMPROD_NOP @nop@
-# else
-# define _SUMPROD_NOP nop
-# endif
- @temptype@ re, im, tmp;
- int i;
- re = ((@temptype@ *)dataptr[0])[0];
- im = ((@temptype@ *)dataptr[0])[1];
- for (i = 1; i < _SUMPROD_NOP; ++i) {
- tmp = re * ((@temptype@ *)dataptr[i])[0] -
- im * ((@temptype@ *)dataptr[i])[1];
- im = re * ((@temptype@ *)dataptr[i])[1] +
- im * ((@temptype@ *)dataptr[i])[0];
- re = tmp;
- }
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
- ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
-
- for (i = 0; i <= _SUMPROD_NOP; ++i) {
- dataptr[i] += sizeof(@type@);
- }
-# undef _SUMPROD_NOP
-#endif
- }
-}
-
-#endif /* functions for various @nop@ */
-
-#if @nop@ == 1
-
-static void
-@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if @complex@
- @temptype@ accum_re = 0, accum_im = 0;
- @temptype@ *data0 = (@temptype@ *)dataptr[0];
-#else
- @temptype@ accum = 0;
- @type@ *data0 = (@type@ *)dataptr[0];
-#endif
-
-#if EINSUM_USE_SSE1 && @float32@
- __m128 a, accum_sse = _mm_setzero_ps();
-#elif EINSUM_USE_SSE2 && @float64@
- __m128d a, accum_sse = _mm_setzero_pd();
-#endif
-
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
- (int)count);
-
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat2
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
-#if !@complex@
- accum += @from@(data0[@i@]);
-#else /* complex */
- accum_re += data0[2*@i@+0];
- accum_im += data0[2*@i@+1];
-#endif
-/**end repeat2**/
- case 0:
-#if @complex@
- ((@temptype@ *)dataptr[1])[0] += accum_re;
- ((@temptype@ *)dataptr[1])[1] += accum_im;
-#else
- *((@type@ *)dataptr[1]) = @to@(accum +
- @from@(*((@type@ *)dataptr[1])));
-#endif
- return;
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
-
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#elif EINSUM_USE_SSE2 && @float64@
- /* Use aligned instructions if possible */
- if (EINSUM_IS_SSE_ALIGNED(data0)) {
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
-/**end repeat2**/
- data0 += 8;
- }
-
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
- }
-#endif
-
- /* Unroll the loop by 8 */
- while (count >= 8) {
- count -= 8;
-
-#if EINSUM_USE_SSE1 && @float32@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 4#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
-/**end repeat2**/
-#elif EINSUM_USE_SSE2 && @float64@
- _mm_prefetch(data0 + 512, _MM_HINT_T0);
-
-/**begin repeat2
- * #i = 0, 2, 4, 6#
- */
- /*
- * NOTE: This accumulation changes the order, so will likely
- * produce slightly different results.
- */
- accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
-/**end repeat2**/
-#else
-/**begin repeat2
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
-# if !@complex@
- accum += @from@(data0[@i@]);
-# else /* complex */
- accum_re += data0[2*@i@+0];
- accum_im += data0[2*@i@+1];
-# endif
-/**end repeat2**/
-#endif
-
-#if !@complex@
- data0 += 8;
-#else
- data0 += 8*2;
-#endif
- }
-
-#if EINSUM_USE_SSE1 && @float32@
- /* Add the four SSE values and put in accum */
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
- accum_sse = _mm_add_ps(a, accum_sse);
- a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
- accum_sse = _mm_add_ps(a, accum_sse);
- _mm_store_ss(&accum, accum_sse);
-#elif EINSUM_USE_SSE2 && @float64@
- /* Add the two SSE2 values and put in accum */
- a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
- accum_sse = _mm_add_pd(a, accum_sse);
- _mm_store_sd(&accum, accum_sse);
-#endif
-
- /* Finish off the loop */
- goto finish_after_unrolled_loop;
-}
-
-#endif /* @nop@ == 1 */
-
-static void
-@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if @complex@
- @temptype@ accum_re = 0, accum_im = 0;
-#else
- @temptype@ accum = 0;
-#endif
-
-#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
- char *data0 = dataptr[0];
- npy_intp stride0 = strides[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3) && !@complex@
- char *data1 = dataptr[1];
- npy_intp stride1 = strides[1];
-#endif
-#if (@nop@ == 3) && !@complex@
- char *data2 = dataptr[2];
- npy_intp stride2 = strides[2];
-#endif
-
- NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n",
- (int)count);
-
- while (count--) {
-#if !@complex@
-# if @nop@ == 1
- accum += @from@(*(@type@ *)data0);
- data0 += stride0;
-# elif @nop@ == 2
- accum += @from@(*(@type@ *)data0) *
- @from@(*(@type@ *)data1);
- data0 += stride0;
- data1 += stride1;
-# elif @nop@ == 3
- accum += @from@(*(@type@ *)data0) *
- @from@(*(@type@ *)data1) *
- @from@(*(@type@ *)data2);
- data0 += stride0;
- data1 += stride1;
- data2 += stride2;
-# else
- @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
- int i;
- for (i = 1; i < nop; ++i) {
- temp *= @from@(*(@type@ *)dataptr[i]);
- }
- accum += temp;
- for (i = 0; i < nop; ++i) {
- dataptr[i] += strides[i];
- }
-# endif
-#else /* complex */
-# if @nop@ == 1
- accum_re += ((@temptype@ *)data0)[0];
- accum_im += ((@temptype@ *)data0)[1];
- data0 += stride0;
-# else
-# if @nop@ <= 3
-#define _SUMPROD_NOP @nop@
-# else
-#define _SUMPROD_NOP nop
-# endif
- @temptype@ re, im, tmp;
- int i;
- re = ((@temptype@ *)dataptr[0])[0];
- im = ((@temptype@ *)dataptr[0])[1];
- for (i = 1; i < _SUMPROD_NOP; ++i) {
- tmp = re * ((@temptype@ *)dataptr[i])[0] -
- im * ((@temptype@ *)dataptr[i])[1];
- im = re * ((@temptype@ *)dataptr[i])[1] +
- im * ((@temptype@ *)dataptr[i])[0];
- re = tmp;
- }
- accum_re += re;
- accum_im += im;
- for (i = 0; i < _SUMPROD_NOP; ++i) {
- dataptr[i] += strides[i];
- }
-#undef _SUMPROD_NOP
-# endif
-#endif
- }
-
-#if @complex@
-# if @nop@ <= 3
- ((@temptype@ *)dataptr[@nop@])[0] += accum_re;
- ((@temptype@ *)dataptr[@nop@])[1] += accum_im;
-# else
- ((@temptype@ *)dataptr[nop])[0] += accum_re;
- ((@temptype@ *)dataptr[nop])[1] += accum_im;
-# endif
-#else
-# if @nop@ <= 3
- *((@type@ *)dataptr[@nop@]) = @to@(accum +
- @from@(*((@type@ *)dataptr[@nop@])));
-# else
- *((@type@ *)dataptr[nop]) = @to@(accum +
- @from@(*((@type@ *)dataptr[nop])));
-# endif
-#endif
-
-}
-
-/**end repeat1**/
-
-/**end repeat**/
-
-
-/* Do OR of ANDs for the boolean type */
-
-/**begin repeat
- * #nop = 1, 2, 3, 1000#
- * #noplabel = one, two, three, any#
- */
-
-static void
-bool_sum_of_products_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if (@nop@ <= 3)
- char *data0 = dataptr[0];
- npy_intp stride0 = strides[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3)
- char *data1 = dataptr[1];
- npy_intp stride1 = strides[1];
-#endif
-#if (@nop@ == 3)
- char *data2 = dataptr[2];
- npy_intp stride2 = strides[2];
-#endif
-#if (@nop@ <= 3)
- char *data_out = dataptr[@nop@];
- npy_intp stride_out = strides[@nop@];
-#endif
-
- while (count--) {
-#if @nop@ == 1
- *(npy_bool *)data_out = *(npy_bool *)data0 ||
- *(npy_bool *)data_out;
- data0 += stride0;
- data_out += stride_out;
-#elif @nop@ == 2
- *(npy_bool *)data_out = (*(npy_bool *)data0 &&
- *(npy_bool *)data1) ||
- *(npy_bool *)data_out;
- data0 += stride0;
- data1 += stride1;
- data_out += stride_out;
-#elif @nop@ == 3
- *(npy_bool *)data_out = (*(npy_bool *)data0 &&
- *(npy_bool *)data1 &&
- *(npy_bool *)data2) ||
- *(npy_bool *)data_out;
- data0 += stride0;
- data1 += stride1;
- data2 += stride2;
- data_out += stride_out;
-#else
- npy_bool temp = *(npy_bool *)dataptr[0];
- int i;
- for (i = 1; i < nop; ++i) {
- temp = temp && *(npy_bool *)dataptr[i];
- }
- *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += strides[i];
- }
-#endif
- }
-}
-
-static void
-bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
-#if (@nop@ <= 3)
- char *data0 = dataptr[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3)
- char *data1 = dataptr[1];
-#endif
-#if (@nop@ == 3)
- char *data2 = dataptr[2];
-#endif
-#if (@nop@ <= 3)
- char *data_out = dataptr[@nop@];
-#endif
-
-#if (@nop@ <= 3)
-/* This is placed before the main loop to make small counts faster */
-finish_after_unrolled_loop:
- switch (count) {
-/**begin repeat1
- * #i = 6, 5, 4, 3, 2, 1, 0#
- */
- case @i@+1:
-# if @nop@ == 1
- ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] ||
- ((npy_bool *)data_out)[@i@];
-# elif @nop@ == 2
- ((npy_bool *)data_out)[@i@] =
- (((npy_bool *)data0)[@i@] &&
- ((npy_bool *)data1)[@i@]) ||
- ((npy_bool *)data_out)[@i@];
-# elif @nop@ == 3
- ((npy_bool *)data_out)[@i@] =
- (((npy_bool *)data0)[@i@] &&
- ((npy_bool *)data1)[@i@] &&
- ((npy_bool *)data2)[@i@]) ||
- ((npy_bool *)data_out)[@i@];
-# endif
-/**end repeat1**/
- case 0:
- return;
- }
-#endif
-
-/* Unroll the loop by 8 for fixed-size nop */
-#if (@nop@ <= 3)
- while (count >= 8) {
- count -= 8;
-#else
- while (count--) {
-#endif
-
-# if @nop@ == 1
-/**begin repeat1
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) ||
- (*((npy_bool *)data_out + @i@));
-/**end repeat1**/
- data0 += 8*sizeof(npy_bool);
- data_out += 8*sizeof(npy_bool);
-# elif @nop@ == 2
-/**begin repeat1
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- *((npy_bool *)data_out + @i@) =
- ((*((npy_bool *)data0 + @i@)) &&
- (*((npy_bool *)data1 + @i@))) ||
- (*((npy_bool *)data_out + @i@));
-/**end repeat1**/
- data0 += 8*sizeof(npy_bool);
- data1 += 8*sizeof(npy_bool);
- data_out += 8*sizeof(npy_bool);
-# elif @nop@ == 3
-/**begin repeat1
- * #i = 0, 1, 2, 3, 4, 5, 6, 7#
- */
- *((npy_bool *)data_out + @i@) =
- ((*((npy_bool *)data0 + @i@)) &&
- (*((npy_bool *)data1 + @i@)) &&
- (*((npy_bool *)data2 + @i@))) ||
- (*((npy_bool *)data_out + @i@));
-/**end repeat1**/
- data0 += 8*sizeof(npy_bool);
- data1 += 8*sizeof(npy_bool);
- data2 += 8*sizeof(npy_bool);
- data_out += 8*sizeof(npy_bool);
-# else
- npy_bool temp = *(npy_bool *)dataptr[0];
- int i;
- for (i = 1; i < nop; ++i) {
- temp = temp && *(npy_bool *)dataptr[i];
- }
- *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += sizeof(npy_bool);
- }
-# endif
- }
-
- /* If the loop was unrolled, we need to finish it off */
-#if (@nop@ <= 3)
- goto finish_after_unrolled_loop;
-#endif
-}
-
-static void
-bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
- npy_intp const *strides, npy_intp count)
-{
- npy_bool accum = 0;
-
-#if (@nop@ <= 3)
- char *data0 = dataptr[0];
- npy_intp stride0 = strides[0];
-#endif
-#if (@nop@ == 2 || @nop@ == 3)
- char *data1 = dataptr[1];
- npy_intp stride1 = strides[1];
-#endif
-#if (@nop@ == 3)
- char *data2 = dataptr[2];
- npy_intp stride2 = strides[2];
-#endif
-
- while (count--) {
-#if @nop@ == 1
- accum = *(npy_bool *)data0 || accum;
- data0 += stride0;
-#elif @nop@ == 2
- accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum;
- data0 += stride0;
- data1 += stride1;
-#elif @nop@ == 3
- accum = (*(npy_bool *)data0 &&
- *(npy_bool *)data1 &&
- *(npy_bool *)data2) || accum;
- data0 += stride0;
- data1 += stride1;
- data2 += stride2;
-#else
- npy_bool temp = *(npy_bool *)dataptr[0];
- int i;
- for (i = 1; i < nop; ++i) {
- temp = temp && *(npy_bool *)dataptr[i];
- }
- accum = temp || accum;
- for (i = 0; i <= nop; ++i) {
- dataptr[i] += strides[i];
- }
-#endif
- }
-
-# if @nop@ <= 3
- *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]);
-# else
- *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]);
-# endif
-}
-
-/**end repeat**/
-
-typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp);
-
-/* These tables need to match up with the type enum */
-static sum_of_products_fn
-_contig_outstride0_unary_specialization_table[NPY_NTYPES] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 0,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 1, 1, 1,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
- &@name@_sum_of_products_contig_outstride0_one,
-#else
- NULL,
-#endif
-/**end repeat**/
-}; /* End of _contig_outstride0_unary_specialization_table */
-
-static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 0,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 0, 0, 0,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
-{
- &@name@_sum_of_products_stride0_contig_outstride0_two,
- &@name@_sum_of_products_stride0_contig_outcontig_two,
- &@name@_sum_of_products_contig_stride0_outstride0_two,
- &@name@_sum_of_products_contig_stride0_outcontig_two,
- &@name@_sum_of_products_contig_contig_outstride0_two,
-},
-#else
- {NULL, NULL, NULL, NULL, NULL},
-#endif
-/**end repeat**/
-}; /* End of _binary_specialization_table */
-
-static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 1, 1, 1,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
-{
- &@name@_sum_of_products_outstride0_any,
- &@name@_sum_of_products_outstride0_one,
- &@name@_sum_of_products_outstride0_two,
- &@name@_sum_of_products_outstride0_three
-},
-#else
- {NULL, NULL, NULL, NULL},
-#endif
-/**end repeat**/
-}; /* End of _outstride0_specialized_table */
-
-static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 1, 1, 1,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
-{
- &@name@_sum_of_products_contig_any,
- &@name@_sum_of_products_contig_one,
- &@name@_sum_of_products_contig_two,
- &@name@_sum_of_products_contig_three
-},
-#else
- {NULL, NULL, NULL, NULL},
-#endif
-/**end repeat**/
-}; /* End of _allcontig_specialized_table */
-
-static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
-/**begin repeat
- * #name = bool,
- * byte, ubyte,
- * short, ushort,
- * int, uint,
- * long, ulong,
- * longlong, ulonglong,
- * float, double, longdouble,
- * cfloat, cdouble, clongdouble,
- * object, string, unicode, void,
- * datetime, timedelta, half#
- * #use = 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1,
- * 1, 1, 1,
- * 1, 1, 1,
- * 0, 0, 0, 0,
- * 0, 0, 1#
- */
-#if @use@
-{
- &@name@_sum_of_products_any,
- &@name@_sum_of_products_one,
- &@name@_sum_of_products_two,
- &@name@_sum_of_products_three
-},
-#else
- {NULL, NULL, NULL, NULL},
-#endif
-/**end repeat**/
-}; /* End of _unnspecialized_table */
-
-static sum_of_products_fn
-get_sum_of_products_function(int nop, int type_num,
- npy_intp itemsize, npy_intp const *fixed_strides)
-{
- int iop;
-
- if (type_num >= NPY_NTYPES) {
- return NULL;
- }
-
- /* contiguous reduction */
- if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) {
- sum_of_products_fn ret =
- _contig_outstride0_unary_specialization_table[type_num];
- if (ret != NULL) {
- return ret;
- }
- }
-
- /* nop of 2 has more specializations */
- if (nop == 2) {
- /* Encode the zero/contiguous strides */
- int code;
- code = (fixed_strides[0] == 0) ? 0 :
- (fixed_strides[0] == itemsize) ? 2*2*1 : 8;
- code += (fixed_strides[1] == 0) ? 0 :
- (fixed_strides[1] == itemsize) ? 2*1 : 8;
- code += (fixed_strides[2] == 0) ? 0 :
- (fixed_strides[2] == itemsize) ? 1 : 8;
- if (code >= 2 && code < 7) {
- sum_of_products_fn ret =
- _binary_specialization_table[type_num][code-2];
- if (ret != NULL) {
- return ret;
- }
- }
- }
-
- /* Inner loop with an output stride of 0 */
- if (fixed_strides[nop] == 0) {
- return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0];
- }
-
- /* Check for all contiguous */
- for (iop = 0; iop < nop + 1; ++iop) {
- if (fixed_strides[iop] != itemsize) {
- break;
- }
- }
-
- /* Contiguous loop */
- if (iop == nop + 1) {
- return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0];
- }
-
- /* None of the above specializations caught it, general loops */
- return _unspecialized_table[type_num][nop <= 3 ? nop : 0];
-}
+#include "einsum_sumprod.h"
+#include "einsum_debug.h"
/*
diff --git a/numpy/core/src/multiarray/einsum_debug.h b/numpy/core/src/multiarray/einsum_debug.h
new file mode 100644
index 000000000..9aa81fcbd
--- /dev/null
+++ b/numpy/core/src/multiarray/einsum_debug.h
@@ -0,0 +1,28 @@
+/*
+ * This file provides debug macros used by the other einsum files.
+ *
+ * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+#ifndef _NPY_MULTIARRAY_EINSUM_DEBUG_H
+#define _NPY_MULTIARRAY_EINSUM_DEBUG_H
+
+/********** PRINTF DEBUG TRACING **************/
+#define NPY_EINSUM_DBG_TRACING 0
+
+#if NPY_EINSUM_DBG_TRACING
+#include <cstdio>
+#define NPY_EINSUM_DBG_PRINT(s) printf("%s", s);
+#define NPY_EINSUM_DBG_PRINT1(s, p1) printf(s, p1);
+#define NPY_EINSUM_DBG_PRINT2(s, p1, p2) printf(s, p1, p2);
+#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3) printf(s);
+#else
+#define NPY_EINSUM_DBG_PRINT(s)
+#define NPY_EINSUM_DBG_PRINT1(s, p1)
+#define NPY_EINSUM_DBG_PRINT2(s, p1, p2)
+#define NPY_EINSUM_DBG_PRINT3(s, p1, p2, p3)
+#endif
+
+#endif
diff --git a/numpy/core/src/multiarray/einsum_sumprod.c.src b/numpy/core/src/multiarray/einsum_sumprod.c.src
new file mode 100644
index 000000000..c58e74287
--- /dev/null
+++ b/numpy/core/src/multiarray/einsum_sumprod.c.src
@@ -0,0 +1,1897 @@
+/*
+ * This file provides optimized sum of product implementations used internally
+ * by einsum.
+ *
+ * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+
+#include <numpy/npy_common.h>
+#include <numpy/ndarraytypes.h> /* for NPY_NTYPES */
+#include <numpy/halffloat.h>
+
+#include "einsum_sumprod.h"
+#include "einsum_debug.h"
+
+
+#ifdef NPY_HAVE_SSE_INTRINSICS
+#define EINSUM_USE_SSE1 1
+#else
+#define EINSUM_USE_SSE1 0
+#endif
+
+#ifdef NPY_HAVE_SSE2_INTRINSICS
+#define EINSUM_USE_SSE2 1
+#else
+#define EINSUM_USE_SSE2 0
+#endif
+
+#if EINSUM_USE_SSE1
+#include <xmmintrin.h>
+#endif
+
+#if EINSUM_USE_SSE2
+#include <emmintrin.h>
+#endif
+
+#define EINSUM_IS_SSE_ALIGNED(x) ((((npy_intp)x)&0xf) == 0)
+
+/**********************************************/
+
+/**begin repeat
+ * #name = byte, short, int, long, longlong,
+ * ubyte, ushort, uint, ulong, ulonglong,
+ * half, float, double, longdouble,
+ * cfloat, cdouble, clongdouble#
+ * #type = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_half, npy_float, npy_double, npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble#
+ * #temptype = npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_float, npy_float, npy_double, npy_longdouble,
+ * npy_float, npy_double, npy_longdouble#
+ * #to = ,,,,,
+ * ,,,,,
+ * npy_float_to_half,,,,
+ * ,,#
+ * #from = ,,,,,
+ * ,,,,,
+ * npy_half_to_float,,,,
+ * ,,#
+ * #complex = 0*5,
+ * 0*5,
+ * 0*4,
+ * 1*3#
+ * #float32 = 0*5,
+ * 0*5,
+ * 0,1,0,0,
+ * 0*3#
+ * #float64 = 0*5,
+ * 0*5,
+ * 0,0,1,0,
+ * 0*3#
+ */
+
+/**begin repeat1
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+static void
+@name@_sum_of_products_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3) && !@complex@
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3) && !@complex@
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data_out = dataptr[@nop@];
+ npy_intp stride_out = strides[@nop@];
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_@noplabel@ (%d)\n", (int)count);
+
+ while (count--) {
+#if !@complex@
+# if @nop@ == 1
+ *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) +
+ @from@(*(@type@ *)data_out));
+ data0 += stride0;
+ data_out += stride_out;
+# elif @nop@ == 2
+ *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1) +
+ @from@(*(@type@ *)data_out));
+ data0 += stride0;
+ data1 += stride1;
+ data_out += stride_out;
+# elif @nop@ == 3
+ *(@type@ *)data_out = @to@(@from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1) *
+ @from@(*(@type@ *)data2) +
+ @from@(*(@type@ *)data_out));
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+ data_out += stride_out;
+# else
+ @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(@type@ *)dataptr[i]);
+ }
+ *(@type@ *)dataptr[nop] = @to@(temp +
+ @from@(*(@type@ *)dataptr[i]));
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+# endif
+#else /* complex */
+# if @nop@ == 1
+ ((@temptype@ *)data_out)[0] = ((@temptype@ *)data0)[0] +
+ ((@temptype@ *)data_out)[0];
+ ((@temptype@ *)data_out)[1] = ((@temptype@ *)data0)[1] +
+ ((@temptype@ *)data_out)[1];
+ data0 += stride0;
+ data_out += stride_out;
+# else
+# if @nop@ <= 3
+#define _SUMPROD_NOP @nop@
+# else
+#define _SUMPROD_NOP nop
+# endif
+ @temptype@ re, im, tmp;
+ int i;
+ re = ((@temptype@ *)dataptr[0])[0];
+ im = ((@temptype@ *)dataptr[0])[1];
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ tmp = re * ((@temptype@ *)dataptr[i])[0] -
+ im * ((@temptype@ *)dataptr[i])[1];
+ im = re * ((@temptype@ *)dataptr[i])[1] +
+ im * ((@temptype@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
+
+ for (i = 0; i <= _SUMPROD_NOP; ++i) {
+ dataptr[i] += strides[i];
+ }
+#undef _SUMPROD_NOP
+# endif
+#endif
+ }
+}
+
+#if @nop@ == 1
+
+static void
+@name@_sum_of_products_contig_one(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data_out = (@type@ *)dataptr[1];
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_one (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+#if !@complex@
+ data_out[@i@] = @to@(@from@(data0[@i@]) +
+ @from@(data_out[@i@]));
+#else
+ ((@temptype@ *)data_out + 2*@i@)[0] =
+ ((@temptype@ *)data0 + 2*@i@)[0] +
+ ((@temptype@ *)data_out + 2*@i@)[0];
+ ((@temptype@ *)data_out + 2*@i@)[1] =
+ ((@temptype@ *)data0 + 2*@i@)[1] +
+ ((@temptype@ *)data_out + 2*@i@)[1];
+#endif
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+#if !@complex@
+ data_out[@i@] = @to@(@from@(data0[@i@]) +
+ @from@(data_out[@i@]));
+#else /* complex */
+ ((@temptype@ *)data_out + 2*@i@)[0] =
+ ((@temptype@ *)data0 + 2*@i@)[0] +
+ ((@temptype@ *)data_out + 2*@i@)[0];
+ ((@temptype@ *)data_out + 2*@i@)[1] =
+ ((@temptype@ *)data0 + 2*@i@)[1] +
+ ((@temptype@ *)data_out + 2*@i@)[1];
+#endif
+/**end repeat2**/
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+#elif @nop@ == 2 && !@complex@
+
+static void
+@name@_sum_of_products_contig_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @type@ *data_out = (@type@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, b;
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, b;
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ data_out[@i@] = @to@(@from@(data0[@i@]) *
+ @from@(data1[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
+ EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
+ b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+ _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1) &&
+ EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
+ b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
+ _mm_store_pd(data_out+@i@, b);
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
+ b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+ _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
+ b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
+ _mm_storeu_pd(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(@from@(data0[@i@]) *
+ @from@(data1[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+ data0 += 8;
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+/* Some extra specializations for the two operand case */
+static void
+@name@_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @type@ *data_out = (@type@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, b, value0_sse;
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, b, value0_sse;
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ data_out[@i@] = @to@(value0 *
+ @from@(data1[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ value0_sse = _mm_set_ps1(value0);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(value0_sse, _mm_load_ps(data1+@i@));
+ b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+ _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ if (count > 0) {
+ goto finish_after_unrolled_loop;
+ }
+ else {
+ return;
+ }
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ value0_sse = _mm_set1_pd(value0);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(value0_sse, _mm_load_pd(data1+@i@));
+ b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
+ _mm_store_pd(data_out+@i@, b);
+/**end repeat2**/
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ if (count > 0) {
+ goto finish_after_unrolled_loop;
+ }
+ else {
+ return;
+ }
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(value0_sse, _mm_loadu_ps(data1+@i@));
+ b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+ _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(value0_sse, _mm_loadu_pd(data1+@i@));
+ b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
+ _mm_storeu_pd(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(value0 *
+ @from@(data1[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+ data1 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ if (count > 0) {
+ goto finish_after_unrolled_loop;
+ }
+}
+
+static void
+@name@_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
+ @type@ *data_out = (@type@ *)dataptr[2];
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, b, value1_sse;
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, b, value1_sse;
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ data_out[@i@] = @to@(@from@(data0[@i@])*
+ value1 +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ case 0:
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ value1_sse = _mm_set_ps1(value1);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(_mm_load_ps(data0+@i@), value1_sse);
+ b = _mm_add_ps(a, _mm_load_ps(data_out+@i@));
+ _mm_store_ps(data_out+@i@, b);
+/**end repeat2**/
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ value1_sse = _mm_set1_pd(value1);
+
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data_out)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(_mm_load_pd(data0+@i@), value1_sse);
+ b = _mm_add_pd(a, _mm_load_pd(data_out+@i@));
+ _mm_store_pd(data_out+@i@, b);
+/**end repeat2**/
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), value1_sse);
+ b = _mm_add_ps(a, _mm_loadu_ps(data_out+@i@));
+ _mm_storeu_ps(data_out+@i@, b);
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), value1_sse);
+ b = _mm_add_pd(a, _mm_loadu_pd(data_out+@i@));
+ _mm_storeu_pd(data_out+@i@, b);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(@from@(data0[@i@])*
+ value1 +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+#endif
+ data0 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+static void
+@name@_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @temptype@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_contig_outstride0_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ accum += @from@(data0[@i@]) * @from@(data1[@i@]);
+/**end repeat2**/
+ case 0:
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum);
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_ps(_mm_load_ps(data0+@i@), _mm_load_ps(data1+@i@));
+ accum_sse = _mm_add_ps(accum_sse, a);
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ }
+
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0) && EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_pd(_mm_load_pd(data0+@i@), _mm_load_pd(data1+@i@));
+ accum_sse = _mm_add_pd(accum_sse, a);
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ }
+
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_ps(_mm_loadu_ps(data0+@i@), _mm_loadu_ps(data1+@i@));
+ accum_sse = _mm_add_ps(accum_sse, a);
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+ _mm_prefetch(data1 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ a = _mm_mul_pd(_mm_loadu_pd(data0+@i@), _mm_loadu_pd(data1+@i@));
+ accum_sse = _mm_add_pd(accum_sse, a);
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ accum += @from@(data0[@i@]) * @from@(data1[@i@]);
+/**end repeat2**/
+#endif
+ data0 += 8;
+ data1 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+static void
+@name@_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @temptype@ value0 = @from@(*(@type@ *)dataptr[0]);
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @temptype@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_stride0_contig_outstride0_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ accum += @from@(data1[@i@]);
+/**end repeat2**/
+ case 0:
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + value0 * accum);
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data1+@i@));
+/**end repeat2**/
+ data1 += 8;
+ }
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data1)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data1+@i@));
+/**end repeat2**/
+ data1 += 8;
+ }
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data1+@i@));
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data1+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ accum += @from@(data1[@i@]);
+/**end repeat2**/
+#endif
+ data1 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+static void
+@name@_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @temptype@ value1 = @from@(*(@type@ *)dataptr[1]);
+ @temptype@ accum = 0;
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_stride0_outstride0_two (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+ accum += @from@(data0[@i@]);
+/**end repeat2**/
+ case 0:
+ *(@type@ *)dataptr[2] = @to@(@from@(*(@type@ *)dataptr[2]) + accum * value1);
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ accum += @from@(data0[@i@]);
+/**end repeat2**/
+#endif
+ data0 += 8;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+#elif @nop@ == 3 && !@complex@
+
+static void
+@name@_sum_of_products_contig_three(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ @type@ *data0 = (@type@ *)dataptr[0];
+ @type@ *data1 = (@type@ *)dataptr[1];
+ @type@ *data2 = (@type@ *)dataptr[2];
+ @type@ *data_out = (@type@ *)dataptr[3];
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ data_out[@i@] = @to@(@from@(data0[@i@]) *
+ @from@(data1[@i@]) *
+ @from@(data2[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+ data0 += 8;
+ data1 += 8;
+ data2 += 8;
+ data_out += 8;
+ }
+
+ /* Finish off the loop */
+
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ if (count-- == 0) {
+ return;
+ }
+ data_out[@i@] = @to@(@from@(data0[@i@]) *
+ @from@(data1[@i@]) *
+ @from@(data2[@i@]) +
+ @from@(data_out[@i@]));
+/**end repeat2**/
+}
+
+#else /* @nop@ > 3 || @complex */
+
+static void
+@name@_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
+ npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_@noplabel@ (%d)\n",
+ (int)count);
+
+ while (count--) {
+#if !@complex@
+ @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(@type@ *)dataptr[i]);
+ }
+ *(@type@ *)dataptr[nop] = @to@(temp +
+ @from@(*(@type@ *)dataptr[i]));
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += sizeof(@type@);
+ }
+#else /* complex */
+# if @nop@ <= 3
+# define _SUMPROD_NOP @nop@
+# else
+# define _SUMPROD_NOP nop
+# endif
+ @temptype@ re, im, tmp;
+ int i;
+ re = ((@temptype@ *)dataptr[0])[0];
+ im = ((@temptype@ *)dataptr[0])[1];
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ tmp = re * ((@temptype@ *)dataptr[i])[0] -
+ im * ((@temptype@ *)dataptr[i])[1];
+ im = re * ((@temptype@ *)dataptr[i])[1] +
+ im * ((@temptype@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0] = re +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[0];
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1] = im +
+ ((@temptype@ *)dataptr[_SUMPROD_NOP])[1];
+
+ for (i = 0; i <= _SUMPROD_NOP; ++i) {
+ dataptr[i] += sizeof(@type@);
+ }
+# undef _SUMPROD_NOP
+#endif
+ }
+}
+
+#endif /* functions for various @nop@ */
+
+#if @nop@ == 1
+
+static void
+@name@_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if @complex@
+ @temptype@ accum_re = 0, accum_im = 0;
+ @temptype@ *data0 = (@temptype@ *)dataptr[0];
+#else
+ @temptype@ accum = 0;
+ @type@ *data0 = (@type@ *)dataptr[0];
+#endif
+
+#if EINSUM_USE_SSE1 && @float32@
+ __m128 a, accum_sse = _mm_setzero_ps();
+#elif EINSUM_USE_SSE2 && @float64@
+ __m128d a, accum_sse = _mm_setzero_pd();
+#endif
+
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_contig_outstride0_one (%d)\n",
+ (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat2
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+#if !@complex@
+ accum += @from@(data0[@i@]);
+#else /* complex */
+ accum_re += data0[2*@i@+0];
+ accum_im += data0[2*@i@+1];
+#endif
+/**end repeat2**/
+ case 0:
+#if @complex@
+ ((@temptype@ *)dataptr[1])[0] += accum_re;
+ ((@temptype@ *)dataptr[1])[1] += accum_im;
+#else
+ *((@type@ *)dataptr[1]) = @to@(accum +
+ @from@(*((@type@ *)dataptr[1])));
+#endif
+ return;
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_load_ps(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Use aligned instructions if possible */
+ if (EINSUM_IS_SSE_ALIGNED(data0)) {
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_load_pd(data0+@i@));
+/**end repeat2**/
+ data0 += 8;
+ }
+
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+ }
+#endif
+
+ /* Unroll the loop by 8 */
+ while (count >= 8) {
+ count -= 8;
+
+#if EINSUM_USE_SSE1 && @float32@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 4#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_ps(accum_sse, _mm_loadu_ps(data0+@i@));
+/**end repeat2**/
+#elif EINSUM_USE_SSE2 && @float64@
+ _mm_prefetch(data0 + 512, _MM_HINT_T0);
+
+/**begin repeat2
+ * #i = 0, 2, 4, 6#
+ */
+ /*
+ * NOTE: This accumulation changes the order, so will likely
+ * produce slightly different results.
+ */
+ accum_sse = _mm_add_pd(accum_sse, _mm_loadu_pd(data0+@i@));
+/**end repeat2**/
+#else
+/**begin repeat2
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+# if !@complex@
+ accum += @from@(data0[@i@]);
+# else /* complex */
+ accum_re += data0[2*@i@+0];
+ accum_im += data0[2*@i@+1];
+# endif
+/**end repeat2**/
+#endif
+
+#if !@complex@
+ data0 += 8;
+#else
+ data0 += 8*2;
+#endif
+ }
+
+#if EINSUM_USE_SSE1 && @float32@
+ /* Add the four SSE values and put in accum */
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(2,3,0,1));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ a = _mm_shuffle_ps(accum_sse, accum_sse, _MM_SHUFFLE(1,0,3,2));
+ accum_sse = _mm_add_ps(a, accum_sse);
+ _mm_store_ss(&accum, accum_sse);
+#elif EINSUM_USE_SSE2 && @float64@
+ /* Add the two SSE2 values and put in accum */
+ a = _mm_shuffle_pd(accum_sse, accum_sse, _MM_SHUFFLE2(0,1));
+ accum_sse = _mm_add_pd(a, accum_sse);
+ _mm_store_sd(&accum, accum_sse);
+#endif
+
+ /* Finish off the loop */
+ goto finish_after_unrolled_loop;
+}
+
+#endif /* @nop@ == 1 */
+
+static void
+@name@_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if @complex@
+ @temptype@ accum_re = 0, accum_im = 0;
+#else
+ @temptype@ accum = 0;
+#endif
+
+#if (@nop@ == 1) || (@nop@ <= 3 && !@complex@)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3) && !@complex@
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3) && !@complex@
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+
+ NPY_EINSUM_DBG_PRINT1("@name@_sum_of_products_outstride0_@noplabel@ (%d)\n",
+ (int)count);
+
+ while (count--) {
+#if !@complex@
+# if @nop@ == 1
+ accum += @from@(*(@type@ *)data0);
+ data0 += stride0;
+# elif @nop@ == 2
+ accum += @from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1);
+ data0 += stride0;
+ data1 += stride1;
+# elif @nop@ == 3
+ accum += @from@(*(@type@ *)data0) *
+ @from@(*(@type@ *)data1) *
+ @from@(*(@type@ *)data2);
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+# else
+ @temptype@ temp = @from@(*(@type@ *)dataptr[0]);
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp *= @from@(*(@type@ *)dataptr[i]);
+ }
+ accum += temp;
+ for (i = 0; i < nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+# endif
+#else /* complex */
+# if @nop@ == 1
+ accum_re += ((@temptype@ *)data0)[0];
+ accum_im += ((@temptype@ *)data0)[1];
+ data0 += stride0;
+# else
+# if @nop@ <= 3
+#define _SUMPROD_NOP @nop@
+# else
+#define _SUMPROD_NOP nop
+# endif
+ @temptype@ re, im, tmp;
+ int i;
+ re = ((@temptype@ *)dataptr[0])[0];
+ im = ((@temptype@ *)dataptr[0])[1];
+ for (i = 1; i < _SUMPROD_NOP; ++i) {
+ tmp = re * ((@temptype@ *)dataptr[i])[0] -
+ im * ((@temptype@ *)dataptr[i])[1];
+ im = re * ((@temptype@ *)dataptr[i])[1] +
+ im * ((@temptype@ *)dataptr[i])[0];
+ re = tmp;
+ }
+ accum_re += re;
+ accum_im += im;
+ for (i = 0; i < _SUMPROD_NOP; ++i) {
+ dataptr[i] += strides[i];
+ }
+#undef _SUMPROD_NOP
+# endif
+#endif
+ }
+
+#if @complex@
+# if @nop@ <= 3
+ ((@temptype@ *)dataptr[@nop@])[0] += accum_re;
+ ((@temptype@ *)dataptr[@nop@])[1] += accum_im;
+# else
+ ((@temptype@ *)dataptr[nop])[0] += accum_re;
+ ((@temptype@ *)dataptr[nop])[1] += accum_im;
+# endif
+#else
+# if @nop@ <= 3
+ *((@type@ *)dataptr[@nop@]) = @to@(accum +
+ @from@(*((@type@ *)dataptr[@nop@])));
+# else
+ *((@type@ *)dataptr[nop]) = @to@(accum +
+ @from@(*((@type@ *)dataptr[nop])));
+# endif
+#endif
+
+}
+
+/**end repeat1**/
+
+/**end repeat**/
+
+
+/* Do OR of ANDs for the boolean type */
+
+/**begin repeat
+ * #nop = 1, 2, 3, 1000#
+ * #noplabel = one, two, three, any#
+ */
+
+static void
+bool_sum_of_products_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if (@nop@ <= 3)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3)
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+#if (@nop@ <= 3)
+ char *data_out = dataptr[@nop@];
+ npy_intp stride_out = strides[@nop@];
+#endif
+
+ while (count--) {
+#if @nop@ == 1
+ *(npy_bool *)data_out = *(npy_bool *)data0 ||
+ *(npy_bool *)data_out;
+ data0 += stride0;
+ data_out += stride_out;
+#elif @nop@ == 2
+ *(npy_bool *)data_out = (*(npy_bool *)data0 &&
+ *(npy_bool *)data1) ||
+ *(npy_bool *)data_out;
+ data0 += stride0;
+ data1 += stride1;
+ data_out += stride_out;
+#elif @nop@ == 3
+ *(npy_bool *)data_out = (*(npy_bool *)data0 &&
+ *(npy_bool *)data1 &&
+ *(npy_bool *)data2) ||
+ *(npy_bool *)data_out;
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+ data_out += stride_out;
+#else
+ npy_bool temp = *(npy_bool *)dataptr[0];
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp = temp && *(npy_bool *)dataptr[i];
+ }
+ *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+#endif
+ }
+}
+
+static void
+bool_sum_of_products_contig_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+#if (@nop@ <= 3)
+ char *data0 = dataptr[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+ char *data1 = dataptr[1];
+#endif
+#if (@nop@ == 3)
+ char *data2 = dataptr[2];
+#endif
+#if (@nop@ <= 3)
+ char *data_out = dataptr[@nop@];
+#endif
+
+#if (@nop@ <= 3)
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+ switch (count) {
+/**begin repeat1
+ * #i = 6, 5, 4, 3, 2, 1, 0#
+ */
+ case @i@+1:
+# if @nop@ == 1
+ ((npy_bool *)data_out)[@i@] = ((npy_bool *)data0)[@i@] ||
+ ((npy_bool *)data_out)[@i@];
+# elif @nop@ == 2
+ ((npy_bool *)data_out)[@i@] =
+ (((npy_bool *)data0)[@i@] &&
+ ((npy_bool *)data1)[@i@]) ||
+ ((npy_bool *)data_out)[@i@];
+# elif @nop@ == 3
+ ((npy_bool *)data_out)[@i@] =
+ (((npy_bool *)data0)[@i@] &&
+ ((npy_bool *)data1)[@i@] &&
+ ((npy_bool *)data2)[@i@]) ||
+ ((npy_bool *)data_out)[@i@];
+# endif
+/**end repeat1**/
+ case 0:
+ return;
+ }
+#endif
+
+/* Unroll the loop by 8 for fixed-size nop */
+#if (@nop@ <= 3)
+ while (count >= 8) {
+ count -= 8;
+#else
+ while (count--) {
+#endif
+
+# if @nop@ == 1
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ *((npy_bool *)data_out + @i@) = (*((npy_bool *)data0 + @i@)) ||
+ (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+ data0 += 8*sizeof(npy_bool);
+ data_out += 8*sizeof(npy_bool);
+# elif @nop@ == 2
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ *((npy_bool *)data_out + @i@) =
+ ((*((npy_bool *)data0 + @i@)) &&
+ (*((npy_bool *)data1 + @i@))) ||
+ (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+ data0 += 8*sizeof(npy_bool);
+ data1 += 8*sizeof(npy_bool);
+ data_out += 8*sizeof(npy_bool);
+# elif @nop@ == 3
+/**begin repeat1
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+ *((npy_bool *)data_out + @i@) =
+ ((*((npy_bool *)data0 + @i@)) &&
+ (*((npy_bool *)data1 + @i@)) &&
+ (*((npy_bool *)data2 + @i@))) ||
+ (*((npy_bool *)data_out + @i@));
+/**end repeat1**/
+ data0 += 8*sizeof(npy_bool);
+ data1 += 8*sizeof(npy_bool);
+ data2 += 8*sizeof(npy_bool);
+ data_out += 8*sizeof(npy_bool);
+# else
+ npy_bool temp = *(npy_bool *)dataptr[0];
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp = temp && *(npy_bool *)dataptr[i];
+ }
+ *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += sizeof(npy_bool);
+ }
+# endif
+ }
+
+ /* If the loop was unrolled, we need to finish it off */
+#if (@nop@ <= 3)
+ goto finish_after_unrolled_loop;
+#endif
+}
+
+static void
+bool_sum_of_products_outstride0_@noplabel@(int nop, char **dataptr,
+ npy_intp const *strides, npy_intp count)
+{
+ npy_bool accum = 0;
+
+#if (@nop@ <= 3)
+ char *data0 = dataptr[0];
+ npy_intp stride0 = strides[0];
+#endif
+#if (@nop@ == 2 || @nop@ == 3)
+ char *data1 = dataptr[1];
+ npy_intp stride1 = strides[1];
+#endif
+#if (@nop@ == 3)
+ char *data2 = dataptr[2];
+ npy_intp stride2 = strides[2];
+#endif
+
+ while (count--) {
+#if @nop@ == 1
+ accum = *(npy_bool *)data0 || accum;
+ data0 += stride0;
+#elif @nop@ == 2
+ accum = (*(npy_bool *)data0 && *(npy_bool *)data1) || accum;
+ data0 += stride0;
+ data1 += stride1;
+#elif @nop@ == 3
+ accum = (*(npy_bool *)data0 &&
+ *(npy_bool *)data1 &&
+ *(npy_bool *)data2) || accum;
+ data0 += stride0;
+ data1 += stride1;
+ data2 += stride2;
+#else
+ npy_bool temp = *(npy_bool *)dataptr[0];
+ int i;
+ for (i = 1; i < nop; ++i) {
+ temp = temp && *(npy_bool *)dataptr[i];
+ }
+ accum = temp || accum;
+ for (i = 0; i <= nop; ++i) {
+ dataptr[i] += strides[i];
+ }
+#endif
+ }
+
+# if @nop@ <= 3
+ *((npy_bool *)dataptr[@nop@]) = accum || *((npy_bool *)dataptr[@nop@]);
+# else
+ *((npy_bool *)dataptr[nop]) = accum || *((npy_bool *)dataptr[nop]);
+# endif
+}
+
+/**end repeat**/
+
+/* These tables need to match up with the type enum */
+static sum_of_products_fn
+_contig_outstride0_unary_specialization_table[NPY_NTYPES] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 0,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+ &@name@_sum_of_products_contig_outstride0_one,
+#else
+ NULL,
+#endif
+/**end repeat**/
+}; /* End of _contig_outstride0_unary_specialization_table */
+
+static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 0,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_stride0_contig_outstride0_two,
+ &@name@_sum_of_products_stride0_contig_outcontig_two,
+ &@name@_sum_of_products_contig_stride0_outstride0_two,
+ &@name@_sum_of_products_contig_stride0_outcontig_two,
+ &@name@_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+ {NULL, NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _binary_specialization_table */
+
+static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_outstride0_any,
+ &@name@_sum_of_products_outstride0_one,
+ &@name@_sum_of_products_outstride0_two,
+ &@name@_sum_of_products_outstride0_three
+},
+#else
+ {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _outstride0_specialized_table */
+
+static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_contig_any,
+ &@name@_sum_of_products_contig_one,
+ &@name@_sum_of_products_contig_two,
+ &@name@_sum_of_products_contig_three
+},
+#else
+ {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _allcontig_specialized_table */
+
+static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = {
+/**begin repeat
+ * #name = bool,
+ * byte, ubyte,
+ * short, ushort,
+ * int, uint,
+ * long, ulong,
+ * longlong, ulonglong,
+ * float, double, longdouble,
+ * cfloat, cdouble, clongdouble,
+ * object, string, unicode, void,
+ * datetime, timedelta, half#
+ * #use = 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1,
+ * 1, 1, 1,
+ * 1, 1, 1,
+ * 0, 0, 0, 0,
+ * 0, 0, 1#
+ */
+#if @use@
+{
+ &@name@_sum_of_products_any,
+ &@name@_sum_of_products_one,
+ &@name@_sum_of_products_two,
+ &@name@_sum_of_products_three
+},
+#else
+ {NULL, NULL, NULL, NULL},
+#endif
+/**end repeat**/
+}; /* End of _unnspecialized_table */
+
+NPY_VISIBILITY_HIDDEN sum_of_products_fn
+get_sum_of_products_function(int nop, int type_num,
+ npy_intp itemsize, npy_intp const *fixed_strides)
+{
+ int iop;
+
+ if (type_num >= NPY_NTYPES) {
+ return NULL;
+ }
+
+ /* contiguous reduction */
+ if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) {
+ sum_of_products_fn ret =
+ _contig_outstride0_unary_specialization_table[type_num];
+ if (ret != NULL) {
+ return ret;
+ }
+ }
+
+ /* nop of 2 has more specializations */
+ if (nop == 2) {
+ /* Encode the zero/contiguous strides */
+ int code;
+ code = (fixed_strides[0] == 0) ? 0 :
+ (fixed_strides[0] == itemsize) ? 2*2*1 : 8;
+ code += (fixed_strides[1] == 0) ? 0 :
+ (fixed_strides[1] == itemsize) ? 2*1 : 8;
+ code += (fixed_strides[2] == 0) ? 0 :
+ (fixed_strides[2] == itemsize) ? 1 : 8;
+ if (code >= 2 && code < 7) {
+ sum_of_products_fn ret =
+ _binary_specialization_table[type_num][code-2];
+ if (ret != NULL) {
+ return ret;
+ }
+ }
+ }
+
+ /* Inner loop with an output stride of 0 */
+ if (fixed_strides[nop] == 0) {
+ return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0];
+ }
+
+ /* Check for all contiguous */
+ for (iop = 0; iop < nop + 1; ++iop) {
+ if (fixed_strides[iop] != itemsize) {
+ break;
+ }
+ }
+
+ /* Contiguous loop */
+ if (iop == nop + 1) {
+ return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0];
+ }
+
+ /* None of the above specializations caught it, general loops */
+ return _unspecialized_table[type_num][nop <= 3 ? nop : 0];
+}
diff --git a/numpy/core/src/multiarray/einsum_sumprod.h b/numpy/core/src/multiarray/einsum_sumprod.h
new file mode 100644
index 000000000..c6cf18ec6
--- /dev/null
+++ b/numpy/core/src/multiarray/einsum_sumprod.h
@@ -0,0 +1,12 @@
+#ifndef _NPY_MULTIARRAY_EINSUM_SUMPROD_H
+#define _NPY_MULTIARRAY_EINSUM_SUMPROD_H
+
+#include <numpy/npy_common.h>
+
+typedef void (*sum_of_products_fn)(int, char **, npy_intp const*, npy_intp);
+
+NPY_VISIBILITY_HIDDEN sum_of_products_fn
+get_sum_of_products_function(int nop, int type_num,
+ npy_intp itemsize, npy_intp const *fixed_strides);
+
+#endif