2 files changed, 31 insertions, 12 deletions
diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py
index 3d26b800c..a65d510be 100644
--- a/benchmarks/benchmarks/bench_linalg.py
+++ b/benchmarks/benchmarks/bench_linalg.py
@@ -18,12 +18,12 @@ class Eindot(Benchmark):
         self.a3 = np.arange(480000.).reshape(60, 80, 100)
         self.b3 = np.arange(192000.).reshape(80, 60, 40)
 
-    def time_einsum_ij_jk_a_b(self):
-        np.einsum('ij,jk', self.a, self.b)
-
     def time_dot_a_b(self):
         np.dot(self.a, self.b)
 
+    def time_dot_d_dot_b_c(self):
+        np.dot(self.d, np.dot(self.b, self.c))
+
     def time_dot_trans_a_at(self):
         np.dot(self.a, self.at)
 
@@ -36,20 +36,38 @@ class Eindot(Benchmark):
     def time_dot_trans_atc_a(self):
         np.dot(self.atc, self.a)
 
+    def time_einsum_i_ij_j(self):
+        np.einsum('i,ij,j', self.d, self.b, self.c)
+
+    def time_einsum_ij_jk_a_b(self):
+        np.einsum('ij,jk', self.a, self.b)
+
+    def time_einsum_ijk_jil_kl(self):
+        np.einsum('ijk,jil->kl', self.a3, self.b3)
+
     def time_inner_trans_a_a(self):
         np.inner(self.a, self.a)
 
     def time_inner_trans_a_ac(self):
         np.inner(self.a, self.ac)
 
-    def time_einsum_i_ij_j(self):
-        np.einsum('i,ij,j', self.d, self.b, self.c)
+    def time_matmul_a_b(self):
+        np.matmul(self.a, self.b)
 
-    def time_dot_d_dot_b_c(self):
-        np.dot(self.d, np.dot(self.b, self.c))
+    def time_matmul_d_matmul_b_c(self):
+        np.matmul(self.d, np.matmul(self.b, self.c))
 
-    def time_einsum_ijk_jil_kl(self):
-        np.einsum('ijk,jil->kl', self.a3, self.b3)
+    def time_matmul_trans_a_at(self):
+        np.matmul(self.a, self.at)
+
+    def time_matmul_trans_a_atc(self):
+        np.matmul(self.a, self.atc)
+
+    def time_matmul_trans_at_a(self):
+        np.matmul(self.at, self.a)
+
+    def time_matmul_trans_atc_a(self):
+        np.matmul(self.atc, self.a)
 
     def time_tensordot_a_b_axes_1_0_0_1(self):
         np.tensordot(self.a3, self.b3, axes=([1, 0], [0, 1]))
diff --git a/doc/release/1.11.0-notes.rst b/doc/release/1.11.0-notes.rst
index 16af02440..c4ff89230 100644
--- a/doc/release/1.11.0-notes.rst
+++ b/doc/release/1.11.0-notes.rst
@@ -149,11 +149,12 @@ useless computations when printing a masked array.
 The function now uses the fallocate system call to reserve sufficient
 diskspace on filesystems that support it.
 
-``np.dot`` optimized for operations of the form ``A.T @ A`` and ``A @ A.T``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Optimizations for operations of the form ``A.T @ A`` and ``A @ A.T``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Previously, ``gemm`` BLAS operations were used for all matrix products. Now,
 if the matrix product is between a matrix and its transpose, it will use
-``syrk`` BLAS operations for a performance boost.
+``syrk`` BLAS operations for a performance boost. This optimization applies to
+the ``@`` operator, ``numpy.dot``, ``numpy.inner``, and ``numpy.matmul``.
 
 **Note:** Requires the transposed and non-transposed matrices to share data.