summaryrefslogtreecommitdiff
path: root/libgfortran/generated/matmul_c4.c
diff options
context:
space:
mode:
Diffstat (limited to 'libgfortran/generated/matmul_c4.c')
-rw-r--r--libgfortran/generated/matmul_c4.c71
1 files changed, 63 insertions, 8 deletions
diff --git a/libgfortran/generated/matmul_c4.c b/libgfortran/generated/matmul_c4.c
index 5e59f1dafdc..a01de37bc74 100644
--- a/libgfortran/generated/matmul_c4.c
+++ b/libgfortran/generated/matmul_c4.c
@@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
#if defined (HAVE_GFC_COMPLEX_4)
-/* This is a C version of the following fortran pseudo-code. The key
- point is the loop order -- we access all arrays column-first, which
- improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+ multiplication C=MATMUL(A,B), and in the frequent special case where
+ the argument A is the temporary result of a TRANSPOSE intrinsic:
+ C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
+ looking at their strides.
+
+ The equivalent Fortran pseudo-code is:
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
- C = 0
- DO J=1,N
- DO K=1,COUNT
+ IF (.NOT.IS_TRANSPOSED(A)) THEN
+ C = 0
+ DO J=1,N
+ DO K=1,COUNT
+ DO I=1,M
+ C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ ELSE
+ DO J=1,N
DO I=1,M
- C(I,J) = C(I,J)+A(I,K)*B(K,J)
+ S = 0
+ DO K=1,COUNT
+ S = S+A(I,K)+B(K,J)
+ C(I,J) = S
+ ENDIF
*/
extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
@@ -204,7 +217,28 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
}
}
}
- else
+ else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+ {
+ const GFC_COMPLEX_4 *restrict abase_x;
+ const GFC_COMPLEX_4 *restrict bbase_y;
+ GFC_COMPLEX_4 *restrict dest_y;
+ GFC_COMPLEX_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n] * bbase_y[n];
+ dest_y[x] = s;
+ }
+ }
+ }
+ else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
@@ -216,6 +250,27 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
}
+ else
+ {
+ const GFC_COMPLEX_4 *restrict abase_x;
+ const GFC_COMPLEX_4 *restrict bbase_y;
+ GFC_COMPLEX_4 *restrict dest_y;
+ GFC_COMPLEX_4 s;
+
+ for (y = 0; y < ycount; y++)
+ {
+ bbase_y = &bbase[y*bystride];
+ dest_y = &dest[y*rystride];
+ for (x = 0; x < xcount; x++)
+ {
+ abase_x = &abase[x*axstride];
+ s = (GFC_COMPLEX_4) 0;
+ for (n = 0; n < count; n++)
+ s += abase_x[n*aystride] * bbase_y[n*bxstride];
+ dest_y[x*rxstride] = s;
+ }
+ }
+ }
}
#endif