diff options
author | victorlei <victorlei@138bc75d-0d04-0410-961f-82ee72b054a4> | 2004-11-18 08:45:11 +0000 |
---|---|---|
committer | victorlei <victorlei@138bc75d-0d04-0410-961f-82ee72b054a4> | 2004-11-18 08:45:11 +0000 |
commit | 4e867f903aef0e97125606c0a6f5b0575ef5a45f (patch) | |
tree | 1bf92b09fa37314da9efc3b1c11b7617fadc2eba /libgfortran/generated/matmul_c4.c | |
parent | d1982d82eb67a4ab81876b6e83479ed9158aeef4 (diff) | |
download | gcc-4e867f903aef0e97125606c0a6f5b0575ef5a45f.tar.gz |
Modified Files:
ChangeLog generated/matmul_c4.c generated/matmul_c8.c
generated/matmul_i4.c generated/matmul_i8.c
generated/matmul_r4.c generated/matmul_r8.c m4/matmul.m4
2004-11-18 Victor Leikehman <lei@il.ibm.com>
* m4/matmul.m4: Loops reordered to improve cache behavior.
* generated/matmul_??.c: Regenerated.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@90853 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libgfortran/generated/matmul_c4.c')
-rw-r--r-- | libgfortran/generated/matmul_c4.c | 154 |
1 files changed, 94 insertions, 60 deletions
diff --git a/libgfortran/generated/matmul_c4.c b/libgfortran/generated/matmul_c4.c index 7967e970646..fd265d8b5b1 100644 --- a/libgfortran/generated/matmul_c4.c +++ b/libgfortran/generated/matmul_c4.c @@ -21,37 +21,46 @@ Boston, MA 02111-1307, USA. */ #include "config.h" #include <stdlib.h> +#include <string.h> #include <assert.h> #include "libgfortran.h" -/* Dimensions: retarray(x,y) a(x, count) b(count,y). - Either a or b can be rank 1. In this case x or y is 1. */ +/* This is a C version of the following fortran pseudo-code. The key + point is the loop order -- we access all arrays column-first, which + improves the performance enough to boost galgel spec score by 50%. + + DIMENSION A(M,COUNT), B(COUNT,N), C(M,N) + C = 0 + DO J=1,N + DO K=1,COUNT + DO I=1,M + C(I,J) = C(I,J)+A(I,K)*B(K,J) +*/ + void __matmul_c4 (gfc_array_c4 * retarray, gfc_array_c4 * a, gfc_array_c4 * b) { GFC_COMPLEX_4 *abase; GFC_COMPLEX_4 *bbase; GFC_COMPLEX_4 *dest; - GFC_COMPLEX_4 res; - index_type rxstride; - index_type rystride; - index_type xcount; - index_type ycount; - index_type xstride; - index_type ystride; - index_type x; - index_type y; - - GFC_COMPLEX_4 *pa; - GFC_COMPLEX_4 *pb; - index_type astride; - index_type bstride; - index_type count; - index_type n; + + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; assert (GFC_DESCRIPTOR_RANK (a) == 2 || GFC_DESCRIPTOR_RANK (b) == 2); +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. + */ + if (retarray->data == NULL) { if (GFC_DESCRIPTOR_RANK (a) == 1) @@ -95,8 +104,10 @@ __matmul_c4 (gfc_array_c4 * retarray, gfc_array_c4 * a, gfc_array_c4 * b) if (GFC_DESCRIPTOR_RANK (retarray) == 1) { - rxstride = retarray->dim[0].stride; - rystride = rxstride; + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = retarray->dim[0].stride; } else { @@ -104,65 +115,88 @@ __matmul_c4 (gfc_array_c4 * retarray, gfc_array_c4 * a, gfc_array_c4 * b) rystride = retarray->dim[1].stride; } - /* If we have rank 1 parameters, zero the absent stride, and set the size to - one. */ + if (GFC_DESCRIPTOR_RANK (a) == 1) { - astride = a->dim[0].stride; - count = a->dim[0].ubound + 1 - a->dim[0].lbound; - xstride = 0; - rxstride = 0; + /* Treat it as a a row matrix A[1,count]. */ + axstride = a->dim[0].stride; + aystride = 1; + xcount = 1; + count = a->dim[0].ubound + 1 - a->dim[0].lbound; } else { - astride = a->dim[1].stride; + axstride = a->dim[0].stride; + aystride = a->dim[1].stride; + count = a->dim[1].ubound + 1 - a->dim[1].lbound; - xstride = a->dim[0].stride; xcount = a->dim[0].ubound + 1 - a->dim[0].lbound; } + + assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); + if (GFC_DESCRIPTOR_RANK (b) == 1) { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = 0; - rystride = 0; + /* Treat it as a column matrix B[count,1] */ + bxstride = b->dim[0].stride; + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; ycount = 1; } else { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = b->dim[1].stride; + bxstride = b->dim[0].stride; + bystride = b->dim[1].stride; ycount = b->dim[1].ubound + 1 - b->dim[1].lbound; } - for (y = 0; y < ycount; y++) + assert (a->base == 0); + assert (b->base == 0); + assert (retarray->base == 0); + + abase = a->data; + bbase = b->data; + dest = retarray->data; + + if (rxstride == 1 && axstride == 1 && bxstride == 1) { - for (x = 0; x < xcount; x++) - { - /* Do the summation for this element. For real and integer types - this is the same as DOT_PRODUCT. For complex types we use do - a*b, not conjg(a)*b. */ - pa = abase; - pb = bbase; - res = 0; - - for (n = 0; n < count; n++) - { - res += *pa * *pb; - pa += astride; - pb += bstride; - } - - *dest = res; - - dest += rxstride; - abase += xstride; - } - abase -= xstride * xcount; - bbase += ystride; - dest += rystride - (rxstride * xcount); + GFC_COMPLEX_4 *bbase_y; + GFC_COMPLEX_4 *dest_y; + GFC_COMPLEX_4 *abase_n; + GFC_COMPLEX_4 bbase_yn; + + memset (dest, 0, (sizeof (GFC_COMPLEX_4) * size0(retarray))); + + for (y = 0; y < ycount; y++) + { + bbase_y = bbase + y*bystride; + dest_y = dest + y*rystride; + for (n = 0; n < count; n++) + { + abase_n = abase + n*aystride; + bbase_yn = bbase_y[n]; + for (x = 0; x < xcount; x++) + { + dest_y[x] += abase_n[x] * bbase_yn; + } + } + } + } + else + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0; + + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride]; } } |