summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authortkoenig <tkoenig@138bc75d-0d04-0410-961f-82ee72b054a4>2015-05-06 20:23:48 +0000
committertkoenig <tkoenig@138bc75d-0d04-0410-961f-82ee72b054a4>2015-05-06 20:23:48 +0000
commit08f351fd0b0f62e35a2f12a9e5b9613c9694a269 (patch)
treebcbdcbcffefce1c189dafd0e4b29a080eefe1575
parent8120405bda9bf222228b004e09b23e74eda57ae1 (diff)
downloadgcc-08f351fd0b0f62e35a2f12a9e5b9613c9694a269.tar.gz
2015-05-06 Thomas Koenig <tkoenig@gcc.gnu.org>
PR fortran/37131 * gfortran.h (gfc_isym_id): Add GFC_ISYM_FE_RUNTIME_ERROR. (gfc_intrinsic_sym): Add vararg. * intrinsic.h (gfc_check_fe_runtime_error): Add prototype. (gfc_resolve_re_runtime_error): Likewise. Add prototype for gfc_is_reallocatable_lhs. * trans-array.h (gfc_is_reallocatable_lhs): Remove prototype. * check.c (gfc_check_fe_runtime_error): New function. * intrinsic.c (add_sym_1p): New function. (make_vararg): New function. (add_subroutines): Add fe_runtime_error. (gfc_intrinsic_sub_interface): Skip sorting for variable number of arguments. * iresolve.c (gfc_resolve_fe_runtime_error): New function. * lang.opt (inline-matmul-limit): New option. (gfc_post_options): If no inline matmul limit has been set and BLAS is called externally, use the BLAS limit. * frontend-passes.c: Include intrinsic.h. (var_num): New global counter for naming temporary variablbles. (matrix_case): Enum for differentiating the different matmul cases. (realloc_string_callback): Add "trim" to the variable name. (create_var): Add optional argument vname as part of the name. Use var_num. Set dimension of result correctly. Split off block creation into (insert_block): New function. (cfe_expr_0): Use "fcn" as part of temporary variable name. (optimize_namesapce): Also set gfc_current_ns. Call inline_matmul_assign. (combine_array_constructor): Use "constr" as part of temporary name. (get_array_inq_function): New function. (build_logical_expr): New function. (get_operand): new function. (inline_limit_check): New function. (runtime_error_ne): New function. (matmul_lhs_realloc): New function. (is_functino_or_op): New function. (has_function_or_op): New function. (freeze_expr): New function. (freeze_references): New function. (convert_to_index_kind): New function. (create_do_loop): New function. (get_size_m1): New function. (scalarized_expr): New function. (inline_matmul_assign): New function. * simplify.c (simplify_bound): Simplify the case of the lower bound of an assumed-shape argument. 2015-05-06 Thomas Koenig <tkoenig@gcc.gnu.org> PR fortran/37131 * gfortran.dg/dependency_26.f90: Add option to suppress inlining matmul. * gfortran.dg/function_optimize_1.f90: Likewise. * gfortran.dg/function_optimize_2.f90: Likewise. * gfortran.dg/function_optimize_5.f90: Likewise. * gfortran.dg/function_optimize_7.f90: Likewise. * gfortran.dg/inline_matmul_1.f90: New test. * gfortran.dg/inline_matmul_2.f90: New test. * gfortran.dg/inline_matmul_3.f90: New test. * gfortran.dg/inline_matmul_4.f90: New test. * gfortran.dg/inline_matmul_5.f90: New test. * gfortran.dg/inline_matmul_6.f90: New test. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@222864 138bc75d-0d04-0410-961f-82ee72b054a4
-rw-r--r--gcc/fortran/check.c30
-rw-r--r--gcc/fortran/frontend-passes.c1135
-rw-r--r--gcc/fortran/gfortran.h8
-rw-r--r--gcc/fortran/intrinsic.c47
-rw-r--r--gcc/fortran/intrinsic.h2
-rw-r--r--gcc/fortran/invoke.texi28
-rw-r--r--gcc/fortran/iresolve.c13
-rw-r--r--gcc/fortran/lang.opt4
-rw-r--r--gcc/fortran/options.c5
-rw-r--r--gcc/fortran/trans-array.h2
-rw-r--r--gcc/testsuite/gfortran.dg/dependency_26.f902
-rw-r--r--gcc/testsuite/gfortran.dg/function_optimize_1.f902
-rw-r--r--gcc/testsuite/gfortran.dg/function_optimize_2.f902
-rw-r--r--gcc/testsuite/gfortran.dg/function_optimize_5.f902
-rw-r--r--gcc/testsuite/gfortran.dg/function_optimize_7.f902
-rw-r--r--gcc/testsuite/gfortran.dg/inline_matmul_1.f90152
-rw-r--r--gcc/testsuite/gfortran.dg/inline_matmul_2.f9065
-rw-r--r--gcc/testsuite/gfortran.dg/inline_matmul_3.f9065
-rw-r--r--gcc/testsuite/gfortran.dg/inline_matmul_4.f9068
-rw-r--r--gcc/testsuite/gfortran.dg/inline_matmul_5.f9013
-rw-r--r--gcc/testsuite/gfortran.dg/inline_matmul_6.f9049
21 files changed, 1652 insertions, 44 deletions
diff --git a/gcc/fortran/check.c b/gcc/fortran/check.c
index cdb5ff1cba6..3e0ce5c8624 100644
--- a/gcc/fortran/check.c
+++ b/gcc/fortran/check.c
@@ -5527,6 +5527,36 @@ gfc_check_random_seed (gfc_expr *size, gfc_expr *put, gfc_expr *get)
return true;
}
+bool
+gfc_check_fe_runtime_error (gfc_actual_arglist *a)
+{
+ gfc_expr *e;
+ int len, i;
+ int num_percent, nargs;
+
+ e = a->expr;
+ if (e->expr_type != EXPR_CONSTANT)
+ return true;
+
+ len = e->value.character.length;
+ if (e->value.character.string[len-1] != '\0')
+ gfc_internal_error ("fe_runtime_error string must be null terminated");
+
+ num_percent = 0;
+ for (i=0; i<len-1; i++)
+ if (e->value.character.string[i] == '%')
+ num_percent ++;
+
+ nargs = 0;
+ for (; a; a = a->next)
+ nargs ++;
+
+ if (nargs -1 != num_percent)
+ gfc_internal_error ("fe_runtime_error: Wrong number of arguments (%d instead of %d)",
+ nargs, num_percent++);
+
+ return true;
+}
bool
gfc_check_second_sub (gfc_expr *time)
diff --git a/gcc/fortran/frontend-passes.c b/gcc/fortran/frontend-passes.c
index 446ef196e2c..62d10634a8e 100644
--- a/gcc/fortran/frontend-passes.c
+++ b/gcc/fortran/frontend-passes.c
@@ -27,6 +27,7 @@ along with GCC; see the file COPYING3. If not see
#include "dependency.h"
#include "constructor.h"
#include "opts.h"
+#include "intrinsic.h"
/* Forward declarations. */
@@ -43,7 +44,11 @@ static void doloop_warn (gfc_namespace *);
static void optimize_reduction (gfc_namespace *);
static int callback_reduction (gfc_expr **, int *, void *);
static void realloc_strings (gfc_namespace *);
-static gfc_expr *create_var (gfc_expr *);
+static gfc_expr *create_var (gfc_expr *, const char *vname=NULL);
+static int inline_matmul_assign (gfc_code **, int *, void *);
+static gfc_code * create_do_loop (gfc_expr *, gfc_expr *, gfc_expr *,
+ locus *, gfc_namespace *,
+ char *vname=NULL);
/* How deep we are inside an argument list. */
@@ -93,6 +98,19 @@ struct my_struct *evec;
static bool in_assoc_list;
+/* Counter for temporary variables. */
+
+static int var_num = 1;
+
+/* What sort of matrix we are dealing with when inlining MATMUL. */
+
+enum matrix_case { none=0, A2B2, A2B1, A1B2 };
+
+/* Keep track of the number of expressions we have inserted so far
+ using create_var. */
+
+int n_vars;
+
/* Entry point - run all passes for a namespace. */
void
@@ -157,7 +175,7 @@ realloc_string_callback (gfc_code **c, int *walk_subtrees,
return 0;
current_code = c;
- n = create_var (expr2);
+ n = create_var (expr2, "trim");
co->expr2 = n;
return 0;
}
@@ -524,28 +542,13 @@ constant_string_length (gfc_expr *e)
}
-/* Returns a new expression (a variable) to be used in place of the old one,
- with an assignment statement before the current statement to set
- the value of the variable. Creates a new BLOCK for the statement if
- that hasn't already been done and puts the statement, plus the
- newly created variables, in that block. Special cases: If the
- expression is constant or a temporary which has already
- been created, just copy it. */
+/* Insert a block at the current position unless it has already
+ been inserted; in this case use the one already there. */
-static gfc_expr*
-create_var (gfc_expr * e)
+static gfc_namespace*
+insert_block ()
{
- char name[GFC_MAX_SYMBOL_LEN +1];
- static int num = 1;
- gfc_symtree *symtree;
- gfc_symbol *symbol;
- gfc_expr *result;
- gfc_code *n;
gfc_namespace *ns;
- int i;
-
- if (e->expr_type == EXPR_CONSTANT || is_fe_temp (e))
- return gfc_copy_expr (e);
/* If the block hasn't already been created, do so. */
if (inserted_block == NULL)
@@ -578,7 +581,37 @@ create_var (gfc_expr * e)
else
ns = inserted_block->ext.block.ns;
- sprintf(name, "__var_%d",num++);
+ return ns;
+}
+
+/* Returns a new expression (a variable) to be used in place of the old one,
+ with an optional assignment statement before the current statement to set
+ the value of the variable. Creates a new BLOCK for the statement if that
+ hasn't already been done and puts the statement, plus the newly created
+ variables, in that block. Special cases: If the expression is constant or
+ a temporary which has already been created, just copy it. */
+
+static gfc_expr*
+create_var (gfc_expr * e, const char *vname)
+{
+ char name[GFC_MAX_SYMBOL_LEN +1];
+ gfc_symtree *symtree;
+ gfc_symbol *symbol;
+ gfc_expr *result;
+ gfc_code *n;
+ gfc_namespace *ns;
+ int i;
+
+ if (e->expr_type == EXPR_CONSTANT || is_fe_temp (e))
+ return gfc_copy_expr (e);
+
+ ns = insert_block ();
+
+ if (vname)
+ snprintf (name, GFC_MAX_SYMBOL_LEN, "__var_%d_%s", var_num++, vname);
+ else
+ snprintf (name, GFC_MAX_SYMBOL_LEN, "__var_%d", var_num++);
+
if (gfc_get_sym_tree (name, ns, &symtree, false) != 0)
gcc_unreachable ();
@@ -651,6 +684,7 @@ create_var (gfc_expr * e)
result->ref->type = REF_ARRAY;
result->ref->u.ar.type = AR_FULL;
result->ref->u.ar.where = e->where;
+ result->ref->u.ar.dimen = e->rank;
result->ref->u.ar.as = symbol->ts.type == BT_CLASS
? CLASS_DATA (symbol)->as : symbol->as;
if (warn_array_temporaries)
@@ -666,6 +700,7 @@ create_var (gfc_expr * e)
n->expr1 = gfc_copy_expr (result);
n->expr2 = e;
*changed_statement = n;
+ n_vars ++;
return result;
}
@@ -724,7 +759,7 @@ cfe_expr_0 (gfc_expr **e, int *walk_subtrees,
if (gfc_dep_compare_functions (*ei, *ej, true) == 0)
{
if (newvar == NULL)
- newvar = create_var (*ei);
+ newvar = create_var (*ei, "fcn");
if (warn_function_elimination)
do_warn_function_elimination (*ej);
@@ -931,13 +966,15 @@ convert_elseif (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
/* Don't walk subtrees. */
return 0;
}
+
/* Optimize a namespace, including all contained namespaces. */
static void
optimize_namespace (gfc_namespace *ns)
{
-
+ gfc_namespace *saved_ns = gfc_current_ns;
current_ns = ns;
+ gfc_current_ns = ns;
forall_level = 0;
iterator_level = 0;
in_assoc_list = false;
@@ -947,6 +984,9 @@ optimize_namespace (gfc_namespace *ns)
gfc_code_walker (&ns->code, convert_elseif, dummy_expr_callback, NULL);
gfc_code_walker (&ns->code, cfe_code, cfe_expr_0, NULL);
gfc_code_walker (&ns->code, optimize_code, optimize_expr, NULL);
+ if (flag_inline_matmul_limit != 0)
+ gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback,
+ NULL);
/* BLOCKs are handled in the expression walker below. */
for (ns = ns->contained; ns; ns = ns->sibling)
@@ -954,6 +994,7 @@ optimize_namespace (gfc_namespace *ns)
if (ns->code == NULL || ns->code->op != EXEC_BLOCK)
optimize_namespace (ns);
}
+ gfc_current_ns = saved_ns;
}
/* Handle dependencies for allocatable strings which potentially redefine
@@ -968,10 +1009,7 @@ realloc_strings (gfc_namespace *ns)
for (ns = ns->contained; ns; ns = ns->sibling)
{
if (ns->code == NULL || ns->code->op != EXEC_BLOCK)
- {
- // current_ns = ns;
- realloc_strings (ns);
- }
+ realloc_strings (ns);
}
}
@@ -1222,7 +1260,7 @@ combine_array_constructor (gfc_expr *e)
if (op2->ts.type == BT_CHARACTER)
return false;
- scalar = create_var (gfc_copy_expr (op2));
+ scalar = create_var (gfc_copy_expr (op2), "constr");
oldbase = op1->value.constructor;
newbase = NULL;
@@ -1939,6 +1977,1045 @@ doloop_warn (gfc_namespace *ns)
gfc_code_walker (&ns->code, doloop_code, do_function, NULL);
}
+/* This selction deals with inlining calls to MATMUL. */
+
+/* Auxiliary function to build and simplify an array inquiry function.
+ dim is zero-based. */
+
+static gfc_expr *
+get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim)
+{
+ gfc_expr *fcn;
+ gfc_expr *dim_arg, *kind;
+ const char *name;
+ gfc_expr *ec;
+
+ switch (id)
+ {
+ case GFC_ISYM_LBOUND:
+ name = "_gfortran_lbound";
+ break;
+
+ case GFC_ISYM_UBOUND:
+ name = "_gfortran_ubound";
+ break;
+
+ case GFC_ISYM_SIZE:
+ name = "_gfortran_size";
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ dim_arg = gfc_get_int_expr (gfc_default_integer_kind, &e->where, dim);
+ kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
+ gfc_index_integer_kind);
+
+ ec = gfc_copy_expr (e);
+ fcn = gfc_build_intrinsic_call (current_ns, id, name, e->where, 3,
+ ec, dim_arg, kind);
+ gfc_simplify_expr (fcn, 0);
+ return fcn;
+}
+
+/* Builds a logical expression. */
+
+static gfc_expr*
+build_logical_expr (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2)
+{
+ gfc_typespec ts;
+ gfc_expr *res;
+
+ ts.type = BT_LOGICAL;
+ ts.kind = gfc_default_logical_kind;
+ res = gfc_get_expr ();
+ res->where = e1->where;
+ res->expr_type = EXPR_OP;
+ res->value.op.op = op;
+ res->value.op.op1 = e1;
+ res->value.op.op2 = e2;
+ res->ts = ts;
+
+ return res;
+}
+
+
+/* Return an operation of one two gfc_expr (one if e2 is NULL). This assumes
+ compatible typespecs. */
+
+static gfc_expr *
+get_operand (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2)
+{
+ gfc_expr *res;
+
+ res = gfc_get_expr ();
+ res->ts = e1->ts;
+ res->where = e1->where;
+ res->expr_type = EXPR_OP;
+ res->value.op.op = op;
+ res->value.op.op1 = e1;
+ res->value.op.op2 = e2;
+ gfc_simplify_expr (res, 0);
+ return res;
+}
+
+/* Generate the IF statement for a runtime check if we want to do inlining or
+ not - putting in the code for both branches and putting it into the syntax
+ tree is the caller's responsibility. For fixed array sizes, this should be
+ removed by DCE. Only called for rank-two matrices A and B. */
+
+static gfc_code *
+inline_limit_check (gfc_expr *a, gfc_expr *b, enum matrix_case m_case)
+{
+ gfc_expr *inline_limit;
+ gfc_code *if_1, *if_2, *else_2;
+ gfc_expr *b2, *a2, *a1, *m1, *m2;
+ gfc_typespec ts;
+ gfc_expr *cond;
+
+ gcc_assert (m_case == A2B2);
+
+ /* Calculation is done in real to avoid integer overflow. */
+
+ inline_limit = gfc_get_constant_expr (BT_REAL, gfc_default_real_kind,
+ &a->where);
+ mpfr_set_si (inline_limit->value.real, flag_inline_matmul_limit,
+ GFC_RND_MODE);
+ mpfr_pow_ui (inline_limit->value.real, inline_limit->value.real, 3,
+ GFC_RND_MODE);
+
+ a1 = get_array_inq_function (GFC_ISYM_SIZE, a, 1);
+ a2 = get_array_inq_function (GFC_ISYM_SIZE, a, 2);
+ b2 = get_array_inq_function (GFC_ISYM_SIZE, b, 2);
+
+ gfc_clear_ts (&ts);
+ ts.type = BT_REAL;
+ ts.kind = gfc_default_real_kind;
+ gfc_convert_type_warn (a1, &ts, 2, 0);
+ gfc_convert_type_warn (a2, &ts, 2, 0);
+ gfc_convert_type_warn (b2, &ts, 2, 0);
+
+ m1 = get_operand (INTRINSIC_TIMES, a1, a2);
+ m2 = get_operand (INTRINSIC_TIMES, m1, b2);
+
+ cond = build_logical_expr (INTRINSIC_LE, m2, inline_limit);
+ gfc_simplify_expr (cond, 0);
+
+ else_2 = XCNEW (gfc_code);
+ else_2->op = EXEC_IF;
+ else_2->loc = a->where;
+
+ if_2 = XCNEW (gfc_code);
+ if_2->op = EXEC_IF;
+ if_2->expr1 = cond;
+ if_2->loc = a->where;
+ if_2->block = else_2;
+
+ if_1 = XCNEW (gfc_code);
+ if_1->op = EXEC_IF;
+ if_1->block = if_2;
+ if_1->loc = a->where;
+
+ return if_1;
+}
+
+
+/* Insert code to issue a runtime error if the expressions are not equal. */
+
+static gfc_code *
+runtime_error_ne (gfc_expr *e1, gfc_expr *e2, const char *msg)
+{
+ gfc_expr *cond;
+ gfc_code *if_1, *if_2;
+ gfc_code *c;
+ gfc_actual_arglist *a1, *a2, *a3;
+
+ gcc_assert (e1->where.lb);
+ /* Build the call to runtime_error. */
+ c = XCNEW (gfc_code);
+ c->op = EXEC_CALL;
+ c->loc = e1->where;
+
+ /* Get a null-terminated message string. */
+
+ a1 = gfc_get_actual_arglist ();
+ a1->expr = gfc_get_character_expr (gfc_default_character_kind, &e1->where,
+ msg, strlen(msg)+1);
+ c->ext.actual = a1;
+
+ /* Pass the value of the first expression. */
+ a2 = gfc_get_actual_arglist ();
+ a2->expr = gfc_copy_expr (e1);
+ a1->next = a2;
+
+ /* Pass the value of the second expression. */
+ a3 = gfc_get_actual_arglist ();
+ a3->expr = gfc_copy_expr (e2);
+ a2->next = a3;
+
+ gfc_check_fe_runtime_error (c->ext.actual);
+ gfc_resolve_fe_runtime_error (c);
+
+ if_2 = XCNEW (gfc_code);
+ if_2->op = EXEC_IF;
+ if_2->loc = e1->where;
+ if_2->next = c;
+
+ if_1 = XCNEW (gfc_code);
+ if_1->op = EXEC_IF;
+ if_1->block = if_2;
+ if_1->loc = e1->where;
+
+ cond = build_logical_expr (INTRINSIC_NE, e1, e2);
+ gfc_simplify_expr (cond, 0);
+ if_2->expr1 = cond;
+
+ return if_1;
+}
+
+/* Handle matrix reallocation. Caller is responsible to insert into
+ the code tree.
+
+ For the two-dimensional case, build
+
+ if (allocated(c)) then
+ if (size(c,1) /= size(a,1) .or. size(c,2) /= size(b,2)) then
+ deallocate(c)
+ allocate (c(size(a,1), size(b,2)))
+ end if
+ else
+ allocate (c(size(a,1),size(b,2)))
+ end if
+
+ and for the other cases correspondingly.
+*/
+
+static gfc_code *
+matmul_lhs_realloc (gfc_expr *c, gfc_expr *a, gfc_expr *b,
+ enum matrix_case m_case)
+{
+
+ gfc_expr *allocated, *alloc_expr;
+ gfc_code *if_alloc_1, *if_alloc_2, *if_size_1, *if_size_2;
+ gfc_code *else_alloc;
+ gfc_code *deallocate, *allocate1, *allocate_else;
+ gfc_array_ref *ar;
+ gfc_expr *cond, *ne1, *ne2;
+
+ if (warn_realloc_lhs)
+ gfc_warning (OPT_Wrealloc_lhs,
+ "Code for reallocating the allocatable array at %L will "
+ "be added", &c->where);
+
+ alloc_expr = gfc_copy_expr (c);
+
+ ar = gfc_find_array_ref (alloc_expr);
+ gcc_assert (ar && ar->type == AR_FULL);
+
+ /* c comes in as a full ref. Change it into a copy and make it into an
+ element ref so it has the right form for for ALLOCATE. In the same
+ switch statement, also generate the size comparison for the secod IF
+ statement. */
+
+ ar->type = AR_ELEMENT;
+
+ switch (m_case)
+ {
+ case A2B2:
+ ar->start[0] = get_array_inq_function (GFC_ISYM_SIZE, a, 1);
+ ar->start[1] = get_array_inq_function (GFC_ISYM_SIZE, b, 2);
+ ne1 = build_logical_expr (INTRINSIC_NE,
+ get_array_inq_function (GFC_ISYM_SIZE, c, 1),
+ get_array_inq_function (GFC_ISYM_SIZE, a, 1));
+ ne2 = build_logical_expr (INTRINSIC_NE,
+ get_array_inq_function (GFC_ISYM_SIZE, c, 2),
+ get_array_inq_function (GFC_ISYM_SIZE, b, 2));
+ cond = build_logical_expr (INTRINSIC_OR, ne1, ne2);
+ break;
+
+ case A2B1:
+ ar->start[0] = get_array_inq_function (GFC_ISYM_SIZE, a, 1);
+ cond = build_logical_expr (INTRINSIC_NE,
+ get_array_inq_function (GFC_ISYM_SIZE, c, 1),
+ get_array_inq_function (GFC_ISYM_SIZE, a, 2));
+ break;
+
+ case A1B2:
+ ar->start[0] = get_array_inq_function (GFC_ISYM_SIZE, b, 1);
+ cond = build_logical_expr (INTRINSIC_NE,
+ get_array_inq_function (GFC_ISYM_SIZE, c, 1),
+ get_array_inq_function (GFC_ISYM_SIZE, b, 2));
+ break;
+
+ default:
+ gcc_unreachable();
+
+ }
+
+ gfc_simplify_expr (cond, 0);
+
+ /* We need two identical allocate statements in two
+ branches of the IF statement. */
+
+ allocate1 = XCNEW (gfc_code);
+ allocate1->op = EXEC_ALLOCATE;
+ allocate1->ext.alloc.list = gfc_get_alloc ();
+ allocate1->loc = c->where;
+ allocate1->ext.alloc.list->expr = gfc_copy_expr (alloc_expr);
+
+ allocate_else = XCNEW (gfc_code);
+ allocate_else->op = EXEC_ALLOCATE;
+ allocate_else->ext.alloc.list = gfc_get_alloc ();
+ allocate_else->loc = c->where;
+ allocate_else->ext.alloc.list->expr = alloc_expr;
+
+ allocated = gfc_build_intrinsic_call (current_ns, GFC_ISYM_ALLOCATED,
+ "_gfortran_allocated", c->where,
+ 1, gfc_copy_expr (c));
+
+ deallocate = XCNEW (gfc_code);
+ deallocate->op = EXEC_DEALLOCATE;
+ deallocate->ext.alloc.list = gfc_get_alloc ();
+ deallocate->ext.alloc.list->expr = gfc_copy_expr (c);
+ deallocate->next = allocate1;
+ deallocate->loc = c->where;
+
+ if_size_2 = XCNEW (gfc_code);
+ if_size_2->op = EXEC_IF;
+ if_size_2->expr1 = cond;
+ if_size_2->loc = c->where;
+ if_size_2->next = deallocate;
+
+ if_size_1 = XCNEW (gfc_code);
+ if_size_1->op = EXEC_IF;
+ if_size_1->block = if_size_2;
+ if_size_1->loc = c->where;
+
+ else_alloc = XCNEW (gfc_code);
+ else_alloc->op = EXEC_IF;
+ else_alloc->loc = c->where;
+ else_alloc->next = allocate_else;
+
+ if_alloc_2 = XCNEW (gfc_code);
+ if_alloc_2->op = EXEC_IF;
+ if_alloc_2->expr1 = allocated;
+ if_alloc_2->loc = c->where;
+ if_alloc_2->next = if_size_1;
+ if_alloc_2->block = else_alloc;
+
+ if_alloc_1 = XCNEW (gfc_code);
+ if_alloc_1->op = EXEC_IF;
+ if_alloc_1->block = if_alloc_2;
+ if_alloc_1->loc = c->where;
+
+ return if_alloc_1;
+}
+
+/* Callback function for has_function_or_op. */
+
+static int
+is_function_or_op (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED,
+ void *data ATTRIBUTE_UNUSED)
+{
+ if ((*e) == 0)
+ return 0;
+ else
+ return (*e)->expr_type == EXPR_FUNCTION
+ || (*e)->expr_type == EXPR_OP;
+}
+
+/* Returns true if the expression contains a function. */
+
+static bool
+has_function_or_op (gfc_expr **e)
+{
+ if (e == NULL)
+ return false;
+ else
+ return gfc_expr_walker (e, is_function_or_op, NULL);
+}
+
+/* Freeze (assign to a temporary variable) a single expression. */
+
+static void
+freeze_expr (gfc_expr **ep)
+{
+ gfc_expr *ne;
+ if (has_function_or_op (ep))
+ {
+ ne = create_var (*ep, "freeze");
+ *ep = ne;
+ }
+}
+
+/* Go through an expression's references and assign them to temporary
+ variables if they contain functions. This is usually done prior to
+ front-end scalarization to avoid multiple invocations of functions. */
+
+static void
+freeze_references (gfc_expr *e)
+{
+ gfc_ref *r;
+ gfc_array_ref *ar;
+ int i;
+
+ for (r=e->ref; r; r=r->next)
+ {
+ if (r->type == REF_SUBSTRING)
+ {
+ if (r->u.ss.start != NULL)
+ freeze_expr (&r->u.ss.start);
+
+ if (r->u.ss.end != NULL)
+ freeze_expr (&r->u.ss.end);
+ }
+ else if (r->type == REF_ARRAY)
+ {
+ ar = &r->u.ar;
+ switch (ar->type)
+ {
+ case AR_FULL:
+ break;
+
+ case AR_SECTION:
+ for (i=0; i<ar->dimen; i++)
+ {
+ if (ar->dimen_type[i] == DIMEN_RANGE)
+ {
+ freeze_expr (&ar->start[i]);
+ freeze_expr (&ar->end[i]);
+ freeze_expr (&ar->stride[i]);
+ }
+ else if (ar->dimen_type[i] == DIMEN_ELEMENT)
+ {
+ freeze_expr (&ar->start[i]);
+ }
+ }
+ break;
+
+ case AR_ELEMENT:
+ for (i=0; i<ar->dimen; i++)
+ freeze_expr (&ar->start[i]);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+}
+
+/* Convert to gfc_index_integer_kind if needed, just do a copy otherwise. */
+
+static gfc_expr *
+convert_to_index_kind (gfc_expr *e)
+{
+ gfc_expr *res;
+
+ gcc_assert (e != NULL);
+
+ res = gfc_copy_expr (e);
+
+ gcc_assert (e->ts.type == BT_INTEGER);
+
+ if (res->ts.kind != gfc_index_integer_kind)
+ {
+ gfc_typespec ts;
+ gfc_clear_ts (&ts);
+ ts.type = BT_INTEGER;
+ ts.kind = gfc_index_integer_kind;
+
+ gfc_convert_type_warn (e, &ts, 2, 0);
+ }
+
+ return res;
+}
+
+/* Function to create a DO loop including creation of the
+ iteration variable. gfc_expr are copied.*/
+
+static gfc_code *
+create_do_loop (gfc_expr *start, gfc_expr *end, gfc_expr *step, locus *where,
+ gfc_namespace *ns, char *vname)
+{
+
+ char name[GFC_MAX_SYMBOL_LEN +1];
+ gfc_symtree *symtree;
+ gfc_symbol *symbol;
+ gfc_expr *i;
+ gfc_code *n, *n2;
+
+ /* Create an expression for the iteration variable. */
+ if (vname)
+ sprintf (name, "__var_%d_do_%s", var_num++, vname);
+ else
+ sprintf (name, "__var_%d_do", var_num++);
+
+
+ if (gfc_get_sym_tree (name, ns, &symtree, false) != 0)
+ gcc_unreachable ();
+
+ /* Create the loop variable. */
+
+ symbol = symtree->n.sym;
+ symbol->ts.type = BT_INTEGER;
+ symbol->ts.kind = gfc_index_integer_kind;
+ symbol->attr.flavor = FL_VARIABLE;
+ symbol->attr.referenced = 1;
+ symbol->attr.dimension = 0;
+ symbol->attr.fe_temp = 1;
+ gfc_commit_symbol (symbol);
+
+ i = gfc_get_expr ();
+ i->expr_type = EXPR_VARIABLE;
+ i->ts = symbol->ts;
+ i->rank = 0;
+ i->where = *where;
+ i->symtree = symtree;
+
+ /* ... and the nested DO statements. */
+ n = XCNEW (gfc_code);
+ n->op = EXEC_DO;
+ n->loc = *where;
+ n->ext.iterator = gfc_get_iterator ();
+ n->ext.iterator->var = i;
+ n->ext.iterator->start = convert_to_index_kind (start);
+ n->ext.iterator->end = convert_to_index_kind (end);
+ if (step)
+ n->ext.iterator->step = convert_to_index_kind (step);
+ else
+ n->ext.iterator->step = gfc_get_int_expr (gfc_index_integer_kind,
+ where, 1);
+
+ n2 = XCNEW (gfc_code);
+ n2->op = EXEC_DO;
+ n2->loc = *where;
+ n2->next = NULL;
+ n->block = n2;
+ return n;
+}
+
+/* Get the upper bound of the DO loops for matmul along a dimension. This
+ is one-based. */
+
+static gfc_expr*
+get_size_m1 (gfc_expr *e, int dimen)
+{
+ mpz_t size;
+ gfc_expr *res;
+
+ if (gfc_array_dimen_size (e, dimen - 1, &size))
+ {
+ res = gfc_get_constant_expr (BT_INTEGER,
+ gfc_index_integer_kind, &e->where);
+ mpz_sub_ui (res->value.integer, size, 1);
+ mpz_clear (size);
+ }
+ else
+ {
+ res = get_operand (INTRINSIC_MINUS,
+ get_array_inq_function (GFC_ISYM_SIZE, e, dimen),
+ gfc_get_int_expr (gfc_index_integer_kind,
+ &e->where, 1));
+ gfc_simplify_expr (res, 0);
+ }
+
+ return res;
+}
+
+/* Function to return a scalarized expression. It is assumed that indices are
+ zero based to make generation of DO loops easier. A zero as index will
+ access the first element along a dimension. Single element references will
+ be skipped. A NULL as an expression will be replaced by a full reference.
+ This assumes that the index loops have gfc_index_integer_kind, and that all
+ references have been frozen. */
+
+static gfc_expr*
+scalarized_expr (gfc_expr *e_in, gfc_expr **index, int count_index)
+{
+ gfc_array_ref *ar;
+ int i;
+ int rank;
+ gfc_expr *e;
+ int i_index;
+ bool was_fullref;
+
+ e = gfc_copy_expr(e_in);
+
+ rank = e->rank;
+
+ ar = gfc_find_array_ref (e);
+
+ /* We scalarize count_index variables, reducing the rank by count_index. */
+
+ e->rank = rank - count_index;
+
+ was_fullref = ar->type == AR_FULL;
+
+ if (e->rank == 0)
+ ar->type = AR_ELEMENT;
+ else
+ ar->type = AR_SECTION;
+
+ /* Loop over the indices. For each index, create the expression
+ index * stride + lbound(e, dim). */
+
+ i_index = 0;
+ for (i=0; i < ar->dimen; i++)
+ {
+ if (was_fullref || ar->dimen_type[i] == DIMEN_RANGE)
+ {
+ if (index[i_index] != NULL)
+ {
+ gfc_expr *lbound, *nindex;
+ gfc_expr *loopvar;
+
+ loopvar = gfc_copy_expr (index[i_index]);
+
+ if (ar->stride[i])
+ {
+ gfc_expr *tmp;
+
+ tmp = gfc_copy_expr(ar->stride[i]);
+ if (tmp->ts.kind != gfc_index_integer_kind)
+ {
+ gfc_typespec ts;
+ gfc_clear_ts (&ts);
+ ts.type = BT_INTEGER;
+ ts.kind = gfc_index_integer_kind;
+ gfc_convert_type (tmp, &ts, 2);
+ }
+ nindex = get_operand (INTRINSIC_TIMES, loopvar, tmp);
+ }
+ else
+ nindex = loopvar;
+
+ /* Calculate the lower bound of the expression. */
+ if (ar->start[i])
+ {
+ lbound = gfc_copy_expr (ar->start[i]);
+ if (lbound->ts.kind != gfc_index_integer_kind)
+ {
+ gfc_typespec ts;
+ gfc_clear_ts (&ts);
+ ts.type = BT_INTEGER;
+ ts.kind = gfc_index_integer_kind;
+ gfc_convert_type (lbound, &ts, 2);
+
+ }
+ }
+ else
+ {
+ if (!was_fullref)
+ {
+ /* Look at full individual sections, like a(:). The first index
+ is the lbound of a full ref. */
+
+ gfc_array_ref *ar;
+
+ ar = gfc_find_array_ref (e_in);
+ ar->type = AR_FULL;
+ }
+ lbound = get_array_inq_function (GFC_ISYM_LBOUND, e_in,
+ i_index + 1);
+ }
+
+ ar->dimen_type[i] = DIMEN_ELEMENT;
+
+ gfc_free_expr (ar->start[i]);
+ ar->start[i] = get_operand (INTRINSIC_PLUS, nindex, lbound);
+
+ gfc_free_expr (ar->end[i]);
+ ar->end[i] = NULL;
+ gfc_free_expr (ar->stride[i]);
+ ar->stride[i] = NULL;
+ gfc_simplify_expr (ar->start[i], 0);
+ }
+ else if (was_fullref)
+ {
+ gfc_internal_error ("Scalarization using DIMEN_RANGE unimplemented");
+ }
+ i_index ++;
+ }
+ }
+ return e;
+}
+
+
+/* Inline assignments of the form c = matmul(a,b).
+ Handle only the cases currently where b and c are rank-two arrays.
+
+ This basically translates the code to
+
+ BLOCK
+ integer i,j,k
+ c = 0
+ do j=0, size(b,2)-1
+ do k=0, size(a, 2)-1
+ do i=0, size(a, 1)-1
+ c(i * stride(c,1) + lbound(c,1), j * stride(c,2) + lbound(c,2)) =
+ c(i * stride(c,1) + lbound(c,1), j * stride(c,2) + lbound(c,2)) +
+ a(i * stride(a,1) + lbound(a,1), k * stride(a,2) + lbound(a,2)) *
+ b(k * stride(b,1) + lbound(b,1), j * stride(b,2) + lbound(b,2))
+ end do
+ end do
+ end do
+ END BLOCK
+
+*/
+
+static int
+inline_matmul_assign (gfc_code **c, int *walk_subtrees,
+ void *data ATTRIBUTE_UNUSED)
+{
+ gfc_code *co = *c;
+ gfc_expr *expr1, *expr2;
+ gfc_expr *matrix_a, *matrix_b;
+ gfc_actual_arglist *a, *b;
+ gfc_code *do_1, *do_2, *do_3, *assign_zero, *assign_matmul;
+ gfc_expr *zero_e;
+ gfc_expr *u1, *u2, *u3;
+ gfc_expr *list[2];
+ gfc_expr *ascalar, *bscalar, *cscalar;
+ gfc_expr *mult;
+ gfc_expr *var_1, *var_2, *var_3;
+ gfc_expr *zero;
+ gfc_namespace *ns;
+ gfc_intrinsic_op op_times, op_plus;
+ enum matrix_case m_case;
+ int i;
+ gfc_code *if_limit = NULL;
+ gfc_code **next_code_point;
+
+ if (co->op != EXEC_ASSIGN)
+ return 0;
+
+ expr1 = co->expr1;
+ expr2 = co->expr2;
+ if (expr2->expr_type != EXPR_FUNCTION
+ || expr2->value.function.isym == NULL
+ || expr2->value.function.isym->id != GFC_ISYM_MATMUL)
+ return 0;
+
+ current_code = c;
+ inserted_block = NULL;
+ changed_statement = NULL;
+
+ a = expr2->value.function.actual;
+ matrix_a = a->expr;
+ b = a->next;
+ matrix_b = b->expr;
+
+ /* Currently only handling direct variables. Transpose etc. will come
+ later. */
+
+ if (matrix_a->expr_type != EXPR_VARIABLE
+ || matrix_b->expr_type != EXPR_VARIABLE)
+ return 0;
+
+ if (matrix_a->rank == 2)
+ m_case = matrix_b->rank == 1 ? A2B1 : A2B2;
+ else
+ m_case = A1B2;
+
+ /* We do not handle data dependencies yet. */
+ if (gfc_check_dependency (expr1, matrix_a, true)
+ || gfc_check_dependency (expr1, matrix_b, true))
+ return 0;
+
+ ns = insert_block ();
+
+ /* Assign the type of the zero expression for initializing the resulting
+ array, and the expression (+ and * for real, integer and complex;
+ .and. and .or for logical. */
+
+ switch(expr1->ts.type)
+ {
+ case BT_INTEGER:
+ zero_e = gfc_get_int_expr (expr1->ts.kind, &expr1->where, 0);
+ op_times = INTRINSIC_TIMES;
+ op_plus = INTRINSIC_PLUS;
+ break;
+
+ case BT_LOGICAL:
+ op_times = INTRINSIC_AND;
+ op_plus = INTRINSIC_OR;
+ zero_e = gfc_get_logical_expr (expr1->ts.kind, &expr1->where,
+ 0);
+ break;
+ case BT_REAL:
+ zero_e = gfc_get_constant_expr (BT_REAL, expr1->ts.kind,
+ &expr1->where);
+ mpfr_set_si (zero_e->value.real, 0, GFC_RND_MODE);
+ op_times = INTRINSIC_TIMES;
+ op_plus = INTRINSIC_PLUS;
+ break;
+
+ case BT_COMPLEX:
+ zero_e = gfc_get_constant_expr (BT_COMPLEX, expr1->ts.kind,
+ &expr1->where);
+ mpc_set_si_si (zero_e->value.complex, 0, 0, GFC_RND_MODE);
+ op_times = INTRINSIC_TIMES;
+ op_plus = INTRINSIC_PLUS;
+
+ break;
+
+ default:
+ gcc_unreachable();
+ }
+
+ current_code = &ns->code;
+
+ /* Freeze the references, keeping track of how many temporary variables were
+ created. */
+ n_vars = 0;
+ freeze_references (matrix_a);
+ freeze_references (matrix_b);
+ freeze_references (expr1);
+
+ if (n_vars == 0)
+ next_code_point = current_code;
+ else
+ {
+ next_code_point = &ns->code;
+ for (i=0; i<n_vars; i++)
+ next_code_point = &(*next_code_point)->next;
+ }
+
+ /* Take care of the inline flag. If the limit check evaluates to a
+ constant, dead code elimination will eliminate the unneeded branch. */
+
+ if (m_case == A2B2 && flag_inline_matmul_limit > 0)
+ {
+ if_limit = inline_limit_check (matrix_a, matrix_b, m_case);
+
+ /* Insert the original statement into the else branch. */
+ if_limit->block->block->next = co;
+ co->next = NULL;
+
+ /* ... and the new ones go into the original one. */
+ *next_code_point = if_limit;
+ next_code_point = &if_limit->block->next;
+ }
+
+ assign_zero = XCNEW (gfc_code);
+ assign_zero->op = EXEC_ASSIGN;
+ assign_zero->loc = co->loc;
+ assign_zero->expr1 = gfc_copy_expr (expr1);
+ assign_zero->expr2 = zero_e;
+
+ /* Handle the reallocation, if needed. */
+ if (flag_realloc_lhs && gfc_is_reallocatable_lhs (expr1))
+ {
+ gfc_code *lhs_alloc;
+
+ /* Only need to check a single dimension for the A2B2 case for
+ bounds checking, the rest will be allocated. */
+
+ if (gfc_option.rtcheck & GFC_RTCHECK_BOUNDS && m_case == A2B2)
+ {
+ gfc_code *test;
+ gfc_expr *a2, *b1;
+
+ a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
+ b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+ test = runtime_error_ne (b1, a2, "Dimension of array B incorrect "
+ "in MATMUL intrinsic: Is %ld, should be %ld");
+ *next_code_point = test;
+ next_code_point = &test->next;
+ }
+
+
+ lhs_alloc = matmul_lhs_realloc (expr1, matrix_a, matrix_b, m_case);
+
+ *next_code_point = lhs_alloc;
+ next_code_point = &lhs_alloc->next;
+
+ }
+ else if (gfc_option.rtcheck & GFC_RTCHECK_BOUNDS)
+ {
+ gfc_code *test;
+ gfc_expr *a2, *b1, *c1, *c2, *a1, *b2;
+
+ if (m_case == A2B2 || m_case == A2B1)
+ {
+ a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
+ b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+ test = runtime_error_ne (b1, a2, "Dimension of array B incorrect "
+ "in MATMUL intrinsic: Is %ld, should be %ld");
+ *next_code_point = test;
+ next_code_point = &test->next;
+
+ c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
+ a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
+
+ if (m_case == A2B2)
+ test = runtime_error_ne (c1, a1, "Incorrect extent in return array in "
+ "MATMUL intrinsic for dimension 1: "
+ "is %ld, should be %ld");
+ else if (m_case == A2B1)
+ test = runtime_error_ne (c1, a1, "Incorrect extent in return array in "
+ "MATMUL intrinsic: "
+ "is %ld, should be %ld");
+
+
+ *next_code_point = test;
+ next_code_point = &test->next;
+ }
+ else if (m_case == A1B2)
+ {
+ a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
+ b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
+ test = runtime_error_ne (b1, a1, "Dimension of array B incorrect "
+ "in MATMUL intrinsic: Is %ld, should be %ld");
+ *next_code_point = test;
+ next_code_point = &test->next;
+
+ c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
+ b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
+
+ test = runtime_error_ne (c1, b2, "Incorrect extent in return array in "
+ "MATMUL intrinsic: "
+ "is %ld, should be %ld");
+
+ *next_code_point = test;
+ next_code_point = &test->next;
+ }
+
+ if (m_case == A2B2)
+ {
+ c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
+ b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
+ test = runtime_error_ne (c2, b2, "Incorrect extent in return array in "
+ "MATMUL intrinsic for dimension 2: is %ld, should be %ld");
+
+ *next_code_point = test;
+ next_code_point = &test->next;
+ }
+ }
+
+ *next_code_point = assign_zero;
+
+ zero = gfc_get_int_expr (gfc_index_integer_kind, &co->loc, 0);
+
+ assign_matmul = XCNEW (gfc_code);
+ assign_matmul->op = EXEC_ASSIGN;
+ assign_matmul->loc = co->loc;
+
+ /* Get the bounds for the loops, create them and create the scalarized
+ expressions. */
+
+ switch (m_case)
+ {
+ case A2B2:
+ inline_limit_check (matrix_a, matrix_b, m_case);
+
+ u1 = get_size_m1 (matrix_b, 2);
+ u2 = get_size_m1 (matrix_a, 2);
+ u3 = get_size_m1 (matrix_a, 1);
+
+ do_1 = create_do_loop (gfc_copy_expr (zero), u1, NULL, &co->loc, ns);
+ do_2 = create_do_loop (gfc_copy_expr (zero), u2, NULL, &co->loc, ns);
+ do_3 = create_do_loop (gfc_copy_expr (zero), u3, NULL, &co->loc, ns);
+
+ do_1->block->next = do_2;
+ do_2->block->next = do_3;
+ do_3->block->next = assign_matmul;
+
+ var_1 = do_1->ext.iterator->var;
+ var_2 = do_2->ext.iterator->var;
+ var_3 = do_3->ext.iterator->var;
+
+ list[0] = var_3;
+ list[1] = var_1;
+ cscalar = scalarized_expr (gfc_copy_expr (co->expr1), list, 2);
+
+ list[0] = var_3;
+ list[1] = var_2;
+ ascalar = scalarized_expr (gfc_copy_expr (matrix_a), list, 2);
+
+ list[0] = var_2;
+ list[1] = var_1;
+ bscalar = scalarized_expr (gfc_copy_expr (matrix_b), list, 2);
+
+ break;
+
+ case A2B1:
+ u1 = get_size_m1 (matrix_b, 1);
+ u2 = get_size_m1 (matrix_a, 1);
+
+ do_1 = create_do_loop (gfc_copy_expr (zero), u1, NULL, &co->loc, ns);
+ do_2 = create_do_loop (gfc_copy_expr (zero), u2, NULL, &co->loc, ns);
+
+ do_1->block->next = do_2;
+ do_2->block->next = assign_matmul;
+
+ var_1 = do_1->ext.iterator->var;
+ var_2 = do_2->ext.iterator->var;
+
+ list[0] = var_2;
+ cscalar = scalarized_expr (gfc_copy_expr (co->expr1), list, 1);
+
+ list[0] = var_2;
+ list[1] = var_1;
+ ascalar = scalarized_expr (gfc_copy_expr (matrix_a), list, 2);
+
+ list[0] = var_1;
+ bscalar = scalarized_expr (gfc_copy_expr (matrix_b), list, 1);
+
+ break;
+
+ case A1B2:
+ u1 = get_size_m1 (matrix_b, 2);
+ u2 = get_size_m1 (matrix_a, 1);
+
+ do_1 = create_do_loop (gfc_copy_expr (zero), u1, NULL, &co->loc, ns);
+ do_2 = create_do_loop (gfc_copy_expr (zero), u2, NULL, &co->loc, ns);
+
+ do_1->block->next = do_2;
+ do_2->block->next = assign_matmul;
+
+ var_1 = do_1->ext.iterator->var;
+ var_2 = do_2->ext.iterator->var;
+
+ list[0] = var_1;
+ cscalar = scalarized_expr (gfc_copy_expr (co->expr1), list, 1);
+
+ list[0] = var_2;
+ ascalar = scalarized_expr (gfc_copy_expr (matrix_a), list, 1);
+
+ list[0] = var_2;
+ list[1] = var_1;
+ bscalar = scalarized_expr (gfc_copy_expr (matrix_b), list, 2);
+
+ break;
+
+ default:
+ gcc_unreachable();
+ }
+
+ /* First loop comes after the zero assignment. */
+ assign_zero->next = do_1;
+
+ /* Build the assignment expression in the loop. */
+ assign_matmul->expr1 = gfc_copy_expr (cscalar);
+
+ mult = get_operand (op_times, ascalar, bscalar);
+ assign_matmul->expr2 = get_operand (op_plus, cscalar, mult);
+
+ /* If we don't want to keep the original statement around in
+ the else branch, we can free it. */
+
+ if (if_limit == NULL)
+ gfc_free_statements(co);
+ else
+ co->next = NULL;
+
+ gfc_free_expr (zero);
+ *walk_subtrees = 0;
+ return 0;
+}
#define WALK_SUBEXPR(NODE) \
do \
diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index 832a6ce3ebe..6c54da87182 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -419,6 +419,7 @@ enum gfc_isym_id
GFC_ISYM_EXPONENT,
GFC_ISYM_EXTENDS_TYPE_OF,
GFC_ISYM_FDATE,
+ GFC_ISYM_FE_RUNTIME_ERROR,
GFC_ISYM_FGET,
GFC_ISYM_FGETC,
GFC_ISYM_FLOOR,
@@ -1001,7 +1002,6 @@ typedef struct
bool cp_was_assumed; /* AS_ASSUMED_SIZE cp arrays are converted to
AS_EXPLICIT, but we want to remember that we
did this. */
-
}
gfc_array_spec;
@@ -1907,7 +1907,7 @@ typedef struct gfc_intrinsic_sym
gfc_typespec ts;
unsigned elemental:1, inquiry:1, transformational:1, pure:1,
generic:1, specific:1, actual_ok:1, noreturn:1, conversion:1,
- from_module:1;
+ from_module:1, vararg:1;
int standard;
@@ -3231,4 +3231,8 @@ int gfc_code_walker (gfc_code **, walk_code_fn_t, walk_expr_fn_t, void *);
void gfc_convert_mpz_to_signed (mpz_t, int);
+/* trans-array.c */
+
+bool gfc_is_reallocatable_lhs (gfc_expr *);
+
#endif /* GCC_GFORTRAN_H */
diff --git a/gcc/fortran/intrinsic.c b/gcc/fortran/intrinsic.c
index a958f8ec9d1..1c0c388c512 100644
--- a/gcc/fortran/intrinsic.c
+++ b/gcc/fortran/intrinsic.c
@@ -520,6 +520,29 @@ add_sym_1s (const char *name, gfc_isym_id id, enum klass cl, bt type, int kind,
(void *) 0);
}
+/* Add a symbol to the subroutine ilst where the subroutine takes one
+ printf-style character argument and a variable number of arguments
+ to follow. */
+
+static void
+add_sym_1p (const char *name, gfc_isym_id id, enum klass cl, bt type, int kind,
+ int standard, bool (*check) (gfc_actual_arglist *),
+ gfc_expr *(*simplify) (gfc_expr*), void (*resolve) (gfc_code *),
+ const char *a1, bt type1, int kind1, int optional1, sym_intent intent1)
+{
+ gfc_check_f cf;
+ gfc_simplify_f sf;
+ gfc_resolve_f rf;
+
+ cf.f1m = check;
+ sf.f1 = simplify;
+ rf.s1 = resolve;
+
+ add_sym (name, id, cl, ACTUAL_NO, type, kind, standard, cf, sf, rf,
+ a1, type1, kind1, optional1, intent1,
+ (void *) 0);
+}
+
/* Add a symbol from the MAX/MIN family of intrinsic functions to the
function. MAX et al take 2 or more arguments. */
@@ -1159,6 +1182,17 @@ make_from_module (void)
next_sym[-1].from_module = 1;
}
+
+/* Mark the current subroutine as having a variable number of
+ arguments. */
+
+static void
+make_vararg (void)
+{
+ if (sizing == SZ_NOTHING)
+ next_sym[-1].vararg = 1;
+}
+
/* Set the attr.value of the current procedure. */
static void
@@ -3292,6 +3326,17 @@ add_subroutines (void)
"fptr", BT_UNKNOWN, 0, REQUIRED, INTENT_OUT);
make_from_module();
+ /* Internal subroutine for emitting a runtime error. */
+
+ add_sym_1p ("fe_runtime_error", GFC_ISYM_FE_RUNTIME_ERROR, CLASS_IMPURE,
+ BT_UNKNOWN, 0, GFC_STD_GNU,
+ gfc_check_fe_runtime_error, NULL, gfc_resolve_fe_runtime_error,
+ "msg", BT_CHARACTER, dc, REQUIRED, INTENT_IN);
+
+ make_noreturn ();
+ make_vararg ();
+ make_from_module ();
+
/* Coarray collectives. */
add_sym_4s ("co_broadcast", GFC_ISYM_CO_BROADCAST, CLASS_IMPURE,
BT_UNKNOWN, 0, GFC_STD_F2008_TS,
@@ -4501,7 +4546,7 @@ gfc_intrinsic_sub_interface (gfc_code *c, int error_flag)
init_arglist (isym);
- if (!sort_actual (name, &c->ext.actual, isym->formal, &c->loc))
+ if (!isym->vararg && !sort_actual (name, &c->ext.actual, isym->formal, &c->loc))
goto fail;
if (!do_ts29113_check (isym, c->ext.actual))
diff --git a/gcc/fortran/intrinsic.h b/gcc/fortran/intrinsic.h
index be7f214dddd..a9f16f52743 100644
--- a/gcc/fortran/intrinsic.h
+++ b/gcc/fortran/intrinsic.h
@@ -190,6 +190,7 @@ bool gfc_check_system_clock (gfc_expr *, gfc_expr *, gfc_expr *);
bool gfc_check_date_and_time (gfc_expr *, gfc_expr *, gfc_expr *, gfc_expr *);
bool gfc_check_exit (gfc_expr *);
bool gfc_check_fdate_sub (gfc_expr *);
+bool gfc_check_fe_runtime_error (gfc_actual_arglist *);
bool gfc_check_flush (gfc_expr *);
bool gfc_check_free (gfc_expr *);
bool gfc_check_fstat_sub (gfc_expr *, gfc_expr *, gfc_expr *);
@@ -602,6 +603,7 @@ void gfc_resolve_ctime_sub (gfc_code *);
void gfc_resolve_execute_command_line (gfc_code *);
void gfc_resolve_exit (gfc_code *);
void gfc_resolve_fdate_sub (gfc_code *);
+void gfc_resolve_fe_runtime_error (gfc_code *);
void gfc_resolve_flush (gfc_code *);
void gfc_resolve_free (gfc_code *);
void gfc_resolve_fseek_sub (gfc_code *);
diff --git a/gcc/fortran/invoke.texi b/gcc/fortran/invoke.texi
index af123dbbe5a..fc9f1b0fa1d 100644
--- a/gcc/fortran/invoke.texi
+++ b/gcc/fortran/invoke.texi
@@ -178,6 +178,7 @@ and warnings}.
-finit-character=@var{n} -finit-integer=@var{n} -finit-local-zero @gol
-finit-logical=@var{<true|false>}
-finit-real=@var{<zero|inf|-inf|nan|snan>} @gol
+-finline-matmul-limit=@var{n} @gol
-fmax-array-constructor=@var{n} -fmax-stack-var-size=@var{n}
-fno-align-commons @gol
-fno-automatic -fno-protect-parens -fno-underscoring @gol
@@ -1537,6 +1538,22 @@ geometric mean of the dimensions of the argument and result matrices.
The default value for @var{n} is 30.
+@item -finline-matmul-limit=@var{n}
+@opindex @code{finline-matmul-limit}
+When front-end optimiztion is active, some calls to the @code{MATMUL}
+intrinsic function will be inlined. This may result in code size
+increase if the size of the matrix cannot be determined at compile
+time, as code for both cases is generated. Setting
+@code{-finline-matmul-limit=0} will disable inlining in all cases.
+Setting this option with a value of @var{n} will produce inline code
+for matrices with size up to @var{n}. If the matrices involved are not
+square, the size comparison is performed using the geometric mean of
+the dimensions of the argument and result matrices.
+
+The default value for @var{n} is the value specified for
+@code{-fblas-matmul-limit} if this option is specified, or unlimitited
+otherwise.
+
@item -frecursive
@opindex @code{frecursive}
Allow indirect recursion by forcing all local arrays to be allocated
@@ -1632,11 +1649,12 @@ if @option{-ffrontend-optimize} is in effect.
@cindex Front-end optimization
This option performs front-end optimization, based on manipulating
parts the Fortran parse tree. Enabled by default by any @option{-O}
-option. Optimizations enabled by this option include elimination of
-identical function calls within expressions, removing unnecessary
-calls to @code{TRIM} in comparisons and assignments and replacing
-@code{TRIM(a)} with @code{a(1:LEN_TRIM(a))}.
-It can be deselected by specifying @option{-fno-frontend-optimize}.
+option. Optimizations enabled by this option include inlining calls
+to @code{MATMUL}, elimination of identical function calls within
+expressions, removing unnecessary calls to @code{TRIM} in comparisons
+and assignments and replacing @code{TRIM(a)} with
+@code{a(1:LEN_TRIM(a))}. It can be deselected by specifying
+@option{-fno-frontend-optimize}.
@end table
@xref{Code Gen Options,,Options for Code Generation Conventions,
diff --git a/gcc/fortran/iresolve.c b/gcc/fortran/iresolve.c
index 6fa0994cf23..32ef0719ead 100644
--- a/gcc/fortran/iresolve.c
+++ b/gcc/fortran/iresolve.c
@@ -2197,6 +2197,19 @@ gfc_resolve_rrspacing (gfc_expr *f, gfc_expr *x)
f->value.function.name = gfc_get_string ("__rrspacing_%d", x->ts.kind);
}
+void
+gfc_resolve_fe_runtime_error (gfc_code *c)
+{
+ const char *name;
+ gfc_actual_arglist *a;
+
+ name = gfc_get_string (PREFIX ("runtime_error"));
+
+ for (a = c->ext.actual->next; a; a = a->next)
+ a->name = "%VAL";
+
+ c->resolved_sym = gfc_get_intrinsic_sub_symbol (name);
+}
void
gfc_resolve_scale (gfc_expr *f, gfc_expr *x, gfc_expr *i ATTRIBUTE_UNUSED)
diff --git a/gcc/fortran/lang.opt b/gcc/fortran/lang.opt
index d86376a917c..a81c4c52b94 100644
--- a/gcc/fortran/lang.opt
+++ b/gcc/fortran/lang.opt
@@ -542,6 +542,10 @@ Enum(gfc_init_local_real) String(inf) Value(GFC_INIT_REAL_INF)
EnumValue
Enum(gfc_init_local_real) String(-inf) Value(GFC_INIT_REAL_NEG_INF)
+finline-matmul-limit=
+Fortran RejectNegative Joined UInteger Var(flag_inline_matmul_limit) Init(-1)
+-finline-matmul-limit=<n> Specify the size of the largest matrix for which matmul will be inlined
+
fmax-array-constructor=
Fortran RejectNegative Joined UInteger Var(flag_max_array_constructor) Init(65535)
-fmax-array-constructor=<n> Maximum number of objects in an array constructor
diff --git a/gcc/fortran/options.c b/gcc/fortran/options.c
index 5a35803beef..d47cf2acc1c 100644
--- a/gcc/fortran/options.c
+++ b/gcc/fortran/options.c
@@ -378,6 +378,11 @@ gfc_post_options (const char **pfilename)
if (!flag_automatic)
flag_max_stack_var_size = 0;
+ /* If we call BLAS directly, only inline up to the BLAS limit. */
+
+ if (flag_external_blas && flag_inline_matmul_limit < 0)
+ flag_inline_matmul_limit = flag_blas_matmul_limit;
+
/* Optimization implies front end optimization, unless the user
specified it directly. */
diff --git a/gcc/fortran/trans-array.h b/gcc/fortran/trans-array.h
index 76bad2a199a..2310b659a49 100644
--- a/gcc/fortran/trans-array.h
+++ b/gcc/fortran/trans-array.h
@@ -64,8 +64,6 @@ tree gfc_copy_only_alloc_comp (gfc_symbol *, tree, tree, int);
tree gfc_alloc_allocatable_for_assignment (gfc_loopinfo*, gfc_expr*, gfc_expr*);
-bool gfc_is_reallocatable_lhs (gfc_expr *);
-
/* Add initialization for deferred arrays. */
void gfc_trans_deferred_array (gfc_symbol *, gfc_wrapped_block *);
/* Generate an initializer for a static pointer or allocatable array. */
diff --git a/gcc/testsuite/gfortran.dg/dependency_26.f90 b/gcc/testsuite/gfortran.dg/dependency_26.f90
index d37307c5cc4..0be18a35a8d 100644
--- a/gcc/testsuite/gfortran.dg/dependency_26.f90
+++ b/gcc/testsuite/gfortran.dg/dependency_26.f90
@@ -1,5 +1,5 @@
! { dg-do compile }
-! { dg-options "-fdump-tree-original" }
+! { dg-options "-finline-matmul-limit=0 -fdump-tree-original" }
!
! Test the fix for PR36932 and PR36933, in which unnecessary
! temporaries were being generated. The module m2 tests the
diff --git a/gcc/testsuite/gfortran.dg/function_optimize_1.f90 b/gcc/testsuite/gfortran.dg/function_optimize_1.f90
index b023121831b..88527a3334e 100644
--- a/gcc/testsuite/gfortran.dg/function_optimize_1.f90
+++ b/gcc/testsuite/gfortran.dg/function_optimize_1.f90
@@ -1,5 +1,5 @@
! { dg-do compile }
-! { dg-options "-O -fdump-tree-original -Warray-temporaries" }
+! { dg-options "-O -fdump-tree-original -finline-matmul-limit=0 -Warray-temporaries" }
program main
implicit none
real, dimension(2,2) :: a, b, c, d
diff --git a/gcc/testsuite/gfortran.dg/function_optimize_2.f90 b/gcc/testsuite/gfortran.dg/function_optimize_2.f90
index 8105661b8f5..153270b5312 100644
--- a/gcc/testsuite/gfortran.dg/function_optimize_2.f90
+++ b/gcc/testsuite/gfortran.dg/function_optimize_2.f90
@@ -1,5 +1,5 @@
! { dg-do compile }
-! { dg-options "-O -faggressive-function-elimination -fdump-tree-original" }
+! { dg-options "-O -finline-matmul-limit=0 -faggressive-function-elimination -fdump-tree-original" }
program main
implicit none
real, dimension(2,2) :: a, b, c, d
diff --git a/gcc/testsuite/gfortran.dg/function_optimize_5.f90 b/gcc/testsuite/gfortran.dg/function_optimize_5.f90
index 427b12684dc..b563e932b1f 100644
--- a/gcc/testsuite/gfortran.dg/function_optimize_5.f90
+++ b/gcc/testsuite/gfortran.dg/function_optimize_5.f90
@@ -1,5 +1,5 @@
! { dg-do compile }
-! { dg-options "-ffrontend-optimize -Wfunction-elimination" }
+! { dg-options "-ffrontend-optimize -finline-matmul-limit=0 -Wfunction-elimination" }
! Check the -ffrontend-optimize (in the absence of -O) and
! -Wfunction-elimination options.
program main
diff --git a/gcc/testsuite/gfortran.dg/function_optimize_7.f90 b/gcc/testsuite/gfortran.dg/function_optimize_7.f90
index e0c404b6a2a..115a5a66bbb 100644
--- a/gcc/testsuite/gfortran.dg/function_optimize_7.f90
+++ b/gcc/testsuite/gfortran.dg/function_optimize_7.f90
@@ -1,5 +1,5 @@
! { dg-do compile }
-! { dg-options "-O -fdump-tree-original -Warray-temporaries" }
+! { dg-options "-O -fdump-tree-original -Warray-temporaries -finline-matmul-limit=0" }
subroutine xx(n, m, a, b, c, d, x, z, i, s_in, s_out)
implicit none
integer, intent(in) :: n, m
diff --git a/gcc/testsuite/gfortran.dg/inline_matmul_1.f90 b/gcc/testsuite/gfortran.dg/inline_matmul_1.f90
new file mode 100644
index 00000000000..266aad073d4
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/inline_matmul_1.f90
@@ -0,0 +1,152 @@
+! { dg-do run }
+! { dg-options "-ffrontend-optimize -fdump-tree-original -Wrealloc-lhs" }
+! PR 37131 - check basic functionality of inlined matmul, making
+! sure that the library is not called, with and without reallocation.
+
+program main
+ implicit none
+ integer, parameter :: offset = -2
+ real, dimension(3,2) :: a
+ real, dimension(2,4) :: b
+ real, dimension(3,4) :: c
+ real, dimension(3,4) :: cres
+ real, dimension(:,:), allocatable :: c_alloc
+ integer, parameter :: a1_lower_p = 1 + offset, a1_upper_p = size(a,1) + offset
+ integer, parameter :: a2_lower_p = 1 + offset, a2_upper_p = size(a,2) + offset
+ integer, parameter :: b1_lower_p = 1 + offset, b1_upper_p = size(b,1) + offset
+ integer, parameter :: b2_lower_p = 1 + offset, b2_upper_p = size(b,2) + offset
+ integer, parameter :: c1_lower_p = 1 + offset, c1_upper_p = size(c,1) + offset
+ integer, parameter :: c2_lower_p = 1 + offset, c2_upper_p = size(c,2) + offset
+ real, dimension(a1_lower_p:a1_upper_p, a2_lower_p:a2_upper_p) :: ap
+ real, dimension(b1_lower_p:b1_upper_p, b2_lower_p:b2_upper_p) :: bp
+ real, dimension(c1_lower_p:c1_upper_p, c2_lower_p:c2_upper_p) :: cp
+ real, dimension(4,8,4) :: f, fresult
+ integer :: eight = 8, two = 2
+
+ type foo
+ real :: a
+ integer :: i
+ end type foo
+
+ type(foo), dimension(3,2) :: afoo
+ type(foo), dimension(2,4) :: bfoo
+ type(foo), dimension(3,4) :: cfoo
+
+ data a / 2., -3., 5., -7., 11., -13./
+ data b /17., -23., 29., -31., 37., -39., 41., -47./
+ data cres /195., -304., 384., 275., -428., 548., 347., -540., 692., 411., -640., 816./
+ data fresult / &
+ 0., 0., 195., 0., 0., 17., 0., 0., 0., -23.,-304., 0., 0., 0., 0., 0., &
+ 0., 0., 384., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., &
+ 2., 0., 275., 0., -3., 29., 0., 0., 5., -31.,-428., 0., 0., 0., 0., 0., &
+ 0., 0., 548., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., &
+ -7., 0., 347., 0., 11., 37., 0., 0., -13., -39.,-540., 0., 0., 0., 0., 0., &
+ 0., 0., 692., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., &
+ 0., 0., 411., 0., 0., 41., 0., 0., 0., -47.,-640., 0., 0., 0., 0., 0., &
+ 0., 0., 816., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0./
+
+ integer :: a1 = size(a,1), a2 = size(a,2)
+ integer :: b1 = size(b,1), b2 = size(b,2)
+ integer :: c1 = size(c,1), c2 = size(c,2)
+
+ integer :: a1_lower, a1_upper, a2_lower, a2_upper
+ integer :: b1_lower, b1_upper, b2_lower, b2_upper
+ integer :: c1_lower, c1_upper, c2_lower, c2_upper
+
+ a1_lower = 1 + offset ; a1_upper = a1 + offset
+ a2_lower = 1 + offset ; a2_upper = a2 + offset
+ b1_lower = 1 + offset ; b1_upper = b1 + offset
+ b2_lower = 1 + offset ; b2_upper = b2 + offset
+ c1_lower = 1 + offset ; c1_upper = c1 + offset
+ c2_lower = 1 + offset ; c2_upper = c2 + offset
+
+ c = matmul(a,b)
+ if (sum(abs(c-cres))>1e-4) call abort
+
+ c_alloc = matmul(a,b) ! { dg-warning "Code for reallocating the allocatable array" }
+ if (sum(abs(c_alloc-cres))>1e-4) call abort
+ if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort
+ deallocate(c_alloc)
+
+ allocate(c_alloc(4,4))
+ c_alloc = matmul(a,b) ! { dg-warning "Code for reallocating the allocatable array" }
+ if (sum(abs(c_alloc-cres))>1e-4) call abort
+ if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort
+ deallocate(c_alloc)
+
+ allocate(c_alloc(3,3))
+ c_alloc = matmul(a,b) ! { dg-warning "Code for reallocating the allocatable array" }
+ if (sum(abs(c_alloc-cres))>1e-4) call abort
+ if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort
+
+ c_alloc = 42.
+ c_alloc(:,:) = matmul(a,b)
+ if (sum(abs(c_alloc-cres))>1e-4) call abort
+ if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort
+
+ deallocate(c_alloc)
+
+ ap = a
+ bp = b
+ cp = matmul(ap, bp)
+ if (sum(abs(cp-cres)) > 1e-4) call abort
+
+ f = 0
+ f(1,1:3,2:3) = a
+ f(2,2:3,:) = b
+ c = matmul(f(1,1:3,2:3), f(2,2:3,:))
+ if (sum(abs(c-cres))>1e-4) call abort
+
+ f(3,1:eight:2,:) = matmul(a, b)
+ if (sum(abs(f(3,1:eight:2,:)-cres))>1e-4) call abort
+
+ afoo%a = a
+ bfoo%a = b
+ cfoo%a = matmul(afoo%a, bfoo%a)
+
+ if (sum(abs(cfoo%a-cres)) > 1e-4) call abort
+
+ block
+ real :: aa(a1, a2), bb(b1, b2), cc(c1, c2)
+ real :: am(a1_lower:a1_upper, a2_lower:a2_upper)
+ real :: bm(b1_lower:b1_upper, b2_lower:b2_upper)
+ real :: cm(c1_lower:c1_upper, c2_lower:c2_upper)
+
+ aa = a
+ bb = b
+ am = a
+ bm = b
+
+ cc = matmul(aa,bb)
+ if (sum(cc-cres)>1e-4) call abort
+ c_alloc = matmul(aa,bb) ! { dg-warning "Code for reallocating the allocatable array" }
+ if (sum(abs(c_alloc-cres))>1e-4) call abort
+ if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort
+ c_alloc = 42.
+ deallocate(c_alloc)
+
+ allocate(c_alloc(4,4))
+ c_alloc = matmul(aa,bb) ! { dg-warning "Code for reallocating the allocatable array" }
+ if (sum(abs(c_alloc-cres))>1e-4) call abort
+ if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort
+ deallocate(c_alloc)
+
+ allocate(c_alloc(3,3))
+ c_alloc = matmul(aa,bb) ! { dg-warning "Code for reallocating the allocatable array" }
+ if (sum(abs(c_alloc-cres))>1e-4) call abort
+ if (any([size(c_alloc,1), size(c_alloc,2)] /= [3,4])) call abort
+ deallocate(c_alloc)
+
+ cm = matmul(am, bm)
+ if (sum(abs(cm-cres)) > 1e-4) call abort
+
+ cm = 42.
+
+ cm(:,:) = matmul(a,bm)
+ if (sum(abs(cm-cres)) > 1e-4) call abort
+
+ end block
+
+end program main
+
+! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "original" } }
diff --git a/gcc/testsuite/gfortran.dg/inline_matmul_2.f90 b/gcc/testsuite/gfortran.dg/inline_matmul_2.f90
new file mode 100644
index 00000000000..6f56f65e0b0
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/inline_matmul_2.f90
@@ -0,0 +1,65 @@
+! { dg-do compile }
+! { dg-options "-ffrontend-optimize -finline-matmul-limit=0 -fdump-tree-original" }
+! PR 37131 - no inlining with -finline-matmul-limit=0
+program main
+ real, dimension(3,2) :: a
+ real, dimension(2,4) :: b
+ real, dimension(3,4) :: c
+ real, dimension(3,4) :: cres
+ real, dimension(:,:), allocatable :: calloc
+ integer :: a1 = size(a,1), a2 = size(a,2)
+ integer :: b1 = size(b,1), b2 = size(b,2)
+ integer :: c1 = size(c,1), c2 = size(c,2)
+
+ data a / 2., -3., 5., -7., 11., -13./
+ data b /17., -23., 29., -31., 37., -39., 41., -47./
+ data cres /195., -304., 384., 275., -428., 548., 347., -540., 692., 411., -640., 816./
+ c = matmul(a,b)
+ if (sum(c-cres)>1e-4) call abort
+
+ calloc = matmul(a,b)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+
+ allocate(calloc(4,4))
+ calloc = matmul(a,b)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+
+ allocate(calloc(3,3))
+ calloc = matmul(a,b)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+
+ block
+ real :: aa(a1, a2), bb(b1, b2), cc(c1, c2)
+ aa = a
+ bb = b
+
+ cc = matmul(aa,bb)
+ if (sum(cc-cres)>1e-4) call abort
+ calloc = matmul(aa,bb)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ calloc = 42.
+ deallocate(calloc)
+
+ allocate(calloc(4,4))
+ calloc = matmul(aa,bb)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+
+ allocate(calloc(3,3))
+ calloc = matmul(aa,bb)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+ end block
+
+end program main
+! { dg-final { scan-tree-dump-times "_gfortran_matmul" 8 "original" } }
+! { dg-final { cleanup-tree-dump "original" } }
diff --git a/gcc/testsuite/gfortran.dg/inline_matmul_3.f90 b/gcc/testsuite/gfortran.dg/inline_matmul_3.f90
new file mode 100644
index 00000000000..4f00c1a2921
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/inline_matmul_3.f90
@@ -0,0 +1,65 @@
+! { dg-do run }
+! { dg-options "-O3 -finline-matmul-limit=2 -fdump-tree-optimized" }
+! PR 37131 - all calls to matmul should be kept
+program main
+ real, dimension(3,2) :: a
+ real, dimension(2,4) :: b
+ real, dimension(3,4) :: c
+ real, dimension(3,4) :: cres
+ real, dimension(:,:), allocatable :: calloc
+ integer :: a1 = size(a,1), a2 = size(a,2)
+ integer :: b1 = size(b,1), b2 = size(b,2)
+ integer :: c1 = size(c,1), c2 = size(c,2)
+
+ data a / 2., -3., 5., -7., 11., -13./
+ data b /17., -23., 29., -31., 37., -39., 41., -47./
+ data cres /195., -304., 384., 275., -428., 548., 347., -540., 692., 411., -640., 816./
+ c = matmul(a,b)
+ if (sum(c-cres)>1e-4) call abort
+
+ calloc = matmul(a,b)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+
+ allocate(calloc(4,4))
+ calloc = matmul(a,b)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+
+ allocate(calloc(3,3))
+ calloc = matmul(a,b)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+
+ block
+ real :: aa(a1, a2), bb(b1, b2), cc(c1, c2)
+ aa = a
+ bb = b
+
+ cc = matmul(aa,bb)
+ if (sum(cc-cres)>1e-4) call abort
+ calloc = matmul(aa,bb)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ calloc = 42.
+ deallocate(calloc)
+
+ allocate(calloc(4,4))
+ calloc = matmul(aa,bb)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+
+ allocate(calloc(3,3))
+ calloc = matmul(aa,bb)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+ end block
+
+end program main
+! { dg-final { scan-tree-dump-times "_gfortran_matmul" 8 "optimized" } }
+! { dg-final { cleanup-tree-dump "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/inline_matmul_4.f90 b/gcc/testsuite/gfortran.dg/inline_matmul_4.f90
new file mode 100644
index 00000000000..b2253b14daa
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/inline_matmul_4.f90
@@ -0,0 +1,68 @@
+! { dg-do run }
+! { dg-options "-O3 -finline-matmul-limit=10 -fdump-tree-optimized -fdump-tree-original" }
+! PR 37131 - all calls to matmul should be optimized away with -O3
+! and the high limit.
+program main
+ real, dimension(3,2) :: a
+ real, dimension(2,4) :: b
+ real, dimension(3,4) :: c
+ real, dimension(3,4) :: cres
+ real, dimension(:,:), allocatable :: calloc
+ integer :: a1 = size(a,1), a2 = size(a,2)
+ integer :: b1 = size(b,1), b2 = size(b,2)
+ integer :: c1 = size(c,1), c2 = size(c,2)
+
+ data a / 2., -3., 5., -7., 11., -13./
+ data b /17., -23., 29., -31., 37., -39., 41., -47./
+ data cres /195., -304., 384., 275., -428., 548., 347., -540., 692., 411., -640., 816./
+ c = matmul(a,b)
+ if (sum(c-cres)>1e-4) call abort
+
+ calloc = matmul(a,b)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+
+ allocate(calloc(4,4))
+ calloc = matmul(a,b)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+
+ allocate(calloc(3,3))
+ calloc = matmul(a,b)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+
+ block
+ real :: aa(a1, a2), bb(b1, b2), cc(c1, c2)
+ aa = a
+ bb = b
+
+ cc = matmul(aa,bb)
+ if (sum(cc-cres)>1e-4) call abort
+ calloc = matmul(aa,bb)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ calloc = 42.
+ deallocate(calloc)
+
+ allocate(calloc(4,4))
+ calloc = matmul(aa,bb)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+
+ allocate(calloc(3,3))
+ calloc = matmul(aa,bb)
+ if (sum(calloc-cres)>1e-4) call abort
+ if (any([size(calloc,1), size(calloc,2)] /= [3,4])) call abort
+ deallocate(calloc)
+ end block
+
+end program main
+! { dg-final { scan-tree-dump-times "_gfortran_matmul" 4 "original" } }
+! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }
+! { dg-final { cleanup-tree-dump "original" } }
+! { dg-final { cleanup-tree-dump "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/inline_matmul_5.f90 b/gcc/testsuite/gfortran.dg/inline_matmul_5.f90
new file mode 100644
index 00000000000..bed1cb6a0af
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/inline_matmul_5.f90
@@ -0,0 +1,13 @@
+! { dg-do run }
+! { dg-options "-ffrontend-optimize" }
+program main
+
+ real, dimension(2,2) :: a,b,c
+
+ data a /2., 4., 8., 16. /
+ data b /3., 9., 27., 81./
+
+ c = matmul(a,b)
+ a = matmul(a,b)
+ if (any(a /= c)) call abort
+end program main
diff --git a/gcc/testsuite/gfortran.dg/inline_matmul_6.f90 b/gcc/testsuite/gfortran.dg/inline_matmul_6.f90
new file mode 100644
index 00000000000..54b0ecc230e
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/inline_matmul_6.f90
@@ -0,0 +1,49 @@
+! { dg-do run }
+! { dg-options "-ffrontend-optimize -fdump-tree-original" }
+! PR 37131 - check rank1/rank2 and rank2/rank1 cases for inline matmul.
+
+module foo
+ implicit none
+contains
+ subroutine a1b2(a,b,c)
+ real, dimension(:), intent(in) :: a
+ real, dimension(:,:), intent(in) :: b
+ real, dimension(:), intent(out) :: c
+
+ c = matmul(a,b)
+ end subroutine a1b2
+
+ subroutine a2b1(a,b,c)
+ real, dimension(:,:), intent(in) :: a
+ real, dimension(:), intent(in) :: b
+ real, dimension(:), intent(out) :: c
+
+ c = matmul(a,b)
+ end subroutine a2b1
+end module foo
+
+program main
+ use foo
+ implicit none
+ real, dimension(3) :: a1
+ real, dimension(3,2) :: b1
+ real, dimension(2) :: c1
+
+ real, dimension(3,2) :: a2
+ real, dimension(2) :: b2
+ real, dimension(3) :: c2
+
+ data a1 /17., -23., 29./
+ data b1 / 2., -3., 5., -7., 11., -13./
+
+ data b2/-2.,5./
+
+ a2 = -b1
+ call a1b2(a1,b1,c1)
+ if (any(abs(c1 - (/248., -749./)) > 1e-3)) call abort
+ call a2b1(a2,b2,c2)
+ if (any(abs(c2 - (/39., -61., 75./)) > 1e-3)) call abort
+end program main
+
+! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "original" } }
+! { dg-final { cleanup-tree-dump "original" } }