Commit d88412fc by Thomas Koenig

re PR fortran/82471 (Reorder loop for unfavorable index ordering in DO CONCURRENT and FORALL)

2017-11-05  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/82471
	* lang.opt (ffrontend-loop-interchange): New option.
	(Wfrontend-loop-interchange): New option.
	* options.c (gfc_post_options): Handle ffrontend-loop-interchange.
	* frontend-passes.c (gfc_run_passes): Run
	optimize_namespace if flag_frontend_optimize or
	flag_frontend_loop_interchange are set.
	(optimize_namespace): Run functions according to flags set;
	also call index_interchange.
	(ind_type): New function.
	(has_var): New function.
	(index_cost): New function.
	(loop_comp): New function.

2017-11-05  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/82471
	* gfortran.dg/loop_interchange_1.f90: New test.

From-SVN: r254430
parent 5233d455
2017-11-05 Thomas Koenig <tkoenig@gcc.gnu.org>
PR fortran/82471
* lang.opt (ffrontend-loop-interchange): New option.
(Wfrontend-loop-interchange): New option.
* options.c (gfc_post_options): Handle ffrontend-loop-interchange.
* frontend-passes.c (gfc_run_passes): Run
optimize_namespace if flag_frontend_optimize or
flag_frontend_loop_interchange are set.
(optimize_namespace): Run functions according to flags set;
also call index_interchange.
(ind_type): New function.
(has_var): New function.
(index_cost): New function.
(loop_comp): New function.
2017-11-05 Paul Thomas <pault@gcc.gnu.org> 2017-11-05 Paul Thomas <pault@gcc.gnu.org>
PR fortran/78641 PR fortran/78641
......
...@@ -55,6 +55,7 @@ static gfc_expr* check_conjg_transpose_variable (gfc_expr *, bool *, ...@@ -55,6 +55,7 @@ static gfc_expr* check_conjg_transpose_variable (gfc_expr *, bool *,
bool *); bool *);
static bool has_dimen_vector_ref (gfc_expr *); static bool has_dimen_vector_ref (gfc_expr *);
static int matmul_temp_args (gfc_code **, int *,void *data); static int matmul_temp_args (gfc_code **, int *,void *data);
static int index_interchange (gfc_code **, int*, void *);
#ifdef CHECKING_P #ifdef CHECKING_P
static void check_locus (gfc_namespace *); static void check_locus (gfc_namespace *);
...@@ -155,9 +156,11 @@ gfc_run_passes (gfc_namespace *ns) ...@@ -155,9 +156,11 @@ gfc_run_passes (gfc_namespace *ns)
check_locus (ns); check_locus (ns);
#endif #endif
if (flag_frontend_optimize || flag_frontend_loop_interchange)
optimize_namespace (ns);
if (flag_frontend_optimize) if (flag_frontend_optimize)
{ {
optimize_namespace (ns);
optimize_reduction (ns); optimize_reduction (ns);
if (flag_dump_fortran_optimized) if (flag_dump_fortran_optimized)
gfc_dump_parse_tree (ns, stdout); gfc_dump_parse_tree (ns, stdout);
...@@ -1350,7 +1353,9 @@ simplify_io_impl_do (gfc_code **code, int *walk_subtrees, ...@@ -1350,7 +1353,9 @@ simplify_io_impl_do (gfc_code **code, int *walk_subtrees,
return 0; return 0;
} }
/* Optimize a namespace, including all contained namespaces. */ /* Optimize a namespace, including all contained namespaces.
flag_frontend_optimize and flag_fronend_loop_interchange are
handled separately. */
static void static void
optimize_namespace (gfc_namespace *ns) optimize_namespace (gfc_namespace *ns)
...@@ -1363,28 +1368,35 @@ optimize_namespace (gfc_namespace *ns) ...@@ -1363,28 +1368,35 @@ optimize_namespace (gfc_namespace *ns)
in_assoc_list = false; in_assoc_list = false;
in_omp_workshare = false; in_omp_workshare = false;
gfc_code_walker (&ns->code, simplify_io_impl_do, dummy_expr_callback, NULL); if (flag_frontend_optimize)
gfc_code_walker (&ns->code, convert_do_while, dummy_expr_callback, NULL);
gfc_code_walker (&ns->code, convert_elseif, dummy_expr_callback, NULL);
gfc_code_walker (&ns->code, cfe_code, cfe_expr_0, NULL);
gfc_code_walker (&ns->code, optimize_code, optimize_expr, NULL);
if (flag_inline_matmul_limit != 0)
{ {
bool found; gfc_code_walker (&ns->code, simplify_io_impl_do, dummy_expr_callback, NULL);
do gfc_code_walker (&ns->code, convert_do_while, dummy_expr_callback, NULL);
gfc_code_walker (&ns->code, convert_elseif, dummy_expr_callback, NULL);
gfc_code_walker (&ns->code, cfe_code, cfe_expr_0, NULL);
gfc_code_walker (&ns->code, optimize_code, optimize_expr, NULL);
if (flag_inline_matmul_limit != 0)
{ {
found = false; bool found;
gfc_code_walker (&ns->code, matmul_to_var_code, matmul_to_var_expr, do
(void *) &found); {
} found = false;
while (found); gfc_code_walker (&ns->code, matmul_to_var_code, matmul_to_var_expr,
(void *) &found);
}
while (found);
gfc_code_walker (&ns->code, matmul_temp_args, dummy_expr_callback, gfc_code_walker (&ns->code, matmul_temp_args, dummy_expr_callback,
NULL); NULL);
gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback, gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback,
NULL); NULL);
}
} }
if (flag_frontend_loop_interchange)
gfc_code_walker (&ns->code, index_interchange, dummy_expr_callback,
NULL);
/* BLOCKs are handled in the expression walker below. */ /* BLOCKs are handled in the expression walker below. */
for (ns = ns->contained; ns; ns = ns->sibling) for (ns = ns->contained; ns; ns = ns->sibling)
{ {
...@@ -4225,6 +4237,170 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees, ...@@ -4225,6 +4237,170 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
return 0; return 0;
} }
/* Code for index interchange for loops which are grouped together in DO
CONCURRENT or FORALL statements. This is currently only applied if the
iterations are grouped together in a single statement.
For this transformation, it is assumed that memory access in strides is
expensive, and that loops which access later indices (which access memory
in bigger strides) should be moved to the first loops.
For this, a loop over all the statements is executed, counting the times
that the loop iteration values are accessed in each index. The loop
indices are then sorted to minimize access to later indices from inner
loops. */
/* Type for holding index information. */
typedef struct {
gfc_symbol *sym;
gfc_forall_iterator *fa;
int num;
int n[GFC_MAX_DIMENSIONS];
} ind_type;
/* Callback function to determine if an expression is the
corresponding variable. */
static int
has_var (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED, void *data)
{
gfc_expr *expr = *e;
gfc_symbol *sym;
if (expr->expr_type != EXPR_VARIABLE)
return 0;
sym = (gfc_symbol *) data;
return sym == expr->symtree->n.sym;
}
/* Callback function to calculate the cost of a certain index. */
static int
index_cost (gfc_expr **e, int *walk_subtrees ATTRIBUTE_UNUSED,
void *data)
{
ind_type *ind;
gfc_expr *expr;
gfc_array_ref *ar;
gfc_ref *ref;
int i,j;
expr = *e;
if (expr->expr_type != EXPR_VARIABLE)
return 0;
ar = NULL;
for (ref = expr->ref; ref; ref = ref->next)
{
if (ref->type == REF_ARRAY)
{
ar = &ref->u.ar;
break;
}
}
if (ar == NULL || ar->type != AR_ELEMENT)
return 0;
ind = (ind_type *) data;
for (i = 0; i < ar->dimen; i++)
{
for (j=0; ind[j].sym != NULL; j++)
{
if (gfc_expr_walker (&ar->start[i], has_var, (void *) (ind[j].sym)))
ind[j].n[i]++;
}
}
return 0;
}
/* Callback function for qsort, to sort the loop indices. */
static int
loop_comp (const void *e1, const void *e2)
{
const ind_type *i1 = (const ind_type *) e1;
const ind_type *i2 = (const ind_type *) e2;
int i;
for (i=GFC_MAX_DIMENSIONS-1; i >= 0; i--)
{
if (i1->n[i] != i2->n[i])
return i1->n[i] - i2->n[i];
}
/* All other things being equal, let's not change the ordering. */
return i2->num - i1->num;
}
/* Main function to do the index interchange. */
static int
index_interchange (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
void *data ATTRIBUTE_UNUSED)
{
gfc_code *co;
co = *c;
int n_iter;
gfc_forall_iterator *fa;
ind_type *ind;
int i, j;
if (co->op != EXEC_FORALL && co->op != EXEC_DO_CONCURRENT)
return 0;
n_iter = 0;
for (fa = co->ext.forall_iterator; fa; fa = fa->next)
n_iter ++;
/* Nothing to reorder. */
if (n_iter < 2)
return 0;
ind = XALLOCAVEC (ind_type, n_iter + 1);
i = 0;
for (fa = co->ext.forall_iterator; fa; fa = fa->next)
{
ind[i].sym = fa->var->symtree->n.sym;
ind[i].fa = fa;
for (j=0; j<GFC_MAX_DIMENSIONS; j++)
ind[i].n[j] = 0;
ind[i].num = i;
i++;
}
ind[n_iter].sym = NULL;
ind[n_iter].fa = NULL;
gfc_code_walker (c, gfc_dummy_code_callback, index_cost, (void *) ind);
qsort ((void *) ind, n_iter, sizeof (ind_type), loop_comp);
/* Do the actual index interchange. */
co->ext.forall_iterator = fa = ind[0].fa;
for (i=1; i<n_iter; i++)
{
fa->next = ind[i].fa;
fa = fa->next;
}
fa->next = NULL;
if (flag_warn_frontend_loop_interchange)
{
for (i=1; i<n_iter; i++)
{
if (ind[i-1].num > ind[i].num)
{
gfc_warning (OPT_Wfrontend_loop_interchange,
"Interchanging loops at %L", &co->loc);
break;
}
}
}
return 0;
}
#define WALK_SUBEXPR(NODE) \ #define WALK_SUBEXPR(NODE) \
do \ do \
{ \ { \
......
...@@ -149,8 +149,9 @@ and warnings}. ...@@ -149,8 +149,9 @@ and warnings}.
-Wdo-subscript -Wfunction-elimination -Wimplicit-interface @gol -Wdo-subscript -Wfunction-elimination -Wimplicit-interface @gol
-Wimplicit-procedure -Wintrinsic-shadow -Wuse-without-only -Wintrinsics-std @gol -Wimplicit-procedure -Wintrinsic-shadow -Wuse-without-only -Wintrinsics-std @gol
-Wline-truncation -Wno-align-commons -Wno-tabs -Wreal-q-constant @gol -Wline-truncation -Wno-align-commons -Wno-tabs -Wreal-q-constant @gol
-Wsurprising -Wunderflow -Wunused-parameter -Wrealloc-lhs -Wrealloc-lhs-all @gol -Wsurprising -Wunderflow -Wunused-parameter -Wrealloc-lhs @gol
-Wtarget-lifetime -fmax-errors=@var{n} -fsyntax-only -pedantic -pedantic-errors -Wrealloc-lhs-all -Wfrontend-loop-interchange -Wtarget-lifetime @gol
-fmax-errors=@var{n} -fsyntax-only -pedantic -pedantic-errors @gol
} }
@item Debugging Options @item Debugging Options
...@@ -183,6 +184,7 @@ and warnings}. ...@@ -183,6 +184,7 @@ and warnings}.
-fbounds-check -fcheck-array-temporaries @gol -fbounds-check -fcheck-array-temporaries @gol
-fcheck=@var{<all|array-temps|bounds|do|mem|pointer|recursion>} @gol -fcheck=@var{<all|array-temps|bounds|do|mem|pointer|recursion>} @gol
-fcoarray=@var{<none|single|lib>} -fexternal-blas -ff2c -fcoarray=@var{<none|single|lib>} -fexternal-blas -ff2c
-ffrontend-loop-interchange @gol
-ffrontend-optimize @gol -ffrontend-optimize @gol
-finit-character=@var{n} -finit-integer=@var{n} -finit-local-zero @gol -finit-character=@var{n} -finit-integer=@var{n} -finit-local-zero @gol
-finit-derived @gol -finit-derived @gol
...@@ -910,6 +912,13 @@ Enables some warning options for usages of language features which ...@@ -910,6 +912,13 @@ Enables some warning options for usages of language features which
may be problematic. This currently includes @option{-Wcompare-reals}, may be problematic. This currently includes @option{-Wcompare-reals},
@option{-Wunused-parameter} and @option{-Wdo-subscript}. @option{-Wunused-parameter} and @option{-Wdo-subscript}.
@item -Wfrontend-loop-interchange
@opindex @code{Wfrontend-loop-interchange}
@cindex warnings, loop interchange
@cindex loop interchange, warning
Enable warning for loop interchanges performed by the
@option{-ffrontend-loop-interchange} option.
@item -Wimplicit-interface @item -Wimplicit-interface
@opindex @code{Wimplicit-interface} @opindex @code{Wimplicit-interface}
@cindex warnings, implicit interface @cindex warnings, implicit interface
...@@ -1782,6 +1791,14 @@ expressions, removing unnecessary calls to @code{TRIM} in comparisons ...@@ -1782,6 +1791,14 @@ expressions, removing unnecessary calls to @code{TRIM} in comparisons
and assignments and replacing @code{TRIM(a)} with and assignments and replacing @code{TRIM(a)} with
@code{a(1:LEN_TRIM(a))}. It can be deselected by specifying @code{a(1:LEN_TRIM(a))}. It can be deselected by specifying
@option{-fno-frontend-optimize}. @option{-fno-frontend-optimize}.
@item -ffrontend-loop-interchange
@opindex @code{frontend-loop-interchange}
@cindex loop interchange, Fortran
Attempt to interchange loops in the Fortran front end where
profitable. Enabled by default by any @option{-O} option.
At the moment, this option only affects @code{FORALL} and
@code{DO CONCURRENT} statements with several forall triplets.
@end table @end table
@xref{Code Gen Options,,Options for Code Generation Conventions, @xref{Code Gen Options,,Options for Code Generation Conventions,
......
...@@ -245,6 +245,10 @@ Wextra ...@@ -245,6 +245,10 @@ Wextra
Fortran Warning Fortran Warning
; Documented in common ; Documented in common
Wfrontend-loop-interchange
Fortran Var(flag_warn_frontend_loop_interchange)
Warn if loops have been interchanged.
Wfunction-elimination Wfunction-elimination
Fortran Warning Var(warn_function_elimination) Fortran Warning Var(warn_function_elimination)
Warn about function call elimination. Warn about function call elimination.
...@@ -548,6 +552,10 @@ ffree-line-length- ...@@ -548,6 +552,10 @@ ffree-line-length-
Fortran RejectNegative Joined UInteger Var(flag_free_line_length) Init(132) Fortran RejectNegative Joined UInteger Var(flag_free_line_length) Init(132)
-ffree-line-length-<n> Use n as character line width in free mode. -ffree-line-length-<n> Use n as character line width in free mode.
ffrontend-loop-interchange
Fortran Var(flag_frontend_loop_interchange) Init(-1)
Try to interchange loops if profitable.
ffrontend-optimize ffrontend-optimize
Fortran Var(flag_frontend_optimize) Init(-1) Fortran Var(flag_frontend_optimize) Init(-1)
Enable front end optimization. Enable front end optimization.
......
...@@ -417,6 +417,11 @@ gfc_post_options (const char **pfilename) ...@@ -417,6 +417,11 @@ gfc_post_options (const char **pfilename)
if (flag_frontend_optimize == -1) if (flag_frontend_optimize == -1)
flag_frontend_optimize = optimize; flag_frontend_optimize = optimize;
/* Same for front end loop interchange. */
if (flag_frontend_loop_interchange == -1)
flag_frontend_loop_interchange = optimize;
if (flag_max_array_constructor < 65535) if (flag_max_array_constructor < 65535)
flag_max_array_constructor = 65535; flag_max_array_constructor = 65535;
......
2017-11-05 Thomas Koenig <tkoenig@gcc.gnu.org>
PR fortran/82471
* gfortran.dg/loop_interchange_1.f90: New test.
2017-11-05 Paul Thomas <pault@gcc.gnu.org> 2017-11-05 Paul Thomas <pault@gcc.gnu.org>
PR fortran/78641 PR fortran/78641
......
! { dg-do compile }
! { dg-additional-options "-O -Wfrontend-loop-interchange" }
PROGRAM TEST_DO_SPEED
IMPLICIT NONE
REAL, ALLOCATABLE :: A(:,:,:), B(:,:,:), C(:,:,:)
REAL :: TIC
INTEGER :: T0, T1, T2
INTEGER :: I, J, K
INTEGER, PARAMETER :: L = 512, M = 512, N = 512
ALLOCATE( A(L,M,N), B(L,M,N), C(L,M,N) )
CALL RANDOM_NUMBER(A)
CALL RANDOM_NUMBER(B)
CALL SYSTEM_CLOCK( T0, TIC)
DO CONCURRENT( K=1:N, J=1:M, I=1:L) ! { dg-warning "Interchanging loops" }
C(I,J,K) = A(I,J,K) +B(I,J,K)
END DO
END
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment