Commit c42cfb5c by Cesar Philippidis Committed by Cesar Philippidis

re PR lto/70289 ([openacc] ICE in input_varpool_node)

	gcc/
	PR lto/70289
	PR ipa/70348
	PR tree-optimization/70373
	PR middle-end/70533
	PR middle-end/70534
	PR middle-end/70535
	* gimplify.c (gimplify_adjust_omp_clauses): Add or adjust data
	clauses for acc parallel reductions as necessary.  Error on those
	that are private.
	* omp-low.c (scan_sharing_clauses): Don't install variables which
	are used in acc parallel reductions.
	(lower_rec_input_clauses): Remove dead code.
	(lower_oacc_reductions): Add support for reference reductions.
	(lower_reduction_clauses): Remove dead code.
	(lower_omp_target): Don't remap variables appearing in acc parallel
	reductions.
	* tree.h (OMP_CLAUSE_MAP_IN_REDUCTION): New macro.

	gcc/testsuite/
	* c-c++-common/goacc/reduction-5.c: New test.
	* c-c++-common/goacc/reduction-promotions.c: New test.
	* gfortran.dg/goacc/reduction-3.f95: New test.
	* gfortran.dg/goacc/reduction-promotions.f90: New test.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gang-np-1.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gw-np-1.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gwv-np-1.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gwv-np-2.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gwv-np-3.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gwv-np-4.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/loop-reduction-vector-p-1.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/loop-reduction-vector-p-2.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/loop-reduction-worker-p-1.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/loop-reduction-wv-p-1.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/loop-reduction-wv-p-2.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/loop-reduction-wv-p-3.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-1.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-2.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-3.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c: New
	test.
	* testsuite/libgomp.oacc-c-c++-common/par-reduction-1.c: Add test
	coverage.
	* testsuite/libgomp.oacc-c-c++-common/par-reduction-2.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/parallel-reduction.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/pr70289.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/pr70373.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/reduction-1.c: Add test
	coverage.
	* testsuite/libgomp.oacc-c-c++-common/reduction-2.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/reduction-3.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/reduction-4.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/reduction-5.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/reduction-6.c: New test.
	* testsuite/libgomp.oacc-c-c++-common/reduction.h: New test.
	* testsuite/libgomp.oacc-fortran/parallel-reduction.f90: New test.
	* testsuite/libgomp.oacc-fortran/pr70289.f90: New test.
	* testsuite/libgomp.oacc-fortran/reduction-1.f90: Add test coverage.
	* testsuite/libgomp.oacc-fortran/reduction-2.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/reduction-3.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/reduction-4.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/reduction-5.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/reduction-6.f90: Likewise.
	* testsuite/libgomp.oacc-fortran/reduction-7.f90: New test.

From-SVN: r234840
parent 51a4b0f1
2016-04-08 Cesar Philippidis <cesar@codesourcery.com>
PR lto/70289
PR ipa/70348
PR tree-optimization/70373
PR middle-end/70533
PR middle-end/70534
PR middle-end/70535
* gimplify.c (gimplify_adjust_omp_clauses): Add or adjust data
clauses for acc parallel reductions as necessary. Error on those
that are private.
* omp-low.c (scan_sharing_clauses): Don't install variables which
are used in acc parallel reductions.
(lower_rec_input_clauses): Remove dead code.
(lower_oacc_reductions): Add support for reference reductions.
(lower_reduction_clauses): Remove dead code.
(lower_omp_target): Don't remap variables appearing in acc parallel
reductions.
* tree.h (OMP_CLAUSE_MAP_IN_REDUCTION): New macro.
2016-04-08 Jakub Jelinek <jakub@redhat.com>
PR middle-end/70593
......
......@@ -7987,6 +7987,34 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, gimple_seq body, tree *list_p,
break;
}
decl = OMP_CLAUSE_DECL (c);
/* Data clasues associated with acc parallel reductions must be
compatible with present_or_copy. Warn and adjust the clause
if that is not the case. */
if (ctx->region_type == ORT_ACC_PARALLEL)
{
tree t = DECL_P (decl) ? decl : TREE_OPERAND (decl, 0);
n = NULL;
if (DECL_P (t))
n = splay_tree_lookup (ctx->variables, (splay_tree_key) t);
if (n && (n->value & GOVD_REDUCTION))
{
enum gomp_map_kind kind = OMP_CLAUSE_MAP_KIND (c);
OMP_CLAUSE_MAP_IN_REDUCTION (c) = 1;
if ((kind & GOMP_MAP_TOFROM) != GOMP_MAP_TOFROM
&& kind != GOMP_MAP_FORCE_PRESENT
&& kind != GOMP_MAP_POINTER)
{
warning_at (OMP_CLAUSE_LOCATION (c), 0,
"incompatible data clause with reduction "
"on %qE; promoting to present_or_copy",
DECL_NAME (t));
OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_TOFROM);
}
}
}
if (!DECL_P (decl))
{
if ((ctx->region_type & ORT_TARGET) != 0
......@@ -8118,6 +8146,33 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, gimple_seq body, tree *list_p,
case OMP_CLAUSE_REDUCTION:
decl = OMP_CLAUSE_DECL (c);
/* OpenACC reductions need a present_or_copy data clause.
Add one if necessary. Error is the reduction is private. */
if (ctx->region_type == ORT_ACC_PARALLEL)
{
n = splay_tree_lookup (ctx->variables, (splay_tree_key) decl);
if (n->value & (GOVD_PRIVATE | GOVD_FIRSTPRIVATE))
error_at (OMP_CLAUSE_LOCATION (c), "invalid private "
"reduction on %qE", DECL_NAME (decl));
else if ((n->value & GOVD_MAP) == 0)
{
tree next = OMP_CLAUSE_CHAIN (c);
tree nc = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_MAP);
OMP_CLAUSE_SET_MAP_KIND (nc, GOMP_MAP_TOFROM);
OMP_CLAUSE_DECL (nc) = decl;
OMP_CLAUSE_CHAIN (c) = nc;
lang_hooks.decls.omp_finish_clause (nc, pre_p);
while (1)
{
OMP_CLAUSE_MAP_IN_REDUCTION (nc) = 1;
if (OMP_CLAUSE_CHAIN (nc) == NULL)
break;
nc = OMP_CLAUSE_CHAIN (nc);
}
OMP_CLAUSE_CHAIN (nc) = next;
n->value |= GOVD_MAP;
}
}
if (DECL_P (decl)
&& omp_shared_to_firstprivate_optimizable_decl_p (decl))
omp_mark_stores (gimplify_omp_ctxp->outer_context, decl);
......
......@@ -2122,7 +2122,8 @@ scan_sharing_clauses (tree clauses, omp_context *ctx,
else
install_var_field (decl, true, 3, ctx,
base_pointers_restrict);
if (is_gimple_omp_offloaded (ctx->stmt))
if (is_gimple_omp_offloaded (ctx->stmt)
&& !OMP_CLAUSE_MAP_IN_REDUCTION (c))
install_var_local (decl, ctx);
}
}
......@@ -4839,7 +4840,7 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist,
gimplify_assign (ptr, x, ilist);
}
}
else if (is_reference (var) && !is_oacc_parallel (ctx))
else if (is_reference (var))
{
/* For references that are being privatized for Fortran,
allocate new backing storage for the new pointer
......@@ -5575,7 +5576,8 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
tree orig = OMP_CLAUSE_DECL (c);
tree var = maybe_lookup_decl (orig, ctx);
tree ref_to_res = NULL_TREE;
tree incoming, outgoing;
tree incoming, outgoing, v1, v2, v3;
bool is_private = false;
enum tree_code rcode = OMP_CLAUSE_REDUCTION_CODE (c);
if (rcode == MINUS_EXPR)
......@@ -5588,7 +5590,6 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
if (!var)
var = orig;
gcc_assert (!is_reference (var));
incoming = outgoing = var;
......@@ -5624,22 +5625,38 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
for (; cls; cls = OMP_CLAUSE_CHAIN (cls))
if (OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_REDUCTION
&& orig == OMP_CLAUSE_DECL (cls))
goto has_outer_reduction;
{
incoming = outgoing = lookup_decl (orig, probe);
goto has_outer_reduction;
}
else if ((OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_FIRSTPRIVATE
|| OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_PRIVATE)
&& orig == OMP_CLAUSE_DECL (cls))
{
is_private = true;
goto do_lookup;
}
}
do_lookup:
/* This is the outermost construct with this reduction,
see if there's a mapping for it. */
if (gimple_code (outer->stmt) == GIMPLE_OMP_TARGET
&& maybe_lookup_field (orig, outer))
&& maybe_lookup_field (orig, outer) && !is_private)
{
ref_to_res = build_receiver_ref (orig, false, outer);
if (is_reference (orig))
ref_to_res = build_simple_mem_ref (ref_to_res);
tree type = TREE_TYPE (var);
if (POINTER_TYPE_P (type))
type = TREE_TYPE (type);
outgoing = var;
incoming = omp_reduction_init_op (loc, rcode, TREE_TYPE (var));
incoming = omp_reduction_init_op (loc, rcode, type);
}
else if (ctx->outer)
incoming = outgoing = lookup_decl (orig, ctx->outer);
else
incoming = outgoing = orig;
......@@ -5649,6 +5666,37 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
if (!ref_to_res)
ref_to_res = integer_zero_node;
if (is_reference (orig))
{
tree type = TREE_TYPE (var);
const char *id = IDENTIFIER_POINTER (DECL_NAME (var));
if (!inner)
{
tree x = create_tmp_var (TREE_TYPE (type), id);
gimplify_assign (var, build_fold_addr_expr (x), fork_seq);
}
v1 = create_tmp_var (type, id);
v2 = create_tmp_var (type, id);
v3 = create_tmp_var (type, id);
gimplify_assign (v1, var, fork_seq);
gimplify_assign (v2, var, fork_seq);
gimplify_assign (v3, var, fork_seq);
var = build_simple_mem_ref (var);
v1 = build_simple_mem_ref (v1);
v2 = build_simple_mem_ref (v2);
v3 = build_simple_mem_ref (v3);
outgoing = build_simple_mem_ref (outgoing);
if (TREE_CODE (incoming) != INTEGER_CST)
incoming = build_simple_mem_ref (incoming);
}
else
v1 = v2 = v3 = var;
/* Determine position in reduction buffer, which may be used
by target. */
enum machine_mode mode = TYPE_MODE (TREE_TYPE (var));
......@@ -5678,20 +5726,20 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
= build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
TREE_TYPE (var), 6, init_code,
unshare_expr (ref_to_res),
var, level, op, off);
v1, level, op, off);
tree fini_call
= build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
TREE_TYPE (var), 6, fini_code,
unshare_expr (ref_to_res),
var, level, op, off);
v2, level, op, off);
tree teardown_call
= build_call_expr_internal_loc (loc, IFN_GOACC_REDUCTION,
TREE_TYPE (var), 6, teardown_code,
ref_to_res, var, level, op, off);
ref_to_res, v3, level, op, off);
gimplify_assign (var, setup_call, &before_fork);
gimplify_assign (var, init_call, &after_fork);
gimplify_assign (var, fini_call, &before_join);
gimplify_assign (v1, setup_call, &before_fork);
gimplify_assign (v2, init_call, &after_fork);
gimplify_assign (v3, fini_call, &before_join);
gimplify_assign (outgoing, teardown_call, &after_join);
}
......@@ -5933,9 +5981,6 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx)
}
}
if (is_gimple_omp_oacc (ctx->stmt))
return;
stmt = gimple_build_call (builtin_decl_explicit (BUILT_IN_GOMP_ATOMIC_START),
0);
gimple_seq_add_stmt (stmt_seqp, stmt);
......@@ -15829,7 +15874,10 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx)
if (!maybe_lookup_field (var, ctx))
continue;
if (offloaded)
/* Don't remap oacc parallel reduction variables, because the
intermediate result must be local to each gang. */
if (offloaded && !(OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP
&& OMP_CLAUSE_MAP_IN_REDUCTION (c)))
{
x = build_receiver_ref (var, true, ctx);
tree new_var = lookup_decl (var, ctx);
......
2016-04-08 Cesar Philippidis <cesar@codesourcery.com>
PR lto/70289
PR ipa/70348
PR tree-optimization/70373
PR middle-end/70533
PR middle-end/70534
PR middle-end/70535
* c-c++-common/goacc/reduction-5.c: New test.
* c-c++-common/goacc/reduction-promotions.c: New test.
* gfortran.dg/goacc/reduction-3.f95: New test.
* gfortran.dg/goacc/reduction-promotions.f90: New test.
2016-04-08 Patrick Palka <ppalka@gcc.gnu.org>
PR c++/70590
......
/* Integer reductions. */
#define n 1000
int
main(void)
{
int v1;
#pragma acc parallel reduction(+:v1) private(v1) /* { dg-error "appears more than once in data clauses" } */
;
#pragma acc parallel reduction(+:v1) firstprivate(v1) /* { dg-error "appears more than once in data clauses" } */
;
return 0;
}
/* Integer reductions. */
#define n 1000
int
main(void)
{
int v1, v2;
#pragma acc parallel reduction(+:v1,v2)
;
#pragma acc parallel reduction(+:v1,v2) copy(v1,v2)
;
#pragma acc parallel reduction(+:v1,v2) pcopy(v1,v2)
;
#pragma acc parallel reduction(+:v1,v2) present(v1,v2)
;
#pragma acc parallel reduction(+:v1,v2) copyin(v1,v2) /* { dg-warning "incompatible data clause with reduction" } */
;
#pragma acc parallel reduction(+:v1,v2) pcopyin(v1,v2) /* { dg-warning "incompatible data clause with reduction" } */
;
#pragma acc parallel reduction(+:v1,v2) copyout(v1,v2) /* { dg-warning "incompatible data clause with reduction" } */
;
#pragma acc parallel reduction(+:v1,v2) pcopyout(v1,v2) /* { dg-warning "incompatible data clause with reduction" } */
;
#pragma acc parallel reduction(+:v1,v2) create(v1,v2) /* { dg-warning "incompatible data clause with reduction" } */
;
#pragma acc parallel reduction(+:v1,v2) pcreate(v1,v2) /* { dg-warning "incompatible data clause with reduction" } */
;
return 0;
}
! { dg-do compile }
subroutine foo (ia1)
integer :: i1, i2
!$acc parallel reduction (+:i1) private(i1) ! { dg-error "invalid private reduction on .i1." }
!$acc end parallel
!$acc parallel reduction (+:i2) firstprivate(i2) ! { dg-error "invalid private reduction on .i2." }
!$acc end parallel
end subroutine foo
! Ensure that each parallel reduction variable as a copy or pcopy
! data clause.
! { dg-additional-options "-fdump-tree-gimple" }
program test
implicit none
integer :: v1, v2
!$acc parallel reduction(+:v1,v2)
!$acc end parallel
!$acc parallel reduction(+:v1,v2) copy(v1,v2)
!$acc end parallel
!$acc parallel reduction(+:v1,v2) pcopy(v1,v2)
!$acc end parallel
!$acc parallel reduction(+:v1,v2) present(v1,v2)
!$acc end parallel
!$acc parallel reduction(+:v1,v2) copyin(v1,v2) ! { dg-warning "incompatible data clause" }
!$acc end parallel
!$acc parallel reduction(+:v1,v2) pcopyin(v1,v2) ! { dg-warning "incompatible data clause" }
!$acc end parallel
!$acc parallel reduction(+:v1,v2) copyout(v1,v2) ! { dg-warning "incompatible data clause" }
!$acc end parallel
!$acc parallel reduction(+:v1,v2) pcopyout(v1,v2) ! { dg-warning "incompatible data clause" }
!$acc end parallel
!$acc parallel reduction(+:v1,v2) create(v1,v2) ! { dg-warning "incompatible data clause" }
!$acc end parallel
!$acc parallel reduction(+:v1,v2) pcreate(v1,v2) ! { dg-warning "incompatible data clause" }
!$acc end parallel
end program test
! { dg-final { scan-tree-dump-times "map.tofrom:v1" 8 "gimple" } }
! { dg-final { scan-tree-dump-times "map.tofrom:v2" 8 "gimple" } }
! { dg-final { scan-tree-dump-times "map.force_tofrom:v1" 1 "gimple" } }
! { dg-final { scan-tree-dump-times "map.force_tofrom:v2" 1 "gimple" } }
! { dg-final { scan-tree-dump-times "map.force_present:v1" 1 "gimple" } }
! { dg-final { scan-tree-dump-times "map.force_present:v2" 1 "gimple" } }
......@@ -1536,6 +1536,9 @@ extern void protected_set_expr_location (tree, location_t);
treatment if OMP_CLAUSE_SIZE is zero. */
#define OMP_CLAUSE_MAP_MAYBE_ZERO_LENGTH_ARRAY_SECTION(NODE) \
TREE_PROTECTED (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_MAP))
/* Nonzero if this map clause is for an ACC parallel reduction variable. */
#define OMP_CLAUSE_MAP_IN_REDUCTION(NODE) \
TREE_PRIVATE (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_MAP))
#define OMP_CLAUSE_PROC_BIND_KIND(NODE) \
(OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_PROC_BIND)->omp_clause.subcode.proc_bind_kind)
......
2016-04-08 Cesar Philippidis <cesar@codesourcery.com>
PR lto/70289
PR ipa/70348
PR tree-optimization/70373
PR middle-end/70533
PR middle-end/70534
PR middle-end/70535
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gang-np-1.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gw-np-1.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gwv-np-1.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gwv-np-2.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gwv-np-3.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-gwv-np-4.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-vector-p-1.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-vector-p-2.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-worker-p-1.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-wv-p-1.c: New test.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-wv-p-2.c: New test.
* testsuite/libgomp.oacc-c-c++-common/loop-reduction-wv-p-3.c: New test.
* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-1.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-2.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-3.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/par-loop-comb-reduction-4.c: New
test.
* testsuite/libgomp.oacc-c-c++-common/par-reduction-1.c: Add test
coverage.
* testsuite/libgomp.oacc-c-c++-common/par-reduction-2.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/parallel-reduction.c: New test.
* testsuite/libgomp.oacc-c-c++-common/pr70289.c: New test.
* testsuite/libgomp.oacc-c-c++-common/pr70373.c: New test.
* testsuite/libgomp.oacc-c-c++-common/reduction-1.c: Add test
coverage.
* testsuite/libgomp.oacc-c-c++-common/reduction-2.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/reduction-3.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/reduction-4.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/reduction-5.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/reduction-6.c: New test.
* testsuite/libgomp.oacc-c-c++-common/reduction.h: New test.
* testsuite/libgomp.oacc-fortran/parallel-reduction.f90: New test.
* testsuite/libgomp.oacc-fortran/pr70289.f90: New test.
* testsuite/libgomp.oacc-fortran/reduction-1.f90: Add test coverage.
* testsuite/libgomp.oacc-fortran/reduction-2.f90: Likewise.
* testsuite/libgomp.oacc-fortran/reduction-3.f90: Likewise.
* testsuite/libgomp.oacc-fortran/reduction-4.f90: Likewise.
* testsuite/libgomp.oacc-fortran/reduction-5.f90: Likewise.
* testsuite/libgomp.oacc-fortran/reduction-6.f90: Likewise.
* testsuite/libgomp.oacc-fortran/reduction-7.f90: New test.
2016-03-30 Thomas Schwinge <thomas@codesourcery.com>
James Norris <jnorris@codesourcery.com>
Nathan Sidwell <nathan@codesourcery.com>
......
/* { dg-additional-options "-w" } */
#include <assert.h>
/* Test of reduction on loop directive (gangs, non-private reduction
variable). */
int
main (int argc, char *argv[])
{
int i, arr[1024], res = 0, hres = 0;
for (i = 0; i < 1024; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
copy(res)
{
#pragma acc loop gang reduction(+:res)
for (i = 0; i < 1024; i++)
res += arr[i];
}
for (i = 0; i < 1024; i++)
hres += arr[i];
assert (res == hres);
res = hres = 1;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
copy(res)
{
#pragma acc loop gang reduction(*:res)
for (i = 0; i < 12; i++)
res *= arr[i];
}
for (i = 0; i < 12; i++)
hres *= arr[i];
assert (res == hres);
return 0;
}
/* { dg-additional-options "-w" } */
#include <assert.h>
/* Test of reduction on loop directive (gangs and vectors, non-private
reduction variable). */
int
main (int argc, char *argv[])
{
int i, arr[1024], res = 0, hres = 0;
for (i = 0; i < 1024; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
copy(res)
{
#pragma acc loop gang vector reduction(+:res)
for (i = 0; i < 1024; i++)
res += arr[i];
}
for (i = 0; i < 1024; i++)
hres += arr[i];
assert (res == hres);
return 0;
}
/* { dg-additional-options "-w" } */
#include <assert.h>
/* Test of reduction on loop directive (gangs and workers, non-private
reduction variable). */
int
main (int argc, char *argv[])
{
int i, arr[1024], res = 0, hres = 0;
for (i = 0; i < 1024; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
copy(res)
{
#pragma acc loop gang worker reduction(+:res)
for (i = 0; i < 1024; i++)
res += arr[i];
}
for (i = 0; i < 1024; i++)
hres += arr[i];
assert (res == hres);
return 0;
}
#include <assert.h>
/* Test of reduction on loop directive (gangs, workers and vectors, non-private
reduction variable). */
int
main (int argc, char *argv[])
{
int i, arr[1024], res = 0, hres = 0;
for (i = 0; i < 1024; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
copy(res)
{
#pragma acc loop gang worker vector reduction(+:res)
for (i = 0; i < 1024; i++)
res += arr[i];
}
for (i = 0; i < 1024; i++)
hres += arr[i];
assert (res == hres);
return 0;
}
#include <assert.h>
/* Test of reduction on loop directive (gangs, workers and vectors, non-private
reduction variable: separate gang and worker/vector loops). */
int
main (int argc, char *argv[])
{
int i, j, arr[32768], res = 0, hres = 0;
for (i = 0; i < 32768; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
copy(res)
{
#pragma acc loop gang reduction(+:res)
for (j = 0; j < 32; j++)
{
#pragma acc loop worker vector reduction(+:res)
for (i = 0; i < 1024; i++)
res += arr[j * 1024 + i];
}
/* "res" is non-private, and is not available until after the parallel
region. */
}
for (i = 0; i < 32768; i++)
hres += arr[i];
assert (res == hres);
return 0;
}
#include <assert.h>
/* Test of reduction on loop directive (gangs, workers and vectors, non-private
reduction variable: separate gang and worker/vector loops). */
int
main (int argc, char *argv[])
{
int i, j;
double arr[32768], res = 0, hres = 0;
for (i = 0; i < 32768; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
copyin(arr) copy(res)
{
#pragma acc loop gang reduction(+:res)
for (j = 0; j < 32; j++)
{
#pragma acc loop worker vector reduction(+:res)
for (i = 0; i < 1024; i++)
res += arr[j * 1024 + i];
}
}
for (i = 0; i < 32768; i++)
hres += arr[i];
assert (res == hres);
return 0;
}
#include <assert.h>
/* Test of reduction on loop directive (gangs, workers and vectors, multiple
non-private reduction variables, float type). */
int
main (int argc, char *argv[])
{
int i, j;
float arr[32768];
float res = 0, mres = 0, hres = 0, hmres = 0;
for (i = 0; i < 32768; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
copy(res, mres)
{
#pragma acc loop gang reduction(+:res) reduction(max:mres)
for (j = 0; j < 32; j++)
{
#pragma acc loop worker vector reduction(+:res) reduction(max:mres)
for (i = 0; i < 1024; i++)
{
res += arr[j * 1024 + i];
if (arr[j * 1024 + i] > mres)
mres = arr[j * 1024 + i];
}
#pragma acc loop worker vector reduction(+:res) reduction(max:mres)
for (i = 0; i < 1024; i++)
{
res += arr[j * 1024 + (1023 - i)];
if (arr[j * 1024 + (1023 - i)] > mres)
mres = arr[j * 1024 + (1023 - i)];
}
}
}
for (j = 0; j < 32; j++)
for (i = 0; i < 1024; i++)
{
hres += arr[j * 1024 + i];
hres += arr[j * 1024 + (1023 - i)];
if (arr[j * 1024 + i] > hmres)
hmres = arr[j * 1024 + i];
if (arr[j * 1024 + (1023 - i)] > hmres)
hmres = arr[j * 1024 + (1023 - i)];
}
assert (res == hres);
assert (mres == hmres);
return 0;
}
/* { dg-additional-options "-w" } */
#include <assert.h>
/* Test of reduction on loop directive (vectors, private reduction
variable). */
int
main (int argc, char *argv[])
{
int i, j, arr[1024], out[32], res = 0, hres = 0;
for (i = 0; i < 1024; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
private(res) copyout(out)
{
#pragma acc loop gang
for (j = 0; j < 32; j++)
{
res = 0;
#pragma acc loop vector reduction(+:res)
for (i = 0; i < 32; i++)
res += arr[j * 32 + i];
out[j] = res;
}
}
for (j = 0; j < 32; j++)
{
hres = 0;
for (i = 0; i < 32; i++)
hres += arr[j * 32 + i];
assert (out[j] == hres);
}
return 0;
}
#include <assert.h>
/* Test of reduction on loop directive (vector reduction in
gang-partitioned/worker-partitioned mode, private reduction variable). */
int
main (int argc, char *argv[])
{
int i, j, k;
double ina[1024], inb[1024], out[1024], acc;
for (j = 0; j < 32; j++)
for (i = 0; i < 32; i++)
{
ina[j * 32 + i] = (i == j) ? 2.0 : 0.0;
inb[j * 32 + i] = (double) (i + j);
}
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
private(acc) copyin(ina, inb) copyout(out)
{
#pragma acc loop gang worker
for (k = 0; k < 32; k++)
for (j = 0; j < 32; j++)
{
acc = 0;
#pragma acc loop vector reduction(+:acc)
for (i = 0; i < 32; i++)
acc += ina[k * 32 + i] * inb[i * 32 + j];
out[k * 32 + j] = acc;
}
}
for (j = 0; j < 32; j++)
for (i = 0; i < 32; i++)
assert (out[j * 32 + i] == (i + j) * 2);
return 0;
}
/* { dg-additional-options "-w" } */
#include <assert.h>
/* Test of reduction on loop directive (workers, private reduction
variable). */
int
main (int argc, char *argv[])
{
int i, j, arr[1024], out[32], res = 0, hres = 0;
for (i = 0; i < 1024; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
private(res) copyout(out)
{
#pragma acc loop gang
for (j = 0; j < 32; j++)
{
res = 0;
#pragma acc loop worker reduction(+:res)
for (i = 0; i < 32; i++)
res += arr[j * 32 + i];
out[j] = res;
}
}
for (j = 0; j < 32; j++)
{
hres = 0;
for (i = 0; i < 32; i++)
hres += arr[j * 32 + i];
assert (out[j] == hres);
}
return 0;
}
#include <assert.h>
/* Test of reduction on loop directive (workers and vectors, private reduction
variable). */
int
main (int argc, char *argv[])
{
int i, j, arr[1024], out[32], res = 0, hres = 0;
for (i = 0; i < 1024; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
private(res) copyout(out)
{
#pragma acc loop gang
for (j = 0; j < 32; j++)
{
res = 0;
#pragma acc loop worker vector reduction(+:res)
for (i = 0; i < 32; i++)
res += arr[j * 32 + i];
out[j] = res;
}
}
for (j = 0; j < 32; j++)
{
hres = 0;
for (i = 0; i < 32; i++)
hres += arr[j * 32 + i];
assert (out[j] == hres);
}
return 0;
}
#include <assert.h>
/* Test of reduction on loop directive (workers and vectors, private reduction
variable). */
int
main (int argc, char *argv[])
{
int i, j, arr[32768], out[32], res = 0, hres = 0;
for (i = 0; i < 32768; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
private(res) copyout(out)
{
#pragma acc loop gang
for (j = 0; j < 32; j++)
{
res = j;
#pragma acc loop worker reduction(+:res)
for (i = 0; i < 1024; i++)
res += arr[j * 1024 + i];
#pragma acc loop vector reduction(+:res)
for (i = 1023; i >= 0; i--)
res += arr[j * 1024 + i];
out[j] = res;
}
}
for (j = 0; j < 32; j++)
{
hres = j;
for (i = 0; i < 1024; i++)
hres += arr[j * 1024 + i] * 2;
assert (out[j] == hres);
}
return 0;
}
#include <assert.h>
/* Test of reduction on loop directive (workers and vectors, private reduction
variable: gang-redundant mode). */
int
main (int argc, char *argv[])
{
int i, arr[1024], out[32], res = 0, hres = 0;
for (i = 0; i < 1024; i++)
arr[i] = i ^ 33;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
private(res) copyin(arr) copyout(out)
{
/* Private variables aren't initialized by default in openacc. */
res = 0;
/* "res" should be available at the end of the following loop (and should
have the same value redundantly in each gang). */
#pragma acc loop worker vector reduction(+:res)
for (i = 0; i < 1024; i++)
res += arr[i];
#pragma acc loop gang (static: 1)
for (i = 0; i < 32; i++)
out[i] = res;
}
for (i = 0; i < 1024; i++)
hres += arr[i];
for (i = 0; i < 32; i++)
assert (out[i] == hres);
return 0;
}
#include <assert.h>
/* Test of reduction on both parallel and loop directives (worker and
vector-partitioned loops individually in gang-partitioned mode, int
type). */
int
main (int argc, char *argv[])
{
int i, j, arr[32768], res = 0, hres = 0;
for (i = 0; i < 32768; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
reduction(+:res) copy(res)
{
#pragma acc loop gang
for (j = 0; j < 32; j++)
{
#pragma acc loop worker reduction(+:res)
for (i = 0; i < 1024; i++)
res += arr[j * 1024 + i];
#pragma acc loop vector reduction(+:res)
for (i = 1023; i >= 0; i--)
res += arr[j * 1024 + i];
}
}
for (j = 0; j < 32; j++)
for (i = 0; i < 1024; i++)
hres += arr[j * 1024 + i] * 2;
assert (res == hres);
return 0;
}
#include <assert.h>
/* Test of reduction on both parallel and loop directives (workers and vectors
in gang-partitioned mode, int type with XOR). */
int
main (int argc, char *argv[])
{
int i, j, arr[32768], res = 0, hres = 0;
for (i = 0; i < 32768; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
reduction(^:res)
{
#pragma acc loop gang
for (j = 0; j < 32; j++)
{
#pragma acc loop worker vector reduction(^:res)
for (i = 0; i < 1024; i++)
res ^= arr[j * 1024 + i];
#pragma acc loop worker vector reduction(^:res)
for (i = 0; i < 1024; i++)
res ^= arr[j * 1024 + (1023 - i)];
}
}
for (j = 0; j < 32; j++)
for (i = 0; i < 1024; i++)
{
hres ^= arr[j * 1024 + i];
hres ^= arr[j * 1024 + (1023 - i)];
}
assert (res == hres);
return 0;
}
#include <assert.h>
/* Test of reduction on both parallel and loop directives (workers and vectors
together in gang-partitioned mode, float type). */
int
main (int argc, char *argv[])
{
int i, j;
float arr[32768];
float res = 0, hres = 0;
for (i = 0; i < 32768; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
reduction(+:res) copy(res)
{
#pragma acc loop gang
for (j = 0; j < 32; j++)
{
#pragma acc loop worker vector reduction(+:res)
for (i = 0; i < 1024; i++)
res += arr[j * 1024 + i];
#pragma acc loop worker vector reduction(+:res)
for (i = 0; i < 1024; i++)
res += arr[j * 1024 + (1023 - i)];
}
}
for (j = 0; j < 32; j++)
for (i = 0; i < 1024; i++)
{
hres += arr[j * 1024 + i];
hres += arr[j * 1024 + (1023 - i)];
}
assert (res == hres);
return 0;
}
#include <assert.h>
/* Test of reduction on both parallel and loop directives (workers and vectors
together in gang-partitioned mode, float type, multiple reductions). */
int
main (int argc, char *argv[])
{
int i, j;
float arr[32768];
float res = 0, mres = 0, hres = 0, hmres = 0;
for (i = 0; i < 32768; i++)
arr[i] = i;
#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
reduction(+:res) reduction(max:mres) copy(res, mres)
{
#pragma acc loop gang
for (j = 0; j < 32; j++)
{
#pragma acc loop worker vector reduction(+:res) reduction(max:mres)
for (i = 0; i < 1024; i++)
{
res += arr[j * 1024 + i];
if (arr[j * 1024 + i] > mres)
mres = arr[j * 1024 + i];
}
#pragma acc loop worker vector reduction(+:res) reduction(max:mres)
for (i = 0; i < 1024; i++)
{
res += arr[j * 1024 + (1023 - i)];
if (arr[j * 1024 + (1023 - i)] > mres)
mres = arr[j * 1024 + (1023 - i)];
}
}
}
for (j = 0; j < 32; j++)
for (i = 0; i < 1024; i++)
{
hres += arr[j * 1024 + i];
hres += arr[j * 1024 + (1023 - i)];
if (arr[j * 1024 + i] > hmres)
hmres = arr[j * 1024 + i];
if (arr[j * 1024 + (1023 - i)] > hmres)
hmres = arr[j * 1024 + (1023 - i)];
}
assert (res == hres);
assert (mres == hmres);
return 0;
}
/* { dg-additional-options "-w" } */
#include <assert.h>
/* Test of reduction on parallel directive. */
#define ACTUAL_GANGS 256
int
main (int argc, char *argv[])
{
int res, res2 = 0;
int res, res1 = 0, res2 = 0;
#if defined(ACC_DEVICE_TYPE_host)
# define GANGS 1
#else
# define GANGS 256
#endif
#pragma acc parallel num_gangs(GANGS) copy(res2)
#pragma acc parallel num_gangs(GANGS) num_workers(32) vector_length(32) \
reduction(+:res1) copy(res2, res1)
{
res1 += 5;
#pragma acc atomic
res2 += 5;
}
res = GANGS * 5;
assert (res == res1);
assert (res == res2);
#undef GANGS
res = res2 = 1;
res = res1 = res2 = 1;
#if defined(ACC_DEVICE_TYPE_host)
# define GANGS 1
#else
# define GANGS 8
#endif
#pragma acc parallel num_gangs(GANGS) copy(res2)
#pragma acc parallel num_gangs(GANGS) num_workers(32) vector_length(32) \
reduction(*:res1) copy(res1, res2)
{
res1 *= 5;
#pragma acc atomic
res2 *= 5;
}
for (int i = 0; i < GANGS; ++i)
res *= 5;
assert (res == res1);
assert (res == res2);
#undef GANGS
......
/* { dg-additional-options "-w" } */
#include <assert.h>
#include <openacc.h>
/* Test of reduction on parallel directive (with async). */
int
main (int argc, char *argv[])
{
int res, res2 = 0;
int res, res1 = 0, res2 = 0;
#if defined(ACC_DEVICE_TYPE_host)
# define GANGS 1
#else
# define GANGS 256
#endif
#pragma acc parallel num_gangs(GANGS) copy(res2) async(1)
#pragma acc parallel num_gangs(GANGS) num_workers(32) vector_length(32) \
reduction(+:res1) copy(res1, res2) async(1)
{
res1 += 5;
#pragma acc atomic
res2 += 5;
}
......@@ -20,18 +27,22 @@ main (int argc, char *argv[])
acc_wait (1);
assert (res == res1);
assert (res == res2);
#undef GANGS
res = res2 = 1;
res = res1 = res2 = 1;
#if defined(ACC_DEVICE_TYPE_host)
# define GANGS 1
#else
# define GANGS 8
#endif
#pragma acc parallel num_gangs(GANGS) copy(res2) async(1)
#pragma acc parallel num_gangs(GANGS) num_workers(32) vector_length(32) \
reduction(*:res1) copy(res1, res2) async(1)
{
res1 *= 5;
#pragma acc atomic
res2 *= 5;
}
......@@ -40,6 +51,7 @@ main (int argc, char *argv[])
acc_wait (1);
assert (res == res1);
assert (res == res2);
return 0;
......
......@@ -5,12 +5,20 @@
int main ()
{
int dummy[10];
#pragma acc parallel num_workers (2<<20) /* { dg-error "using num_workers" } */
{
#pragma acc loop worker
for (int i = 0; i < 10; i++)
dummy[i] = i;
}
#pragma acc parallel vector_length (2<<20) /* { dg-error "using vector_length" } */
{
#pragma acc loop vector
for (int i = 0; i < 10; i++)
dummy[i] = i;
}
return 0;
......
/* { dg-do run } */
/* { dg-additional-options "-w" } */
#include <stdlib.h>
#include <openacc.h>
#define N 10
int
main ()
{
int s1 = 0, s2 = 0;
int i;
int dummy = 0;
#pragma acc data copy (dummy)
{
#pragma acc parallel num_gangs (N) reduction (+:s1) copy(s1)
{
s1++;
}
}
if (acc_get_device_type () != acc_device_nvidia)
{
if (s1 != 1)
abort ();
}
else
{
if (s1 != N)
abort ();
}
s1 = 0;
s2 = 0;
#pragma acc parallel num_gangs (10) reduction (+:s1, s2) copy(s1, s2)
{
s1++;
s2 += N;
}
if (acc_get_device_type () != acc_device_nvidia)
{
if (s1 != 1)
abort ();
if (s2 != N)
abort ();
}
else
{
if (s1 != N)
abort ();
if (s2 != N*N)
abort ();
}
s1 = 0;
#pragma acc parallel num_gangs (10) reduction (+:s1) copy(s1)
{
#pragma acc loop gang reduction (+:s1)
for (i = 0; i < 10; i++)
s1++;
}
if (s1 != N)
abort ();
return 0;
}
int
main ()
{
int i;
static int temp;
#pragma acc parallel reduction(+:temp)
{
temp++;
}
return 0;
}
#define N 32
int
foo (unsigned int sum)
{
#pragma acc parallel reduction (+:sum)
{
sum;
}
return sum;
}
int
main (void)
{
unsigned int sum = 0;
foo (sum);
return 0;
}
/* { dg-do run } */
/* Ignore vector_length warnings for offloaded (nvptx) targets. */
/* { dg-additional-options "-foffload=-w" } */
/* Integer reductions. */
#include <stdlib.h>
#include <stdbool.h>
#define vl 32
#define DO_PRAGMA(x) _Pragma (#x)
#define check_reduction_op(type, op, init, b) \
{ \
type res, vres; \
res = (init); \
DO_PRAGMA (acc parallel vector_length (vl) copy(res)) \
DO_PRAGMA (acc loop reduction (op:res))\
for (i = 0; i < n; i++) \
res = res op (b); \
\
vres = (init); \
for (i = 0; i < n; i++) \
vres = vres op (b); \
\
if (res != vres) \
abort (); \
}
#include "reduction.h"
const int ng = 8;
const int nw = 4;
const int vl = 32;
static void
test_reductions_int (void)
test_reductions (void)
{
const int n = 1000;
const int n = 100;
int i;
int array[n];
for (i = 0; i < n; i++)
array[i] = i;
check_reduction_op (int, +, 0, array[i]);
check_reduction_op (int, *, 1, array[i]);
check_reduction_op (int, &, -1, array[i]);
check_reduction_op (int, |, 0, array[i]);
check_reduction_op (int, ^, 0, array[i]);
array[i] = i+1;
/* Gang reductions. */
check_reduction_op (int, +, 0, array[i], num_gangs (ng), gang);
check_reduction_op (int, *, 1, array[i], num_gangs (ng), gang);
check_reduction_op (int, &, -1, array[i], num_gangs (ng), gang);
check_reduction_op (int, |, 0, array[i], num_gangs (ng), gang);
check_reduction_op (int, ^, 0, array[i], num_gangs (ng), gang);
/* Worker reductions. */
check_reduction_op (int, +, 0, array[i], num_workers (nw), worker);
check_reduction_op (int, *, 1, array[i], num_workers (nw), worker);
check_reduction_op (int, &, -1, array[i], num_workers (nw), worker);
check_reduction_op (int, |, 0, array[i], num_workers (nw), worker);
check_reduction_op (int, ^, 0, array[i], num_workers (nw), worker);
/* Vector reductions. */
check_reduction_op (int, +, 0, array[i], vector_length (vl), vector);
check_reduction_op (int, *, 1, array[i], vector_length (vl), vector);
check_reduction_op (int, &, -1, array[i], vector_length (vl), vector);
check_reduction_op (int, |, 0, array[i], vector_length (vl), vector);
check_reduction_op (int, ^, 0, array[i], vector_length (vl), vector);
/* Combined reductions. */
check_reduction_op (int, +, 0, array[i], num_gangs (ng) num_workers (nw)
vector_length (vl), gang worker vector);
check_reduction_op (int, *, 1, array[i], num_gangs (ng) num_workers (nw)
vector_length (vl), gang worker vector);
check_reduction_op (int, &, -1, array[i], num_gangs (ng) num_workers (nw)
vector_length (vl), gang worker vector);
check_reduction_op (int, |, 0, array[i], num_gangs (ng) num_workers (nw)
vector_length (vl), gang worker vector);
check_reduction_op (int, ^, 0, array[i], num_gangs (ng) num_workers (nw)
vector_length (vl), gang worker vector);
}
static void
......@@ -55,29 +68,31 @@ test_reductions_bool (void)
array[i] = i;
cmp_val = 5;
check_reduction_op (bool, &&, true, (cmp_val > array[i]));
check_reduction_op (bool, ||, false, (cmp_val > array[i]));
}
#define check_reduction_macro(type, op, init, b) \
{ \
type res, vres; \
res = (init); \
DO_PRAGMA (acc parallel vector_length (vl) copy(res))\
DO_PRAGMA (acc loop reduction (op:res))\
for (i = 0; i < n; i++) \
res = op (res, (b)); \
\
vres = (init); \
for (i = 0; i < n; i++) \
vres = op (vres, (b)); \
\
if (res != vres) \
abort (); \
}
#define max(a, b) (((a) > (b)) ? (a) : (b))
#define min(a, b) (((a) < (b)) ? (a) : (b))
/* Gang reductions. */
check_reduction_op (int, &&, 1, (cmp_val > array[i]), num_gangs (ng),
gang);
check_reduction_op (int, ||, 0, (cmp_val > array[i]), num_gangs (ng),
gang);
/* Worker reductions. */
check_reduction_op (int, &&, 1, (cmp_val > array[i]), num_workers (nw),
worker);
check_reduction_op (int, ||, 0, (cmp_val > array[i]), num_workers (nw),
worker);
/* Vector reductions. */
check_reduction_op (int, &&, 1, (cmp_val > array[i]), vector_length (vl),
vector);
check_reduction_op (int, ||, 0, (cmp_val > array[i]), vector_length (vl),
vector);
/* Combined reductions. */
check_reduction_op (int, &&, 1, (cmp_val > array[i]), num_gangs (ng)
num_workers (nw) vector_length (vl), gang worker vector);
check_reduction_op (int, ||, 0, (cmp_val > array[i]), num_gangs (ng)
num_workers (nw) vector_length (vl), gang worker vector);
}
static void
test_reductions_minmax (void)
......@@ -89,14 +104,32 @@ test_reductions_minmax (void)
for (i = 0; i < n; i++)
array[i] = i;
check_reduction_macro (int, min, n + 1, array[i]);
check_reduction_macro (int, max, -1, array[i]);
/* Gang reductions. */
check_reduction_macro (int, min, n + 1, array[i], num_gangs (ng), gang);
check_reduction_macro (int, max, -1, array[i], num_gangs (ng), gang);
/* Worker reductions. */
check_reduction_macro (int, min, n + 1, array[i], num_workers (nw), worker);
check_reduction_macro (int, max, -1, array[i], num_workers (nw), worker);
/* Vector reductions. */
check_reduction_macro (int, min, n + 1, array[i], vector_length (vl),
vector);
check_reduction_macro (int, max, -1, array[i], vector_length (vl), vector);
/* Combined reductions. */
check_reduction_macro (int, min, n + 1, array[i], num_gangs (ng)
num_workers (nw) vector_length (vl), gang worker
vector);
check_reduction_macro (int, max, -1, array[i], num_gangs (ng)
num_workers (nw) vector_length (vl), gang worker
vector);
}
int
main (void)
{
test_reductions_int ();
test_reductions ();
test_reductions_bool ();
test_reductions_minmax ();
return 0;
......
/* { dg-do run } */
/* Ignore vector_length warnings for offloaded (nvptx) targets. */
/* { dg-additional-options "-foffload=-w" } */
/* float reductions. */
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include "reduction.h"
#define vl 32
const int ng = 8;
const int nw = 4;
const int vl = 32;
int
main(void)
static void
test_reductions (void)
{
const int n = 1000;
const int n = 100;
int i;
float vresult, result, array[n];
bool lvresult, lresult;
float array[n];
for (i = 0; i < n; i++)
array[i] = i;
result = 0;
vresult = 0;
array[i] = i+1;
/* '+' reductions. */
#pragma acc parallel vector_length (vl) copy(result)
#pragma acc loop reduction (+:result)
for (i = 0; i < n; i++)
result += array[i];
/* Gang reductions. */
check_reduction_op (float, +, 0, array[i], num_gangs (ng), gang);
check_reduction_op (float, *, 1, array[i], num_gangs (ng), gang);
/* Verify the reduction. */
for (i = 0; i < n; i++)
vresult += array[i];
if (result != vresult)
abort ();
result = 0;
vresult = 0;
/* '*' reductions. */
#pragma acc parallel vector_length (vl) copy(result)
#pragma acc loop reduction (*:result)
for (i = 0; i < n; i++)
result *= array[i];
/* Worker reductions. */
check_reduction_op (float, +, 0, array[i], num_workers (nw), worker);
check_reduction_op (float, *, 1, array[i], num_workers (nw), worker);
/* Verify the reduction. */
for (i = 0; i < n; i++)
vresult *= array[i];
if (fabs(result - vresult) > .0001)
abort ();
result = 0;
vresult = 0;
/* 'max' reductions. */
#pragma acc parallel vector_length (vl) copy(result)
#pragma acc loop reduction (max:result)
for (i = 0; i < n; i++)
result = result > array[i] ? result : array[i];
/* Verify the reduction. */
for (i = 0; i < n; i++)
vresult = vresult > array[i] ? vresult : array[i];
if (result != vresult)
abort ();
result = 0;
vresult = 0;
/* 'min' reductions. */
#pragma acc parallel vector_length (vl) copy(result)
#pragma acc loop reduction (min:result)
for (i = 0; i < n; i++)
result = result < array[i] ? result : array[i];
/* Vector reductions. */
check_reduction_op (float, +, 0, array[i], vector_length (vl), vector);
check_reduction_op (float, *, 1, array[i], vector_length (vl), vector);
/* Verify the reduction. */
for (i = 0; i < n; i++)
vresult = vresult < array[i] ? vresult : array[i];
if (result != vresult)
abort ();
result = 5;
vresult = 5;
lresult = false;
lvresult = false;
/* '&&' reductions. */
#pragma acc parallel vector_length (vl) copy(lresult)
#pragma acc loop reduction (&&:lresult)
for (i = 0; i < n; i++)
lresult = lresult && (result > array[i]);
/* Verify the reduction. */
for (i = 0; i < n; i++)
lvresult = lresult && (result > array[i]);
if (lresult != lvresult)
abort ();
result = 5;
vresult = 5;
lresult = false;
lvresult = false;
/* Combined reductions. */
check_reduction_op (float, +, 0, array[i], num_gangs (ng) num_workers (nw)
vector_length (vl), gang worker vector);
check_reduction_op (float, *, 1, array[i], num_gangs (ng) num_workers (nw)
vector_length (vl), gang worker vector);
}
/* '||' reductions. */
#pragma acc parallel vector_length (vl) copy(lresult)
#pragma acc loop reduction (||:lresult)
for (i = 0; i < n; i++)
lresult = lresult || (result > array[i]);
static void
test_reductions_minmax (void)
{
const int n = 1000;
int i;
float array[n];
/* Verify the reduction. */
for (i = 0; i < n; i++)
lvresult = lresult || (result > array[i]);
array[i] = i;
if (lresult != lvresult)
abort ();
/* Gang reductions. */
check_reduction_macro (float, min, n + 1, array[i], num_gangs (ng), gang);
check_reduction_macro (float, max, -1, array[i], num_gangs (ng), gang);
/* Worker reductions. */
check_reduction_macro (float, min, n + 1, array[i], num_workers (nw),
worker);
check_reduction_macro (float, max, -1, array[i], num_workers (nw), worker);
/* Vector reductions. */
check_reduction_macro (float, min, n + 1, array[i], vector_length (vl),
vector);
check_reduction_macro (float, max, -1, array[i], vector_length (vl), vector);
/* Combined reductions. */
check_reduction_macro (float, min, n + 1, array[i], num_gangs (ng)
num_workers (nw) vector_length (vl), gang worker
vector);
check_reduction_macro (float, max, -1, array[i], num_gangs (ng)
num_workers (nw)vector_length (vl), gang worker
vector);
}
int
main (void)
{
test_reductions ();
test_reductions_minmax ();
return 0;
}
/* { dg-do run } */
/* Ignore vector_length warnings for offloaded (nvptx) targets. */
/* { dg-additional-options "-foffload=-w" } */
/* double reductions. */
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include "reduction.h"
#define vl 32
const int ng = 8;
const int nw = 4;
const int vl = 32;
int
main(void)
static void
test_reductions (void)
{
const int n = 1000;
const int n = 10;
int i;
double vresult, result, array[n];
bool lvresult, lresult;
for (i = 0; i < n; i++)
array[i] = i;
result = 0;
vresult = 0;
/* 'max' reductions. */
#pragma acc parallel vector_length (vl) copy(result)
#pragma acc loop reduction (max:result)
for (i = 0; i < n; i++)
result = result > array[i] ? result : array[i];
/* Verify the reduction. */
for (i = 0; i < n; i++)
vresult = vresult > array[i] ? vresult : array[i];
if (result != vresult)
abort ();
result = 0;
vresult = 0;
/* 'min' reductions. */
#pragma acc parallel vector_length (vl) copy(result)
#pragma acc loop reduction (min:result)
for (i = 0; i < n; i++)
result = result < array[i] ? result : array[i];
double array[n];
/* Verify the reduction. */
for (i = 0; i < n; i++)
vresult = vresult < array[i] ? vresult : array[i];
array[i] = i+1;
if (result != vresult)
abort ();
/* Gang reductions. */
check_reduction_op (double, +, 0, array[i], num_gangs (ng), gang);
check_reduction_op (double, *, 1, array[i], num_gangs (ng), gang);
result = 5;
vresult = 5;
/* Worker reductions. */
check_reduction_op (double, +, 0, array[i], num_workers (nw), worker);
check_reduction_op (double, *, 1, array[i], num_workers (nw), worker);
lresult = false;
lvresult = false;
/* Vector reductions. */
check_reduction_op (double, +, 0, array[i], vector_length (vl), vector);
check_reduction_op (double, *, 1, array[i], vector_length (vl), vector);
/* '&&' reductions. */
#pragma acc parallel vector_length (vl) copy(lresult)
#pragma acc loop reduction (&&:lresult)
for (i = 0; i < n; i++)
lresult = lresult && (result > array[i]);
/* Verify the reduction. */
for (i = 0; i < n; i++)
lvresult = lresult && (result > array[i]);
if (lresult != lvresult)
abort ();
result = 5;
vresult = 5;
lresult = false;
lvresult = false;
/* Combined reductions. */
check_reduction_op (double, +, 0, array[i], num_gangs (ng) num_workers (nw)
vector_length (vl), gang worker vector);
check_reduction_op (double, *, 1, array[i], num_gangs (ng) num_workers (nw)
vector_length (vl), gang worker vector);
}
/* '||' reductions. */
#pragma acc parallel vector_length (vl) copy(lresult)
#pragma acc loop reduction (||:lresult)
for (i = 0; i < n; i++)
lresult = lresult || (result > array[i]);
static void
test_reductions_minmax (void)
{
const int n = 1000;
int i;
double array[n];
/* Verify the reduction. */
for (i = 0; i < n; i++)
lvresult = lresult || (result > array[i]);
array[i] = i;
if (lresult != lvresult)
abort ();
/* Gang reductions. */
check_reduction_macro (double, min, n + 1, array[i], num_gangs (ng), gang);
check_reduction_macro (double, max, -1, array[i], num_gangs (ng), gang);
/* Worker reductions. */
check_reduction_macro (double, min, n + 1, array[i], num_workers (nw),
worker);
check_reduction_macro (double, max, -1, array[i], num_workers (nw), worker);
/* Vector reductions. */
check_reduction_macro (double, min, n + 1, array[i], vector_length (vl),
vector);
check_reduction_macro (double, max, -1, array[i], vector_length (vl),
vector);
/* Combined reductions. */
check_reduction_macro (double, min, n + 1, array[i], num_gangs (ng)
num_workers (nw) vector_length (vl), gang worker
vector);
check_reduction_macro (double, max, -1, array[i], num_gangs (ng)
num_workers (nw) vector_length (vl), gang worker
vector);
}
int
main (void)
{
test_reductions ();
test_reductions_minmax ();
return 0;
}
/* { dg-do run { target { ! { hppa*-*-hpux* } } } } */
/* Ignore vector_length warnings for offloaded (nvptx) targets. */
/* { dg-additional-options "-foffload=-w" } */
/* complex reductions. */
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include <complex.h>
#include "reduction.h"
#define vl 32
const int ng = 8;
const int nw = 4;
const int vl = 32;
int
main(void)
static void
test_reductions (void)
{
const int n = 1000;
const int n = 10;
int i;
double _Complex vresult, result, array[n];
bool lvresult, lresult;
for (i = 0; i < n; i++)
array[i] = i;
result = 0;
vresult = 0;
/* '&&' reductions. */
#pragma acc parallel vector_length (vl) copy(lresult)
#pragma acc loop reduction (&&:lresult)
for (i = 0; i < n; i++)
lresult = lresult && (creal(result) > creal(array[i]));
/* Verify the reduction. */
for (i = 0; i < n; i++)
lvresult = lresult && (creal(result) > creal(array[i]));
double _Complex array[n];
if (lresult != lvresult)
abort ();
result = 5;
vresult = 5;
lresult = false;
lvresult = false;
/* '||' reductions. */
#pragma acc parallel vector_length (vl) copy(lresult)
#pragma acc loop reduction (||:lresult)
for (i = 0; i < n; i++)
lresult = lresult || (creal(result) > creal(array[i]));
/* Verify the reduction. */
for (i = 0; i < n; i++)
lvresult = lresult || (creal(result) > creal(array[i]));
if (lresult != lvresult)
abort ();
array[i] = i+1;
/* Gang reductions. */
check_reduction_op (double, +, 0, creal (array[i]), num_gangs (ng), gang);
check_reduction_op (double, *, 1, creal (array[i]), num_gangs (ng), gang);
/* Worker reductions. */
check_reduction_op (double, +, 0, creal (array[i]), num_workers (nw),
worker);
check_reduction_op (double, *, 1, creal (array[i]), num_workers (nw),
worker);
/* Vector reductions. */
check_reduction_op (double, +, 0, creal (array[i]), vector_length (vl),
vector);
check_reduction_op (double, *, 1, creal (array[i]), vector_length (vl),
vector);
/* Combined reductions. */
check_reduction_op (double, +, 0, creal (array[i]), num_gangs (ng)
num_workers (nw) vector_length (vl), gang worker
vector);
check_reduction_op (double, *, 1, creal (array[i]), num_gangs (ng)
num_workers (nw) vector_length (vl), gang worker
vector);
}
int
main (void)
{
test_reductions ();
return 0;
}
/* { dg-do run } */
/* { dg-additional-options "-w" } */
/* Ignore vector_length warnings for offloaded (nvptx) targets. */
/* { dg-additional-options "-foffload=-w" } */
/* Multiple reductions. */
#include <stdio.h>
#include <stdlib.h>
const int ng = 8;
const int nw = 4;
const int vl = 32;
const int n = 100;
#define DO_PRAGMA(x) _Pragma (#x)
#define check_reduction(gwv_par, gwv_loop) \
{ \
s1 = 2; s2 = 5; \
DO_PRAGMA (acc parallel gwv_par copy (s1, s2)) \
DO_PRAGMA (acc loop gwv_loop reduction (+:s1, s2)) \
for (i = 0; i < n; i++) \
{ \
s1 = s1 + 3; \
s2 = s2 + 5; \
} \
\
if (s1 != v1 && s2 != v2) \
abort (); \
}
int
main (void)
{
int s1 = 2, s2 = 5, v1 = 2, v2 = 5;
int n = 100;
int i;
#pragma acc parallel vector_length (32) copy(s1,s2)
#pragma acc loop reduction (+:s1, s2)
for (i = 0; i < n; i++)
{
s1 = s1 + 3;
s2 = s2 + 2;
}
for (i = 0; i < n; i++)
{
v1 = v1 + 3;
v2 = v2 + 2;
}
if (s1 != v1)
abort ();
if (s2 != v2)
abort ();
check_reduction (num_gangs (ng), gang);
/* Nvptx targets require a vector_length or 32 in to allow spinlocks with
gangs. */
check_reduction (num_workers (nw) vector_length (vl), worker);
check_reduction (vector_length (vl), vector);
check_reduction (num_gangs (ng) num_workers (nw) vector_length (vl), gang
worker vector);
return 0;
}
/* { dg-do run } */
/* { dg-additional-options "-w" } */
/* Test reductions on explicitly private variables. */
#include <assert.h>
int
main ()
{
int i, j, red[10];
int v;
for (i = 0; i < 10; i++)
red[i] = -1;
#pragma acc parallel copyout(red)
{
#pragma acc loop gang private(v)
for (j = 0; j < 10; j++)
{
v = j;
#pragma acc loop vector reduction (+:v)
for (i = 0; i < 100; i++)
v++;
red[j] = v;
}
}
for (i = 0; i < 10; i++)
assert (red[i] == i + 100);
return 0;
}
#ifndef REDUCTION_H
#define REDUCTION_H
#define DO_PRAGMA(x) _Pragma (#x)
#define check_reduction_op(type, op, init, b, gwv_par, gwv_loop) \
{ \
type res, vres; \
res = (init); \
DO_PRAGMA (acc parallel gwv_par copy (res)) \
DO_PRAGMA (acc loop gwv_loop reduction (op:res)) \
for (i = 0; i < n; i++) \
res = res op (b); \
\
vres = (init); \
for (i = 0; i < n; i++) \
vres = vres op (b); \
\
if (res != vres) \
abort (); \
}
#define check_reduction_macro(type, op, init, b, gwv_par, gwv_loop) \
{ \
type res, vres; \
res = (init); \
DO_PRAGMA (acc parallel gwv_par copy(res)) \
DO_PRAGMA (acc loop gwv_loop reduction (op:res)) \
for (i = 0; i < n; i++) \
res = op (res, (b)); \
\
vres = (init); \
for (i = 0; i < n; i++) \
vres = op (vres, (b)); \
\
if (res != vres) \
abort (); \
}
#define max(a, b) (((a) > (b)) ? (a) : (b))
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif
! { dg-do run }
! { dg-additional-options "-w" }
program reduction
implicit none
integer, parameter :: n = 10
integer s1, s2
include "openacc_lib.h"
s1 = 0
s2 = 0
!$acc parallel reduction(+:s1,s2) num_gangs (n) copy(s1)
s1 = s1 + 1
s2 = s2 + 1
!$acc end parallel
if (acc_get_device_type () .eq. acc_device_nvidia) then
if (s1 .ne. n) call abort
if (s2 .ne. n) call abort
else
if (s1 .ne. 1) call abort
if (s2 .ne. 1) call abort
end if
! Test reductions inside subroutines
s1 = 0
s2 = 0
call redsub (s1, s2, n)
if (acc_get_device_type () .eq. acc_device_nvidia) then
if (s1 .ne. n) call abort
else
if (s2 .ne. 1) call abort
end if
end program reduction
subroutine redsub(s1, s2, n)
implicit none
integer :: s1, s2, n
!$acc parallel reduction(+:s1,s2) num_gangs (10) copy(s1)
s1 = s1 + 1
s2 = s2 + 1
!$acc end parallel
end subroutine redsub
program foo
implicit none
integer :: i
integer :: temp = 0
integer :: temp2 = 0
!$acc parallel
!$acc loop gang private(temp)
do i=1, 10000
temp = 0
enddo
!$acc end parallel
!$acc parallel reduction(+:temp2)
!$acc loop gang reduction(+:temp2)
do i=1, 10000
temp2 = 0
enddo
!$acc end parallel
end program foo
......@@ -5,50 +5,108 @@
program reduction_4
implicit none
integer, parameter :: n = 10, vl = 32
integer, parameter :: n = 10, ng = 8, nw = 4, vl = 32
integer :: i
complex :: vresult, result
real :: vresult, rg, rw, rv, rc
complex, dimension (n) :: array
do i = 1, n
array(i) = i
end do
result = 0
!
! '+' reductions
!
rg = 0
rw = 0
rv = 0
rc = 0
vresult = 0
! '+' reductions
!$acc parallel num_gangs(ng) copy(rg)
!$acc loop reduction(+:rg) gang
do i = 1, n
rg = rg + REAL(array(i))
end do
!$acc end parallel
!$acc parallel num_workers(nw) copy(rw)
!$acc loop reduction(+:rw) worker
do i = 1, n
rw = rw + REAL(array(i))
end do
!$acc end parallel
!$acc parallel vector_length(vl) copy(rv)
!$acc loop reduction(+:rv) vector
do i = 1, n
rv = rv + REAL(array(i))
end do
!$acc end parallel
!$acc parallel vector_length(vl) num_gangs(1) copy(result)
!$acc loop reduction(+:result)
!$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
!$acc loop reduction(+:rc) gang worker vector
do i = 1, n
result = result + array(i)
rc = rc + REAL(array(i))
end do
!$acc end parallel
! Verify the results
do i = 1, n
vresult = vresult + array(i)
vresult = vresult + REAL(array(i))
end do
if (result .ne. vresult) call abort
if (rg .ne. vresult) call abort
if (rw .ne. vresult) call abort
if (rv .ne. vresult) call abort
if (rc .ne. vresult) call abort
result = 1
!
! '*' reductions
!
rg = 1
rw = 1
rv = 1
rc = 1
vresult = 1
! ! '*' reductions
!
! !$acc parallel vector_length(vl)
! !$acc loop reduction(*:result)
! do i = 1, n
! result = result * array(i)
! end do
! !$acc end parallel
!
! ! Verify the results
! do i = 1, n
! vresult = vresult * array(i)
! end do
!
! if (result.ne.vresult) call abort
!$acc parallel num_gangs(ng) copy(rg)
!$acc loop reduction(*:rg) gang
do i = 1, n
rg = rg * REAL(array(i))
end do
!$acc end parallel
!$acc parallel num_workers(nw) copy(rw)
!$acc loop reduction(*:rw) worker
do i = 1, n
rw = rw * REAL(array(i))
end do
!$acc end parallel
!$acc parallel vector_length(vl) copy(rv)
!$acc loop reduction(*:rv) vector
do i = 1, n
rv = rv * REAL(array(i))
end do
!$acc end parallel
!$acc parallel num_gangs(ng) num_workers(nw) vector_length(vl) copy(rc)
!$acc loop reduction(*:rc) gang worker vector
do i = 1, n
rc = rc * REAL(array(i))
end do
!$acc end parallel
! Verify the results
do i = 1, n
vresult = vresult * REAL(array(i))
end do
if (rg .ne. vresult) call abort
if (rw .ne. vresult) call abort
if (rv .ne. vresult) call abort
if (rc .ne. vresult) call abort
end program reduction_4
! { dg-do run }
! { dg-additional-options "-w" }
! subroutine reduction
program reduction
integer, parameter :: n = 40, c = 10
integer :: i, vsum, sum
integer :: i, vsum, gs, ws, vs, cs, ns
call redsub (sum, n, c)
call redsub_gang (gs, n, c)
call redsub_worker (ws, n, c)
call redsub_vector (vs, n, c)
call redsub_combined (cs, n, c)
call redsub_nested (ns, n, c)
vsum = 0
......@@ -15,21 +20,80 @@ program reduction
vsum = vsum + c
end do
if (sum.ne.vsum) call abort ()
if (gs .ne. vsum) call abort ()
if (ws .ne. vsum) call abort ()
if (vs .ne. vsum) call abort ()
if (cs .ne. vsum) call abort ()
if (ns .ne. vsum) call abort ()
end program reduction
subroutine redsub(sum, n, c)
subroutine redsub_gang(sum, n, c)
integer :: sum, n, c
integer :: s
s = 0
sum = 0
!$acc parallel vector_length(32) copyin (n, c) copy (s) num_gangs(1)
!$acc loop reduction(+:s)
!$acc parallel copyin (n, c) num_gangs(n) copy(sum)
!$acc loop reduction(+:sum) gang
do i = 1, n
s = s + c
sum = sum + c
end do
!$acc end parallel
end subroutine redsub_gang
sum = s
end subroutine redsub
subroutine redsub_worker(sum, n, c)
integer :: sum, n, c
sum = 0
!$acc parallel copyin (n, c) num_workers(4) vector_length (32) copy(sum)
!$acc loop reduction(+:sum) worker
do i = 1, n
sum = sum + c
end do
!$acc end parallel
end subroutine redsub_worker
subroutine redsub_vector(sum, n, c)
integer :: sum, n, c
sum = 0
!$acc parallel copyin (n, c) vector_length(32) copy(sum)
!$acc loop reduction(+:sum) vector
do i = 1, n
sum = sum + c
end do
!$acc end parallel
end subroutine redsub_vector
subroutine redsub_combined(sum, n, c)
integer :: sum, n, c
sum = 0
!$acc parallel num_gangs (8) num_workers (4) vector_length(32) copy(sum)
!$acc loop reduction(+:sum) gang worker vector
do i = 1, n
sum = sum + c
end do
!$acc end parallel
end subroutine redsub_combined
subroutine redsub_nested(sum, n, c)
integer :: sum, n, c
integer :: ii, jj
ii = n / 10;
jj = 10;
sum = 0
!$acc parallel num_gangs (8) copy(sum)
!$acc loop reduction(+:sum) gang
do i = 1, ii
!$acc loop reduction(+:sum) vector
do j = 1, jj
sum = sum + c
end do
end do
!$acc end parallel
end subroutine redsub_nested
! { dg-do run }
! { dg-additional-options "-cpp -w" }
program reduction
implicit none
integer, parameter :: n = 100
integer :: i, s1, s2, vs1, vs2
integer, parameter :: n = 100, n2 = 1000, chunksize = 10
integer :: i, gs1, gs2, ws1, ws2, vs1, vs2, cs1, cs2, hs1, hs2
integer :: j, red, vred
s1 = 0
s2 = 0
gs1 = 0
gs2 = 0
ws1 = 0
ws2 = 0
vs1 = 0
vs2 = 0
cs1 = 0
cs2 = 0
hs1 = 0
hs2 = 0
!$acc parallel vector_length (32) copy(s1, s2)
!$acc loop reduction(+:s1, s2)
!$acc parallel num_gangs (1000) copy(gs1, gs2)
!$acc loop reduction(+:gs1, gs2) gang
do i = 1, n
s1 = s1 + 1
s2 = s2 + 2
gs1 = gs1 + 1
gs2 = gs2 + 2
end do
!$acc end parallel
! Verify the results
!$acc parallel num_workers (4) vector_length (32) copy(ws1, ws2)
!$acc loop reduction(+:ws1, ws2) worker
do i = 1, n
ws1 = ws1 + 1
ws2 = ws2 + 2
end do
!$acc end parallel
!$acc parallel vector_length (32) copy(vs1, vs2)
!$acc loop reduction(+:vs1, vs2) vector
do i = 1, n
vs1 = vs1 + 1
vs2 = vs2 + 2
end do
!$acc end parallel
!$acc parallel num_gangs(8) num_workers(4) vector_length(32) copy(cs1, cs2)
!$acc loop reduction(+:cs1, cs2) gang worker vector
do i = 1, n
cs1 = cs1 + 1
cs2 = cs2 + 2
end do
!$acc end parallel
! Verify the results on the host
do i = 1, n
hs1 = hs1 + 1
hs2 = hs2 + 2
end do
if (gs1 .ne. hs1) call abort ()
if (gs2 .ne. hs2) call abort ()
if (ws1 .ne. hs1) call abort ()
if (ws2 .ne. hs2) call abort ()
if (vs1 .ne. hs1) call abort ()
if (vs2 .ne. hs2) call abort ()
if (cs1 .ne. hs1) call abort ()
if (cs2 .ne. hs2) call abort ()
! Nested reductions.
red = 0
vred = 0
!$acc parallel num_gangs(10) vector_length(32) copy(red)
!$acc loop reduction(+:red) gang
do i = 1, n/chunksize
!$acc loop reduction(+:red) vector
do j = 1, chunksize
red = red + chunksize
end do
end do
!$acc end parallel
do i = 1, n/chunksize
do j = 1, chunksize
vred = vred + chunksize
end do
end do
if (s1.ne.vs1) call abort ()
if (s2.ne.vs2) call abort ()
if (red .ne. vred) call abort ()
end program reduction
! { dg-do run }
! { dg-additional-options "-w" }
! subroutine reduction with private and firstprivate variables
program reduction
integer, parameter :: n = 100
integer :: i, j, vsum, cs, arr(n)
call redsub_private (cs, n, arr)
call redsub_bogus (cs, n)
call redsub_combined (cs, n, arr)
vsum = 0
! Verify the results
do i = 1, n
vsum = i
do j = 1, n
vsum = vsum + 1;
end do
if (vsum .ne. arr(i)) call abort ()
end do
end program reduction
! This subroutine tests a reduction with an explicit private variable.
subroutine redsub_private(sum, n, arr)
integer :: sum, n, arr(n)
integer :: i, j, v
!$acc parallel copyout (arr)
!$acc loop gang private (v)
do j = 1, n
v = j
!$acc loop vector reduction (+:v)
do i = 1, 100
v = v + 1
end do
arr(j) = v
end do
!$acc end parallel
! verify the results
do i = 1, 10
if (arr(i) .ne. 100+i) call abort ()
end do
end subroutine redsub_private
! Bogus reduction on an impliclitly firstprivate variable. The results do
! survive the parallel region. The goal here is to ensure that gfortran
! doesn't ICE.
subroutine redsub_bogus(sum, n)
integer :: sum, n, arr(n)
integer :: i
!$acc parallel
!$acc loop gang worker vector reduction (+:sum)
do i = 1, n
sum = sum + 1
end do
!$acc end parallel
end subroutine redsub_bogus
! This reduction involving a firstprivate variable yields legitimate results.
subroutine redsub_combined(sum, n, arr)
integer :: sum, n, arr(n)
integer :: i, j
!$acc parallel copy (arr)
!$acc loop gang
do i = 1, n
sum = i;
!$acc loop reduction(+:sum)
do j = 1, n
sum = sum + 1
end do
arr(i) = sum
end do
!$acc end parallel
end subroutine redsub_combined
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment