Commit fb79f500 by Jakub Jelinek Committed by Jakub Jelinek

re PR libgomp/49490 (suboptimal load balancing in loops)

	PR libgomp/49490
	* omp-low.c (expand_omp_for_static_nochunk): Only
	use ceil (n / nthreads) iterations for the first
	n % nthreads threads in the team; the remaining threads
	get floor (n / nthreads) iterations.  Previously all
	threads used the ceiling size except for the last few
	ones, which got less work or none at all.

	* iter.c (gomp_iter_static_next): For chunk size 0
	only use ceil (n / nthreads) iterations for the first
	n % nthreads threads in the team; the remaining threads
	get floor (n / nthreads) iterations.  Previously all
	threads used the ceiling size except for the last few
	ones, which got less work or none at all.
	* iter_ull.c (gomp_iter_ull_static_next): Likewise.
	* env.c (parse_schedule): If OMP_SCHEDULE doesn't have
	a chunk argument, set run_sched_modifier to 0 for static
	and to 1 for other kinds.  If the chunk argument is 0
	and the schedule is not static, set the value to 1.

From-SVN: r175315
parent 4fb489e7
2011-06-22 Jakub Jelinek <jakub@redhat.com> 2011-06-22 Jakub Jelinek <jakub@redhat.com>
PR libgomp/49490
* omp-low.c (expand_omp_for_static_nochunk): Only
use n ceil/ nthreads size for the first
n % nthreads threads in the team instead of
all threads except for the last few ones which
get less work or none at all.
PR debug/49496 PR debug/49496
* tree-vect-patterns.c (vect_recog_widen_mult_pattern): Ignore debug * tree-vect-patterns.c (vect_recog_widen_mult_pattern): Ignore debug
uses. uses.
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
marshalling to implement data sharing and copying clauses. marshalling to implement data sharing and copying clauses.
Contributed by Diego Novillo <dnovillo@redhat.com> Contributed by Diego Novillo <dnovillo@redhat.com>
Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010 Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
Free Software Foundation, Inc. Free Software Foundation, Inc.
This file is part of GCC. This file is part of GCC.
...@@ -4108,9 +4108,14 @@ expand_omp_for_generic (struct omp_region *region, ...@@ -4108,9 +4108,14 @@ expand_omp_for_generic (struct omp_region *region,
else else
n = (adj + N2 - N1) / STEP; n = (adj + N2 - N1) / STEP;
q = n / nthreads; q = n / nthreads;
q += (q * nthreads != n); tt = n % nthreads;
s0 = q * threadid; if (threadid < tt) goto L3; else goto L4;
e0 = min(s0 + q, n); L3:
tt = 0;
q = q + 1;
L4:
s0 = q * threadid + tt;
e0 = s0 + q;
V = s0 * STEP + N1; V = s0 * STEP + N1;
if (s0 >= e0) goto L2; else goto L0; if (s0 >= e0) goto L2; else goto L0;
L0: L0:
...@@ -4126,12 +4131,14 @@ static void ...@@ -4126,12 +4131,14 @@ static void
expand_omp_for_static_nochunk (struct omp_region *region, expand_omp_for_static_nochunk (struct omp_region *region,
struct omp_for_data *fd) struct omp_for_data *fd)
{ {
tree n, q, s0, e0, e, t, nthreads, threadid; tree n, q, s0, e0, e, t, tt, nthreads, threadid;
tree type, itype, vmain, vback; tree type, itype, vmain, vback;
basic_block entry_bb, exit_bb, seq_start_bb, body_bb, cont_bb; basic_block entry_bb, second_bb, third_bb, exit_bb, seq_start_bb;
basic_block body_bb, cont_bb;
basic_block fin_bb; basic_block fin_bb;
gimple_stmt_iterator gsi; gimple_stmt_iterator gsi;
gimple stmt; gimple stmt;
edge ep;
itype = type = TREE_TYPE (fd->loop.v); itype = type = TREE_TYPE (fd->loop.v);
if (POINTER_TYPE_P (type)) if (POINTER_TYPE_P (type))
...@@ -4185,19 +4192,39 @@ expand_omp_for_static_nochunk (struct omp_region *region, ...@@ -4185,19 +4192,39 @@ expand_omp_for_static_nochunk (struct omp_region *region,
t = fold_convert (itype, t); t = fold_convert (itype, t);
n = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); n = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
q = create_tmp_var (itype, "q");
t = fold_build2 (TRUNC_DIV_EXPR, itype, n, nthreads); t = fold_build2 (TRUNC_DIV_EXPR, itype, n, nthreads);
q = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); t = force_gimple_operand_gsi (&gsi, t, false, NULL_TREE, true, GSI_SAME_STMT);
gsi_insert_before (&gsi, gimple_build_assign (q, t), GSI_SAME_STMT);
tt = create_tmp_var (itype, "tt");
t = fold_build2 (TRUNC_MOD_EXPR, itype, n, nthreads);
t = force_gimple_operand_gsi (&gsi, t, false, NULL_TREE, true, GSI_SAME_STMT);
gsi_insert_before (&gsi, gimple_build_assign (tt, t), GSI_SAME_STMT);
t = fold_build2 (MULT_EXPR, itype, q, nthreads); t = build2 (LT_EXPR, boolean_type_node, threadid, tt);
t = fold_build2 (NE_EXPR, itype, t, n); stmt = gimple_build_cond_empty (t);
t = fold_build2 (PLUS_EXPR, itype, q, t); gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
q = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
second_bb = split_block (entry_bb, stmt)->dest;
gsi = gsi_last_bb (second_bb);
gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
gsi_insert_before (&gsi, gimple_build_assign (tt, build_int_cst (itype, 0)),
GSI_SAME_STMT);
stmt = gimple_build_assign_with_ops (PLUS_EXPR, q, q,
build_int_cst (itype, 1));
gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
third_bb = split_block (second_bb, stmt)->dest;
gsi = gsi_last_bb (third_bb);
gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_FOR);
t = build2 (MULT_EXPR, itype, q, threadid); t = build2 (MULT_EXPR, itype, q, threadid);
t = build2 (PLUS_EXPR, itype, t, tt);
s0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); s0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
t = fold_build2 (PLUS_EXPR, itype, s0, q); t = fold_build2 (PLUS_EXPR, itype, s0, q);
t = fold_build2 (MIN_EXPR, itype, t, n);
e0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT); e0 = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, true, GSI_SAME_STMT);
t = build2 (GE_EXPR, boolean_type_node, s0, e0); t = build2 (GE_EXPR, boolean_type_node, s0, e0);
...@@ -4263,13 +4290,20 @@ expand_omp_for_static_nochunk (struct omp_region *region, ...@@ -4263,13 +4290,20 @@ expand_omp_for_static_nochunk (struct omp_region *region,
gsi_remove (&gsi, true); gsi_remove (&gsi, true);
/* Connect all the blocks. */ /* Connect all the blocks. */
find_edge (entry_bb, seq_start_bb)->flags = EDGE_FALSE_VALUE; ep = make_edge (entry_bb, third_bb, EDGE_FALSE_VALUE);
find_edge (entry_bb, fin_bb)->flags = EDGE_TRUE_VALUE; ep->probability = REG_BR_PROB_BASE / 4 * 3;
ep = find_edge (entry_bb, second_bb);
ep->flags = EDGE_TRUE_VALUE;
ep->probability = REG_BR_PROB_BASE / 4;
find_edge (third_bb, seq_start_bb)->flags = EDGE_FALSE_VALUE;
find_edge (third_bb, fin_bb)->flags = EDGE_TRUE_VALUE;
find_edge (cont_bb, body_bb)->flags = EDGE_TRUE_VALUE; find_edge (cont_bb, body_bb)->flags = EDGE_TRUE_VALUE;
find_edge (cont_bb, fin_bb)->flags = EDGE_FALSE_VALUE; find_edge (cont_bb, fin_bb)->flags = EDGE_FALSE_VALUE;
set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, entry_bb); set_immediate_dominator (CDI_DOMINATORS, second_bb, entry_bb);
set_immediate_dominator (CDI_DOMINATORS, third_bb, entry_bb);
set_immediate_dominator (CDI_DOMINATORS, seq_start_bb, third_bb);
set_immediate_dominator (CDI_DOMINATORS, body_bb, set_immediate_dominator (CDI_DOMINATORS, body_bb,
recompute_dominator (CDI_DOMINATORS, body_bb)); recompute_dominator (CDI_DOMINATORS, body_bb));
set_immediate_dominator (CDI_DOMINATORS, fin_bb, set_immediate_dominator (CDI_DOMINATORS, fin_bb,
......
2011-06-22 Jakub Jelinek <jakub@redhat.com>
PR libgomp/49490
* iter.c (gomp_iter_static_next): For chunk size 0
only use n ceil/ nthreads size for the first
n % nthreads threads in the team instead of
all threads except for the last few ones which
get less work or none at all.
* iter_ull.c (gomp_iter_ull_static_next): Likewise.
* env.c (parse_schedule): If OMP_SCHEDULE doesn't have
chunk argument, set run_sched_modifier to 0 for static
resp. 1 for other kinds. If chunk argument is 0
and not static, set value to 1.
2011-05-19 Jakub Jelinek <jakub@redhat.com> 2011-05-19 Jakub Jelinek <jakub@redhat.com>
PR c++/49043 PR c++/49043
......
/* Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010 /* Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
Free Software Foundation, Inc. Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>. Contributed by Richard Henderson <rth@redhat.com>.
...@@ -108,7 +108,11 @@ parse_schedule (void) ...@@ -108,7 +108,11 @@ parse_schedule (void)
while (isspace ((unsigned char) *env)) while (isspace ((unsigned char) *env))
++env; ++env;
if (*env == '\0') if (*env == '\0')
return; {
gomp_global_icv.run_sched_modifier
= gomp_global_icv.run_sched_var != GFS_STATIC;
return;
}
if (*env++ != ',') if (*env++ != ',')
goto unknown; goto unknown;
while (isspace ((unsigned char) *env)) while (isspace ((unsigned char) *env))
...@@ -129,6 +133,8 @@ parse_schedule (void) ...@@ -129,6 +133,8 @@ parse_schedule (void)
if ((int)value != value) if ((int)value != value)
goto invalid; goto invalid;
if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC)
value = 1;
gomp_global_icv.run_sched_modifier = value; gomp_global_icv.run_sched_modifier = value;
return; return;
......
/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc. /* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>. Contributed by Richard Henderson <rth@redhat.com>.
This file is part of the GNU OpenMP Library (libgomp). This file is part of the GNU OpenMP Library (libgomp).
...@@ -59,7 +59,7 @@ gomp_iter_static_next (long *pstart, long *pend) ...@@ -59,7 +59,7 @@ gomp_iter_static_next (long *pstart, long *pend)
trip through the outer loop. */ trip through the outer loop. */
if (ws->chunk_size == 0) if (ws->chunk_size == 0)
{ {
unsigned long n, q, i; unsigned long n, q, i, t;
unsigned long s0, e0; unsigned long s0, e0;
long s, e; long s, e;
...@@ -74,11 +74,14 @@ gomp_iter_static_next (long *pstart, long *pend) ...@@ -74,11 +74,14 @@ gomp_iter_static_next (long *pstart, long *pend)
/* Compute the "zero-based" start and end points. That is, as /* Compute the "zero-based" start and end points. That is, as
if the loop began at zero and incremented by one. */ if the loop began at zero and incremented by one. */
q = n / nthreads; q = n / nthreads;
q += (q * nthreads != n); t = n % nthreads;
s0 = q * i; if (i < t)
{
t = 0;
q++;
}
s0 = q * i + t;
e0 = s0 + q; e0 = s0 + q;
if (e0 > n)
e0 = n;
/* Notice when no iterations allocated for this thread. */ /* Notice when no iterations allocated for this thread. */
if (s0 >= e0) if (s0 >= e0)
......
/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc. /* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>. Contributed by Richard Henderson <rth@redhat.com>.
This file is part of the GNU OpenMP Library (libgomp). This file is part of the GNU OpenMP Library (libgomp).
...@@ -60,7 +60,7 @@ gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend) ...@@ -60,7 +60,7 @@ gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
trip through the outer loop. */ trip through the outer loop. */
if (ws->chunk_size_ull == 0) if (ws->chunk_size_ull == 0)
{ {
gomp_ull n, q, i, s0, e0, s, e; gomp_ull n, q, i, t, s0, e0, s, e;
if (thr->ts.static_trip > 0) if (thr->ts.static_trip > 0)
return 1; return 1;
...@@ -75,11 +75,14 @@ gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend) ...@@ -75,11 +75,14 @@ gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
/* Compute the "zero-based" start and end points. That is, as /* Compute the "zero-based" start and end points. That is, as
if the loop began at zero and incremented by one. */ if the loop began at zero and incremented by one. */
q = n / nthreads; q = n / nthreads;
q += (q * nthreads != n); t = n % nthreads;
s0 = q * i; if (i < t)
{
t = 0;
q++;
}
s0 = q * i + t;
e0 = s0 + q; e0 = s0 + q;
if (e0 > n)
e0 = n;
/* Notice when no iterations allocated for this thread. */ /* Notice when no iterations allocated for this thread. */
if (s0 >= e0) if (s0 >= e0)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment