Commit 6d8fd122 by Jan Hubicka Committed by Jan Hubicka

Fix overflows in -fprofile-reorder-functions

This patch fixes three issues with -fprofile-reorder-functions:
1) First is that tp_first_run is stored as 32bit integer while it can easily
   overflow (and does so during Firefox profiling).
2) Second problem is that flag_profile_functions can
   not be tested w/o function context.
   The change to expand_all_functions makes it work on mixed units by
   first outputting all functions w/o -fprofile-reorder-functions (or with no
   profile info) and then outputting in first_run order
3) LTO partitioner was mixing up order by tp_first_run and by order.
   For no_reorder we definitely want to order via the first, while for everything
   else we want to order by the second.

I have also merged the duplicated comparators, since they are a bit fragile, into
tp_first_run_node_cmp.

I originally started to look into this because of undefined symbols with
Firefox PGO builds.  These symbols went away after fixing these bugs but I am not
quite sure how. It is possible that there is another problem in lto_balanced_map
but even after reading the no_reorder code a few times carefully I did not find it.
Another explanation would be that our new qsort with a comparator broken due to
overflow can actually remove some entries in the array, but that sounds a bit
crazy.

Bootstrapped/regtested x86_64-linux.

	* cgraph.c (cgraph_node::dump): Make tp_first_run 64bit.
	* cgraph.h (cgraph_node): Likewise.
	(tp_first_run_node_cmp): Declare.
	* cgraphunit.c (node_cmp): Rename to ...
	(tp_first_run_node_cmp): ... this; export; watch for 64bit overflows;
	clear tp_first_run for no_reorder and !flag_profile_reorder_functions.
	(expand_all_functions): Collect tp_first_run and normal functions to
	two vectors so the other functions remain sorted. Do not check for
	flag_profile_reorder_functions as it is a function-local flag.
	* profile.c (compute_value_histograms): Update tp_first_run printing.

	* lto-partition.c (node_cmp): Turn into simple order comparisons.
	(varpool_node_cmp): Remove.
	(add_sorted_nodes): Use node_cmp.
	(lto_balanced_map): Use tp_first_run_node_cmp.

From-SVN: r279093
parent a8d9d664
2019-12-07 Jan Hubicka <hubicka@ucw.cz> 2019-12-07 Jan Hubicka <hubicka@ucw.cz>
* cgraph.c (cgraph_node::dump): Make tp_first_run 64bit.
* cgraph.h (cgraph_node): Likewise.
(tp_first_run_node_cmp): Declare.
* cgraphunit.c (node_cmp): Rename to ...
(tp_first_run_node_cmp): ... this; export; watch for 64bit overflows;
clear tp_first_run for no_reorder and !flag_profile_reorder_functions.
(expand_all_functions): Collect tp_first_run and normal functions to
two vectors so the other functions remain sorted. Do not check for
flag_profile_reorder_functions as it is a function-local flag.
* profile.c (compute_value_histograms): Update tp_first_run printing.
2019-12-07 Jan Hubicka <hubicka@ucw.cz>
* opts.c (common_handle_option): Do not clear ipa_reference for * opts.c (common_handle_option): Do not clear ipa_reference for
-fprofile-use. -fprofile-use.
...@@ -1954,7 +1954,7 @@ cgraph_node::dump (FILE *f) ...@@ -1954,7 +1954,7 @@ cgraph_node::dump (FILE *f)
count.dump (f); count.dump (f);
} }
if (tp_first_run > 0) if (tp_first_run > 0)
fprintf (f, " first_run:%i", tp_first_run); fprintf (f, " first_run:%" PRId64, (int64_t) tp_first_run);
if (origin) if (origin)
fprintf (f, " nested in:%s", origin->asm_name ()); fprintf (f, " nested in:%s", origin->asm_name ());
if (gimple_has_body_p (decl)) if (gimple_has_body_p (decl))
......
...@@ -1430,6 +1430,8 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cgraph_node : public symtab_node ...@@ -1430,6 +1430,8 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cgraph_node : public symtab_node
/* Expected number of executions: calculated in profile.c. */ /* Expected number of executions: calculated in profile.c. */
profile_count count; profile_count count;
/* Time profiler: first run of function. */
gcov_type tp_first_run;
/* How to scale counts at materialization time; used to merge /* How to scale counts at materialization time; used to merge
LTO units with different number of profile runs. */ LTO units with different number of profile runs. */
int count_materialization_scale; int count_materialization_scale;
...@@ -1437,8 +1439,6 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cgraph_node : public symtab_node ...@@ -1437,8 +1439,6 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cgraph_node : public symtab_node
unsigned int profile_id; unsigned int profile_id;
/* ID of the translation unit. */ /* ID of the translation unit. */
int unit_id; int unit_id;
/* Time profiler: first run of function. */
int tp_first_run;
/* Set when decl is an abstract function pointed to by the /* Set when decl is an abstract function pointed to by the
ABSTRACT_DECL_ORIGIN of a reachable function. */ ABSTRACT_DECL_ORIGIN of a reachable function. */
...@@ -2463,6 +2463,7 @@ cgraph_inline_failed_type_t cgraph_inline_failed_type (cgraph_inline_failed_t); ...@@ -2463,6 +2463,7 @@ cgraph_inline_failed_type_t cgraph_inline_failed_type (cgraph_inline_failed_t);
/* In cgraphunit.c */ /* In cgraphunit.c */
void cgraphunit_c_finalize (void); void cgraphunit_c_finalize (void);
int tp_first_run_node_cmp (const void *pa, const void *pb);
/* Initialize datastructures so DECL is a function in lowered gimple form. /* Initialize datastructures so DECL is a function in lowered gimple form.
IN_SSA is true if the gimple is in SSA. */ IN_SSA is true if the gimple is in SSA. */
......
...@@ -2359,19 +2359,33 @@ cgraph_node::expand (void) ...@@ -2359,19 +2359,33 @@ cgraph_node::expand (void)
/* Node comparator that is responsible for the order that corresponds /* Node comparator that is responsible for the order that corresponds
to time when a function was launched for the first time. */ to time when a function was launched for the first time. */
static int int
node_cmp (const void *pa, const void *pb) tp_first_run_node_cmp (const void *pa, const void *pb)
{ {
const cgraph_node *a = *(const cgraph_node * const *) pa; const cgraph_node *a = *(const cgraph_node * const *) pa;
const cgraph_node *b = *(const cgraph_node * const *) pb; const cgraph_node *b = *(const cgraph_node * const *) pb;
gcov_type tp_first_run_a = a->tp_first_run;
gcov_type tp_first_run_b = b->tp_first_run;
if (!opt_for_fn (a->decl, flag_profile_reorder_functions)
|| a->no_reorder)
tp_first_run_a = 0;
if (!opt_for_fn (b->decl, flag_profile_reorder_functions)
|| b->no_reorder)
tp_first_run_b = 0;
if (tp_first_run_a == tp_first_run_b)
return a->order - b->order;
/* Functions with time profile must be before these without profile. */ /* Functions with time profile must be before these without profile. */
if (!a->tp_first_run || !b->tp_first_run) if (!tp_first_run_a || !tp_first_run_b)
return a->tp_first_run - b->tp_first_run; return tp_first_run_b ? 1 : -1;
return a->tp_first_run != b->tp_first_run /* Watch for overlflow - tp_first_run is 64bit. */
? b->tp_first_run - a->tp_first_run if (tp_first_run_a > tp_first_run_b)
: b->order - a->order; return 1;
else
return -1;
} }
/* Expand all functions that must be output. /* Expand all functions that must be output.
...@@ -2390,8 +2404,10 @@ expand_all_functions (void) ...@@ -2390,8 +2404,10 @@ expand_all_functions (void)
cgraph_node *node; cgraph_node *node;
cgraph_node **order = XCNEWVEC (cgraph_node *, cgraph_node **order = XCNEWVEC (cgraph_node *,
symtab->cgraph_count); symtab->cgraph_count);
cgraph_node **tp_first_run_order = XCNEWVEC (cgraph_node *,
symtab->cgraph_count);
unsigned int expanded_func_count = 0, profiled_func_count = 0; unsigned int expanded_func_count = 0, profiled_func_count = 0;
int order_pos, new_order_pos = 0; int order_pos, tp_first_run_order_pos = 0, new_order_pos = 0;
int i; int i;
order_pos = ipa_reverse_postorder (order); order_pos = ipa_reverse_postorder (order);
...@@ -2401,11 +2417,17 @@ expand_all_functions (void) ...@@ -2401,11 +2417,17 @@ expand_all_functions (void)
optimization. So we must be sure to not reference them. */ optimization. So we must be sure to not reference them. */
for (i = 0; i < order_pos; i++) for (i = 0; i < order_pos; i++)
if (order[i]->process) if (order[i]->process)
order[new_order_pos++] = order[i]; {
if (order[i]->tp_first_run
if (flag_profile_reorder_functions) && opt_for_fn (order[i]->decl, flag_profile_reorder_functions))
qsort (order, new_order_pos, sizeof (cgraph_node *), node_cmp); tp_first_run_order[tp_first_run_order_pos++] = order[i];
else
order[new_order_pos++] = order[i];
}
/* Output functions in RPO so callers get optimized before callees. This
makes ipa-ra and other propagators to work.
FIXME: This is far from optimal code layout. */
for (i = new_order_pos - 1; i >= 0; i--) for (i = new_order_pos - 1; i >= 0; i--)
{ {
node = order[i]; node = order[i];
...@@ -2413,13 +2435,25 @@ expand_all_functions (void) ...@@ -2413,13 +2435,25 @@ expand_all_functions (void)
if (node->process) if (node->process)
{ {
expanded_func_count++; expanded_func_count++;
if(node->tp_first_run) node->process = 0;
profiled_func_count++; node->expand ();
}
}
qsort (tp_first_run_order, tp_first_run_order_pos,
sizeof (cgraph_node *), tp_first_run_node_cmp);
for (i = 0; i < tp_first_run_order_pos; i++)
{
node = tp_first_run_order[i];
if (node->process)
{
expanded_func_count++;
profiled_func_count++;
if (symtab->dump_file) if (symtab->dump_file)
fprintf (symtab->dump_file, fprintf (symtab->dump_file,
"Time profile order in expand_all_functions:%s:%d\n", "Time profile order in expand_all_functions:%s:%" PRId64
node->asm_name (), node->tp_first_run); "\n", node->asm_name (), (int64_t) node->tp_first_run);
node->process = 0; node->process = 0;
node->expand (); node->expand ();
} }
...@@ -2429,7 +2463,7 @@ expand_all_functions (void) ...@@ -2429,7 +2463,7 @@ expand_all_functions (void)
fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n", fprintf (dump_file, "Expanded functions with time profile (%s):%u/%u\n",
main_input_filename, profiled_func_count, expanded_func_count); main_input_filename, profiled_func_count, expanded_func_count);
if (symtab->dump_file && flag_profile_reorder_functions) if (symtab->dump_file && tp_first_run_order_pos)
fprintf (symtab->dump_file, "Expanded functions with time profile:%u/%u\n", fprintf (symtab->dump_file, "Expanded functions with time profile:%u/%u\n",
profiled_func_count, expanded_func_count); profiled_func_count, expanded_func_count);
......
2019-11-25 Joseph Myers <joseph@codesourcery.com> 2019-12-07 Jan Hubicka <hubicka@ucw.cz>
* lto-partition.c (node_cmp): Turn into simple order comparisons.
(varpool_node_cmp): Remove.
(add_sorted_nodes): Use node_cmp.
(lto_balanced_map): Use tp_first_run_node_cmp.
/bin/bash: :q: command not found
PR c/91985 PR c/91985
* lto-lang.c (lto_type_for_mode): Handle decimal floating-point * lto-lang.c (lto_type_for_mode): Handle decimal floating-point
types being NULL_TREE. types being NULL_TREE.
......
...@@ -372,38 +372,9 @@ lto_max_map (void) ...@@ -372,38 +372,9 @@ lto_max_map (void)
new_partition ("empty"); new_partition ("empty");
} }
/* Helper function for qsort; sort nodes by order. noreorder functions must have
been removed earlier. */
static int
node_cmp (const void *pa, const void *pb)
{
const struct cgraph_node *a = *(const struct cgraph_node * const *) pa;
const struct cgraph_node *b = *(const struct cgraph_node * const *) pb;
/* Profile reorder flag enables function reordering based on first execution
of a function. All functions with profile are placed in ascending
order at the beginning. */
if (flag_profile_reorder_functions)
{
/* Functions with time profile are sorted in ascending order. */
if (a->tp_first_run && b->tp_first_run)
return a->tp_first_run != b->tp_first_run
? a->tp_first_run - b->tp_first_run
: a->order - b->order;
/* Functions with time profile are sorted before the functions
that do not have the profile. */
if (a->tp_first_run || b->tp_first_run)
return b->tp_first_run - a->tp_first_run;
}
return b->order - a->order;
}
/* Helper function for qsort; sort nodes by order. */ /* Helper function for qsort; sort nodes by order. */
static int static int
varpool_node_cmp (const void *pa, const void *pb) node_cmp (const void *pa, const void *pb)
{ {
const symtab_node *a = *static_cast<const symtab_node * const *> (pa); const symtab_node *a = *static_cast<const symtab_node * const *> (pa);
const symtab_node *b = *static_cast<const symtab_node * const *> (pb); const symtab_node *b = *static_cast<const symtab_node * const *> (pb);
...@@ -418,7 +389,7 @@ add_sorted_nodes (vec<symtab_node *> &next_nodes, ltrans_partition partition) ...@@ -418,7 +389,7 @@ add_sorted_nodes (vec<symtab_node *> &next_nodes, ltrans_partition partition)
unsigned i; unsigned i;
symtab_node *node; symtab_node *node;
next_nodes.qsort (varpool_node_cmp); next_nodes.qsort (node_cmp);
FOR_EACH_VEC_ELT (next_nodes, i, node) FOR_EACH_VEC_ELT (next_nodes, i, node)
if (!symbol_partitioned_p (node)) if (!symbol_partitioned_p (node))
add_symbol_to_partition (partition, node); add_symbol_to_partition (partition, node);
...@@ -537,17 +508,17 @@ lto_balanced_map (int n_lto_partitions, int max_partition_size) ...@@ -537,17 +508,17 @@ lto_balanced_map (int n_lto_partitions, int max_partition_size)
unit tends to import a lot of global trees defined there. We should unit tends to import a lot of global trees defined there. We should
get better about minimizing the function bounday, but until that get better about minimizing the function bounday, but until that
things works smoother if we order in source order. */ things works smoother if we order in source order. */
order.qsort (node_cmp); order.qsort (tp_first_run_node_cmp);
noreorder.qsort (node_cmp); noreorder.qsort (node_cmp);
if (dump_file) if (dump_file)
{ {
for (unsigned i = 0; i < order.length (); i++) for (unsigned i = 0; i < order.length (); i++)
fprintf (dump_file, "Balanced map symbol order:%s:%u\n", fprintf (dump_file, "Balanced map symbol order:%s:%" PRId64 "\n",
order[i]->name (), order[i]->tp_first_run); order[i]->name (), (int64_t) order[i]->tp_first_run);
for (unsigned i = 0; i < noreorder.length (); i++) for (unsigned i = 0; i < noreorder.length (); i++)
fprintf (dump_file, "Balanced map symbol no_reorder:%s:%u\n", fprintf (dump_file, "Balanced map symbol no_reorder:%s:%" PRId64 "\n",
noreorder[i]->name (), noreorder[i]->tp_first_run); noreorder[i]->name (), (int64_t) noreorder[i]->tp_first_run);
} }
/* Collect all variables that should not be reordered. */ /* Collect all variables that should not be reordered. */
...@@ -556,7 +527,7 @@ lto_balanced_map (int n_lto_partitions, int max_partition_size) ...@@ -556,7 +527,7 @@ lto_balanced_map (int n_lto_partitions, int max_partition_size)
&& vnode->no_reorder) && vnode->no_reorder)
varpool_order.safe_push (vnode); varpool_order.safe_push (vnode);
n_varpool_nodes = varpool_order.length (); n_varpool_nodes = varpool_order.length ();
varpool_order.qsort (varpool_node_cmp); varpool_order.qsort (node_cmp);
/* Compute partition size and create the first partition. */ /* Compute partition size and create the first partition. */
if (param_min_partition_size > max_partition_size) if (param_min_partition_size > max_partition_size)
......
...@@ -874,7 +874,8 @@ compute_value_histograms (histogram_values values, unsigned cfg_checksum, ...@@ -874,7 +874,8 @@ compute_value_histograms (histogram_values values, unsigned cfg_checksum,
node->tp_first_run = hist->hvalue.counters[0]; node->tp_first_run = hist->hvalue.counters[0];
if (dump_file) if (dump_file)
fprintf (dump_file, "Read tp_first_run: %d\n", node->tp_first_run); fprintf (dump_file, "Read tp_first_run: %" PRId64 "\n",
(int64_t) node->tp_first_run);
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment