Commit d3465d72 by Richard Sandiford

Avoid unnecessary peeling for gaps with LD3

vectorizable_load forces peeling for gaps if the vectorisation factor
is not a multiple of the group size, since in that case we'd normally load
beyond the original scalar accesses but drop the excess elements as part
of a following permute:

          if (loop_vinfo
              && ! STMT_VINFO_STRIDED_P (stmt_info)
              && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
                  || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))

This isn't necessary for LOAD_LANES though, since it loads only the
data needed and does the permute itself.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.

gcc/
	* tree-vect-stmts.c (vectorizable_load): Reorder checks so that
	load_lanes/grouped_load classification comes first.  Don't check
	whether the vectorization factor is a multiple of the group size
	for load_lanes.

gcc/testsuite/
	* gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.

From-SVN: r236632
parent 836dbb1a
2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
* tree-vect-stmts.c (vectorizable_load): Reorder checks so that
load_lanes/grouped_load classification comes first. Don't check
whether the vectorization factor is a multiple of the group size
for load_lanes.
2016-05-24 Richard Sandiford <richard.sandiford@arm.com>
	* tree-vect-data-refs.c (vect_analyze_group_access_1): Set
	GROUP_GAP for single-element interleaving.
	* tree-vect-stmts.c (vectorizable_load): Remove force_peeling
......
2016-05-24 Richard Sandiford <richard.sandiford@arm.com>
* gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.
2016-05-24  Richard Biener  <rguenther@suse.de>
	PR middle-end/70434
......
/* { dg-do compile } */
/* { dg-require-effective-target vect_int } */
/* { dg-require-effective-target vect_load_lanes } */
/* Sum each group of three consecutive elements of B into A.  The
   three-element interleaved access (b[3i], b[3i+1], b[3i+2]) is the
   LD3-style grouped load that load-lanes targets should vectorize
   without peeling for gaps or an epilogue loop.  */
void
f (int *__restrict a, int *__restrict b)
{
  for (int j = 0; j < 96; ++j)
    {
      const int *group = b + j * 3;
      a[j] = group[0] + group[1] + group[2];
    }
}
/* { dg-final { scan-tree-dump-not "Data access with gaps" "vect" } } */
/* { dg-final { scan-tree-dump-not "epilog loop required" "vect" } } */
@@ -6303,6 +6303,17 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
   first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
if (!slp
&& !PURE_SLP_STMT (stmt_info)
&& !STMT_VINFO_STRIDED_P (stmt_info))
{
if (vect_load_lanes_supported (vectype, group_size))
load_lanes_p = true;
else if (!vect_grouped_load_supported (vectype, group_size))
return false;
}
   /* If this is single-element interleaving with an element distance
      that leaves unused vector loads around punt - we at least create
@@ -6330,7 +6341,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   if (loop_vinfo
       && ! STMT_VINFO_STRIDED_P (stmt_info)
       && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
-	  || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
+	  || (!slp && !load_lanes_p && vf % group_size != 0)))
     {
       if (dump_enabled_p ())
	 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6350,8 +6361,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
     slp_perm = true;
group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
   /* ??? The following is overly pessimistic (as well as the loop
      case above) in the case we can statically determine the excess
      elements loaded are within the bounds of a decl that is accessed.
@@ -6364,16 +6373,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
	 return false;
     }
if (!slp
&& !PURE_SLP_STMT (stmt_info)
&& !STMT_VINFO_STRIDED_P (stmt_info))
{
if (vect_load_lanes_supported (vectype, group_size))
load_lanes_p = true;
else if (!vect_grouped_load_supported (vectype, group_size))
return false;
}
   /* Invalidate assumptions made by dependence analysis when vectorization
      on the unrolled body effectively re-orders stmts.  */
   if (!PURE_SLP_STMT (stmt_info)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment