Commit 419c5f99 by Richard Biener Committed by Richard Biener

tree-vect-stmts.c (get_group_load_store_type): Avoid peeling for gaps by loading only lower halves of vectors if possible.

tree-vect-stmts.c (get_group_load_store_type): Avoid peeling for gaps by loading only lower halves of vectors if possible.

2019-05-03  Richard Biener  <rguenther@suse.de>

	* tree-vect-stmts.c (get_group_load_store_type): Avoid
	peeling for gaps by loading only lower halves of vectors
	if possible.
	(vectorizable_load): Likewise.

	* gcc.dg/vect/slp-reduc-sad-2.c: New testcase.

From-SVN: r270847
parent 839d0860
2019-05-03 Richard Biener <rguenther@suse.de>
* tree-vect-stmts.c (get_group_load_store_type): Avoid
peeling for gaps by loading only lower halves of vectors
if possible.
(vectorizable_load): Likewise.
2019-05-03 Richard Biener <rguenther@suse.de>
PR middle-end/89518
* match.pd: Add pattern to optimize (A / B) * B + (A % B) to A.
......
2019-05-03 Richard Biener <rguenther@suse.de>
* gcc.dg/vect/slp-reduc-sad-2.c: New testcase.
2019-05-03 Richard Biener <rguenther@suse.de>
PR middle-end/89518
* gcc.dg/pr89518.c: New testcase.
......
/* { dg-do compile } */
/* { dg-require-effective-target vect_usad_char } */
/* With AVX256 or more we do not pull off the trick eliding the epilogue. */
/* { dg-additional-options "-mprefer-avx128" { target { x86_64-*-* i?86-*-* } } } */
typedef unsigned char uint8_t;
/* Sum of absolute differences (SAD) over an 8x8 block of bytes: pix1
   advances by a hard-coded row stride of 16 bytes, pix2 by the
   caller-supplied i_stride_pix2.  The eight explicitly unrolled
   per-column statements are deliberate: they let the vectorizer's SAD
   pattern recognition and SLP fire, which the dg-final scans below
   check for (together with the absence of a scalar epilogue loop).  */
int x264_pixel_sad_8x8( uint8_t *pix1, uint8_t *pix2, int i_stride_pix2 )
{
int i_sum = 0;
for( int y = 0; y < 8; y++ )
{
/* Accumulate |pix1[i] - pix2[i]| for columns 0..7 of this row.  */
i_sum += __builtin_abs( pix1[0] - pix2[0] );
i_sum += __builtin_abs( pix1[1] - pix2[1] );
i_sum += __builtin_abs( pix1[2] - pix2[2] );
i_sum += __builtin_abs( pix1[3] - pix2[3] );
i_sum += __builtin_abs( pix1[4] - pix2[4] );
i_sum += __builtin_abs( pix1[5] - pix2[5] );
i_sum += __builtin_abs( pix1[6] - pix2[6] );
i_sum += __builtin_abs( pix1[7] - pix2[7] );
pix1 += 16; /* fixed 16-byte row stride: only 8 of 16 bytes are read,
               i.e. a group access with a gap of 8.  */
pix2 += i_stride_pix2;
}
return i_sum;
}
/* { dg-final { scan-tree-dump "vect_recog_sad_pattern: detected" "vect" } } */
/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
/* { dg-final { scan-tree-dump-not "access with gaps requires scalar epilogue loop" "vect" } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
......@@ -2258,6 +2258,29 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp,
&& gap < (vect_known_alignment_in_bytes (first_dr_info)
/ vect_get_scalar_dr_size (first_dr_info)))
overrun_p = false;
/* If the gap splits the vector in half and the target
can do half-vector operations avoid the epilogue peeling
by simply loading half of the vector only. Usually
the construction with an upper zero half will be elided. */
dr_alignment_support alignment_support_scheme;
scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
machine_mode vmode;
if (overrun_p
&& !masked_p
&& (((alignment_support_scheme
= vect_supportable_dr_alignment (first_dr_info, false)))
== dr_aligned
|| alignment_support_scheme == dr_unaligned_supported)
&& known_eq (nunits, (group_size - gap) * 2)
&& mode_for_vector (elmode, (group_size - gap)).exists (&vmode)
&& VECTOR_MODE_P (vmode)
&& targetm.vector_mode_supported_p (vmode)
&& (convert_optab_handler (vec_init_optab,
TYPE_MODE (vectype), vmode)
!= CODE_FOR_nothing))
overrun_p = false;
if (overrun_p && !can_overrun_p)
{
if (dump_enabled_p ())
......@@ -8516,8 +8539,24 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
}
else
{
tree ltype = vectype;
/* If there's no peeling for gaps but we have a gap
with slp loads then load the lower half of the
vector only. See get_group_load_store_type for
when we apply this optimization. */
if (slp
&& loop_vinfo
&& !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
&& DR_GROUP_GAP (first_stmt_info) != 0
&& known_eq (nunits,
(group_size
- DR_GROUP_GAP (first_stmt_info)) * 2))
ltype = build_vector_type (TREE_TYPE (vectype),
(group_size
- DR_GROUP_GAP
(first_stmt_info)));
data_ref
= fold_build2 (MEM_REF, ltype, dataref_ptr,
dataref_offset
? dataref_offset
: build_int_cst (ref_type, 0));
......@@ -8531,6 +8570,23 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
TREE_TYPE (data_ref)
= build_aligned_type (TREE_TYPE (data_ref),
TYPE_ALIGN (elem_type));
if (ltype != vectype)
{
vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
tree tem = make_ssa_name (ltype);
new_stmt = gimple_build_assign (tem, data_ref);
vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
data_ref = NULL;
vec<constructor_elt, va_gc> *v;
vec_alloc (v, 2);
CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
build_zero_cst (ltype));
new_stmt
= gimple_build_assign (vec_dest,
build_constructor
(vectype, v));
}
}
break;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment