Commit 4aa157e8 by Richard Sandiford Committed by Richard Sandiford

Allow single-element interleaving for non-power-of-2 strides

This allows LD3 to be used for isolated a[i * 3] accesses, in a similar
way to the current a[i * 2] and a[i * 4] for LD2 and LD4 respectively.
Given the problems with the cost model underestimating the cost of
elementwise accesses, the patch continues to reject the VMAT_ELEMENTWISE
cases that are currently rejected.

2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* tree-vect-data-refs.c (vect_analyze_group_access_1): Allow
	single-element interleaving even if the size is not a power of 2.
	* tree-vect-stmts.c (get_load_store_type): Disallow elementwise
	accesses for single-element interleaving if the group size is
	not a power of 2.

gcc/testsuite/
	* gcc.target/aarch64/sve/struct_vect_18.c: New test.
	* gcc.target/aarch64/sve/struct_vect_18_run.c: Likewise.
	* gcc.target/aarch64/sve/struct_vect_19.c: Likewise.
	* gcc.target/aarch64/sve/struct_vect_19_run.c: Likewise.

Co-Authored-By: Alan Hayward <alan.hayward@arm.com>
Co-Authored-By: David Sherwood <david.sherwood@arm.com>

From-SVN: r256634
parent bb6c2b68
...@@ -2,6 +2,16 @@ ...@@ -2,6 +2,16 @@
Alan Hayward <alan.hayward@arm.com> Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com> David Sherwood <david.sherwood@arm.com>
* tree-vect-data-refs.c (vect_analyze_group_access_1): Allow
single-element interleaving even if the size is not a power of 2.
* tree-vect-stmts.c (get_load_store_type): Disallow elementwise
accesses for single-element interleaving if the group size is
not a power of 2.
2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
* doc/md.texi (fold_extract_last_@var{m}): Document. * doc/md.texi (fold_extract_last_@var{m}): Document.
* doc/sourcebuild.texi (vect_fold_extract_last): Likewise. * doc/sourcebuild.texi (vect_fold_extract_last): Likewise.
* optabs.def (fold_extract_last_optab): New optab. * optabs.def (fold_extract_last_optab): New optab.
......
...@@ -2,6 +2,15 @@ ...@@ -2,6 +2,15 @@
Alan Hayward <alan.hayward@arm.com> Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com> David Sherwood <david.sherwood@arm.com>
* gcc.target/aarch64/sve/struct_vect_18.c: New test.
* gcc.target/aarch64/sve/struct_vect_18_run.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_19.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_19_run.c: Likewise.
2018-01-13 Richard Sandiford <richard.sandiford@linaro.org>
Alan Hayward <alan.hayward@arm.com>
David Sherwood <david.sherwood@arm.com>
* lib/target-supports.exp * lib/target-supports.exp
(check_effective_target_vect_fold_extract_last): New proc. (check_effective_target_vect_fold_extract_last): New proc.
* gcc.dg/vect/pr65947-1.c: Update dump messages. Add markup * gcc.dg/vect/pr65947-1.c: Update dump messages. Add markup
......
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
#define N 2000
/* Define a function NAME that, for each I in [0, N), adds src[i * 3] to
   dest[i].  Only one element out of every three in SRC is read, so the
   load has a stride of 3 elements.  The dg-final directives in this test
   count the loads and stores emitted for each instantiation, so the loop
   body must stay exactly as written.  */
#define TEST_LOOP(NAME, TYPE) \
void __attribute__ ((noinline, noclone)) \
NAME (TYPE *restrict dest, TYPE *restrict src) \
{ \
for (int i = 0; i < N; ++i) \
dest[i] += src[i * 3]; \
}
/* Expand TEST_LOOP once per element type under test: signed char,
   unsigned short, float and double.  Each expansion receives NAME with a
   suffix naming the type (_i8, _i16, _f32, _f64).  */
#define TEST(NAME) \
TEST_LOOP (NAME##_i8, signed char) \
TEST_LOOP (NAME##_i16, unsigned short) \
TEST_LOOP (NAME##_f32, float) \
TEST_LOOP (NAME##_f64, double)
/* Instantiate the four variants with the prefix "test".  */
TEST (test)
/* Check the vectorized loop. */
/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld3b\t} 1 } } */
/* { dg-final { scan-assembler-times {\tst1b\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld3h\t} 1 } } */
/* { dg-final { scan-assembler-times {\tst1h\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld3w\t} 1 } } */
/* { dg-final { scan-assembler-times {\tst1w\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld3d\t} 1 } } */
/* { dg-final { scan-assembler-times {\tst1d\t} 1 } } */
/* Check the scalar tail. */
/* { dg-final { scan-assembler-times {\tldrb\tw} 2 } } */
/* { dg-final { scan-assembler-times {\tstrb\tw} 1 } } */
/* { dg-final { scan-assembler-times {\tldrh\tw} 2 } } */
/* { dg-final { scan-assembler-times {\tstrh\tw} 1 } } */
/* { dg-final { scan-assembler-times {\tldr\ts} 2 } } */
/* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */
/* { dg-final { scan-assembler-times {\tldr\td} 2 } } */
/* { dg-final { scan-assembler-times {\tstr\td} 1 } } */
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include "struct_vect_18.c"
#undef TEST_LOOP
/* Redefine TEST_LOOP as a block of checking code for the function NAME
   operating on TYPE.  The block fills OUT with i * 7 / 2 and IN with
   i * 9 / 2, calls NAME (out, in), and aborts if any out[i] differs from
   its initial value plus in[i * 3].  The empty asm statements with a
   "memory" clobber act as compiler barriers in each loop iteration;
   presumably they keep the setup and checking loops from being optimized
   in the same way as the code under test (TODO confirm).  */
#define TEST_LOOP(NAME, TYPE) \
{ \
TYPE out[N]; \
TYPE in[N * 3]; \
for (int i = 0; i < N; ++i) \
{ \
out[i] = i * 7 / 2; \
asm volatile ("" ::: "memory"); \
} \
for (int i = 0; i < N * 3; ++i) \
{ \
in[i] = i * 9 / 2; \
asm volatile ("" ::: "memory"); \
} \
NAME (out, in); \
for (int i = 0; i < N; ++i) \
{ \
TYPE expected = i * 7 / 2 + in[i * 3]; \
if (out[i] != expected) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
}
/* Run the check for every element type.  TEST (test) expands to one
   TEST_LOOP checking block per type; a mismatch calls __builtin_abort,
   so reaching the return statement means every check passed.  The
   optimize (1) attribute selects a lower optimization level for main
   than the -O2 given in dg-options; presumably this keeps the harness
   itself from being vectorized (TODO confirm).  */
int __attribute__ ((optimize (1)))
main (void)
{
TEST (test);
return 0;
}
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize" } */
/* Define a function NAME that, for each I in [0, n), adds src[i * 3] to
   dest[i].  The iteration count N is a run-time argument here rather
   than a compile-time constant.  Only one element out of every three in
   SRC is read, so the load has a stride of 3 elements.  The dg-final
   directives in this test count the loads and stores emitted for each
   instantiation, so the loop body must stay exactly as written.  */
#define TEST_LOOP(NAME, TYPE) \
void __attribute__ ((noinline, noclone)) \
NAME (TYPE *restrict dest, TYPE *restrict src, int n) \
{ \
for (int i = 0; i < n; ++i) \
dest[i] += src[i * 3]; \
}
/* Expand TEST_LOOP once per element type under test: signed char,
   unsigned short, float and double.  Each expansion receives NAME with a
   suffix naming the type (_i8, _i16, _f32, _f64).  */
#define TEST(NAME) \
TEST_LOOP (NAME##_i8, signed char) \
TEST_LOOP (NAME##_i16, unsigned short) \
TEST_LOOP (NAME##_f32, float) \
TEST_LOOP (NAME##_f64, double)
/* Instantiate the four variants with the prefix "test".  */
TEST (test)
/* Check the vectorized loop. */
/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld3b\t} 1 } } */
/* { dg-final { scan-assembler-times {\tst1b\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld3h\t} 1 } } */
/* { dg-final { scan-assembler-times {\tst1h\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld3w\t} 1 } } */
/* { dg-final { scan-assembler-times {\tst1w\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld3d\t} 1 } } */
/* { dg-final { scan-assembler-times {\tst1d\t} 1 } } */
/* Check the scalar tail. */
/* { dg-final { scan-assembler-times {\tldrb\tw} 2 } } */
/* { dg-final { scan-assembler-times {\tstrb\tw} 1 } } */
/* { dg-final { scan-assembler-times {\tldrh\tw} 2 } } */
/* { dg-final { scan-assembler-times {\tstrh\tw} 1 } } */
/* { dg-final { scan-assembler-times {\tldr\ts} 2 } } */
/* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */
/* { dg-final { scan-assembler-times {\tldr\td} 2 } } */
/* { dg-final { scan-assembler-times {\tstr\td} 1 } } */
/* { dg-do run { target aarch64_sve_hw } } */
/* { dg-options "-O2 -ftree-vectorize" } */
#include "struct_vect_19.c"
#define N 1000
#undef TEST_LOOP
/* Redefine TEST_LOOP as a block of checking code for the function NAME
   operating on TYPE.  For each iteration count in COUNTS (0, 1 and
   N - 1) the block refills OUT with i * 7 / 2 and IN with i * 9 / 2,
   calls NAME (out, in, count), and then checks all N elements of OUT:
   elements below COUNT must equal the initial value plus in[i * 3], and
   the remaining elements must still hold their initial value.  Any
   mismatch aborts.  The empty asm statements with a "memory" clobber act
   as compiler barriers in each loop iteration; presumably they keep the
   setup and checking loops from being optimized in the same way as the
   code under test (TODO confirm).  */
#define TEST_LOOP(NAME, TYPE) \
{ \
TYPE out[N]; \
TYPE in[N * 3]; \
int counts[] = { 0, 1, N - 1 }; \
for (int j = 0; j < 3; ++j) \
{ \
int count = counts[j]; \
for (int i = 0; i < N; ++i) \
{ \
out[i] = i * 7 / 2; \
asm volatile ("" ::: "memory"); \
} \
for (int i = 0; i < N * 3; ++i) \
{ \
in[i] = i * 9 / 2; \
asm volatile ("" ::: "memory"); \
} \
NAME (out, in, count); \
for (int i = 0; i < N; ++i) \
{ \
TYPE expected = i * 7 / 2; \
if (i < count) \
expected += in[i * 3]; \
if (out[i] != expected) \
__builtin_abort (); \
asm volatile ("" ::: "memory"); \
} \
} \
}
/* Run the check for every element type.  TEST (test) expands to one
   TEST_LOOP checking block per type; a mismatch calls __builtin_abort,
   so reaching the return statement means every check passed.  The
   optimize (1) attribute selects a lower optimization level for main
   than the -O2 given in dg-options; presumably this keeps the harness
   itself from being vectorized (TODO confirm).  */
int __attribute__ ((optimize (1)))
main (void)
{
TEST (test);
return 0;
}
...@@ -2427,11 +2427,10 @@ vect_analyze_group_access_1 (struct data_reference *dr) ...@@ -2427,11 +2427,10 @@ vect_analyze_group_access_1 (struct data_reference *dr)
element of the group that is accessed in the loop. */ element of the group that is accessed in the loop. */
/* Gaps are supported only for loads. STEP must be a multiple of the type /* Gaps are supported only for loads. STEP must be a multiple of the type
size. The size of the group must be a power of 2. */ size. */
if (DR_IS_READ (dr) if (DR_IS_READ (dr)
&& (dr_step % type_size) == 0 && (dr_step % type_size) == 0
&& groupsize > 0 && groupsize > 0)
&& pow2p_hwi (groupsize))
{ {
GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt; GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize; GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
......
...@@ -2176,7 +2176,10 @@ get_load_store_type (gimple *stmt, tree vectype, bool slp, bool masked_p, ...@@ -2176,7 +2176,10 @@ get_load_store_type (gimple *stmt, tree vectype, bool slp, bool masked_p,
cost of using elementwise accesses. This check preserves the cost of using elementwise accesses. This check preserves the
traditional behavior until that can be fixed. */ traditional behavior until that can be fixed. */
if (*memory_access_type == VMAT_ELEMENTWISE if (*memory_access_type == VMAT_ELEMENTWISE
&& !STMT_VINFO_STRIDED_P (stmt_info)) && !STMT_VINFO_STRIDED_P (stmt_info)
&& !(stmt == GROUP_FIRST_ELEMENT (stmt_info)
&& !GROUP_NEXT_ELEMENT (stmt_info)
&& !pow2p_hwi (GROUP_SIZE (stmt_info))))
{ {
if (dump_enabled_p ()) if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment