Commit 3a0afad0 by Prathamesh Kulkarni Committed by Prathamesh Kulkarni

re PR target/88837 ([SVE] Poor vector construction code in VL-specific mode)

2019-06-03  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>

	PR target/88837
	* vector-builder.h (vector_builder::count_dups): New method.
	* config/aarch64/aarch64-protos.h (aarch64_expand_sve_vector_init):
	Declare prototype.
	* config/aarch64/aarch64-sve.md (aarch64_sve_rev64<mode>): Use @.
	(vec_init<mode><Vel>): New pattern.
	* config/aarch64/aarch64.c (emit_insr): New function.
	(aarch64_sve_expand_vector_init_handle_trailing_constants): Likewise.
	(aarch64_sve_expand_vector_init_insert_elems): Likewise.
	(aarch64_sve_expand_vector_init_handle_trailing_same_elem): Likewise.
	(aarch64_sve_expand_vector_init): Define two overloaded functions.

testsuite/
	* gcc.target/aarch64/sve/init_1.c: New test.
	* gcc.target/aarch64/sve/init_1_run.c: Likewise.
	* gcc.target/aarch64/sve/init_2.c: Likewise.
	* gcc.target/aarch64/sve/init_2_run.c: Likewise.
	* gcc.target/aarch64/sve/init_3.c: Likewise.
	* gcc.target/aarch64/sve/init_3_run.c: Likewise.
	* gcc.target/aarch64/sve/init_4.c: Likewise.
	* gcc.target/aarch64/sve/init_4_run.c: Likewise.
	* gcc.target/aarch64/sve/init_5.c: Likewise.
	* gcc.target/aarch64/sve/init_5_run.c: Likewise.
	* gcc.target/aarch64/sve/init_6.c: Likewise.
	* gcc.target/aarch64/sve/init_6_run.c: Likewise.
	* gcc.target/aarch64/sve/init_7.c: Likewise.
	* gcc.target/aarch64/sve/init_7_run.c: Likewise.
	* gcc.target/aarch64/sve/init_8.c: Likewise.
	* gcc.target/aarch64/sve/init_8_run.c: Likewise.
	* gcc.target/aarch64/sve/init_9.c: Likewise.
	* gcc.target/aarch64/sve/init_9_run.c: Likewise.
	* gcc.target/aarch64/sve/init_10.c: Likewise.
	* gcc.target/aarch64/sve/init_10_run.c: Likewise.
	* gcc.target/aarch64/sve/init_11.c: Likewise.
	* gcc.target/aarch64/sve/init_11_run.c: Likewise.
	* gcc.target/aarch64/sve/init_12.c: Likewise.
	* gcc.target/aarch64/sve/init_12_run.c: Likewise.

From-SVN: r271857
parent bcde3345
2019-06-03 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
PR target/88837
* vector-builder.h (vector_builder::count_dups): New method.
* config/aarch64/aarch64-protos.h (aarch64_expand_sve_vector_init):
Declare prototype.
* config/aarch64/aarch64-sve.md (aarch64_sve_rev64<mode>): Use @.
(vec_init<mode><Vel>): New pattern.
* config/aarch64/aarch64.c (emit_insr): New function.
(aarch64_sve_expand_vector_init_handle_trailing_constants): Likewise.
(aarch64_sve_expand_vector_init_insert_elems): Likewise.
(aarch64_sve_expand_vector_init_handle_trailing_same_elem): Likewise.
(aarch64_sve_expand_vector_init): Define two overloaded functions.
2019-06-03 Alejandro Martinez <alejandro.martinezvicente@arm.com>
PR tree-optimization/90681
......
......@@ -524,6 +524,7 @@ bool aarch64_maybe_expand_sve_subreg_move (rtx, rtx);
void aarch64_split_sve_subreg_move (rtx, rtx, rtx);
void aarch64_expand_prologue (void);
void aarch64_expand_vector_init (rtx, rtx);
void aarch64_sve_expand_vector_init (rtx, rtx);
void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx,
const_tree, unsigned);
void aarch64_init_expanders (void);
......
......@@ -863,7 +863,7 @@
"revb\t%0.h, %1/m, %2.h"
)
(define_insn "*aarch64_sve_rev<mode>"
(define_insn "@aarch64_sve_rev<mode>"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w")
(unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand" "w")]
UNSPEC_REV))]
......@@ -3201,3 +3201,14 @@
DONE;
}
)
;; Standard pattern name vec_init<mode><Vel>.
;; Operand 0 is the SVE destination register; operand 1 is a PARALLEL
;; holding the element initializers.  All of the work is done by
;; aarch64_sve_expand_vector_init, so no insn pattern is emitted here.
(define_expand "vec_init<mode><Vel>"
  [(match_operand:SVE_ALL 0 "register_operand" "")
   (match_operand 1 "" "")]
  "TARGET_SVE"
  {
    aarch64_sve_expand_vector_init (operands[0], operands[1]);
    DONE;
  }
)
......@@ -15277,6 +15277,263 @@ aarch64_expand_vector_init (rtx target, rtx vals)
}
}
/* Emit RTL corresponding to "insr TARGET, ELEM", using the
   vec_shl_insert optab: TARGET's lanes shift up by one and ELEM
   becomes the lowest lane.  */

static void
emit_insr (rtx target, rtx elem)
{
  machine_mode vec_mode = GET_MODE (target);
  insn_code icode = optab_handler (vec_shl_insert_optab, vec_mode);
  gcc_assert (icode != CODE_FOR_nothing);

  /* The element operand must be a register.  */
  rtx elem_reg = force_reg (GET_MODE_INNER (vec_mode), elem);
  emit_insn (GEN_FCN (icode) (target, target, elem_reg));
}
/* Subroutine of aarch64_sve_expand_vector_init for handling
   trailing constants.
   This function works as follows:
   (a) Create a new vector consisting of trailing constants.
   (b) Initialize TARGET with the constant vector using emit_move_insn.
   (c) Insert remaining elements in TARGET using insr.
   NELTS is the total number of elements in the original vector, while
   NELTS_REQD is the number of elements that are actually
   significant.
   ??? The heuristic used is to do the above only if the number of
   constants is at least half the total number of elements.  May need
   fine tuning.  */

static bool
aarch64_sve_expand_vector_init_handle_trailing_constants
  (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode elem_mode = GET_MODE_INNER (mode);

  /* Count how many elements at the end of the significant range are
     legitimate constants.  */
  int n_trailing_constants = 0;

  for (int i = nelts_reqd - 1;
       i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
       i--)
    n_trailing_constants++;

  if (n_trailing_constants >= nelts_reqd / 2)
    {
      /* Build a constant vector whose leading lanes are the trailing
	 constants of BUILDER.  NOTE(review): indices passed to elt ()
	 can exceed NELTS_REQD here; this relies on vector_builder
	 extrapolating from its encoding — confirm against
	 vector_builder::elt.  */
      rtx_vector_builder v (mode, 1, nelts);
      for (int i = 0; i < nelts; i++)
	v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
      rtx const_vec = v.build ();
      emit_move_insn (target, const_vec);

      /* Shift in the non-constant elements, last first, so that
	 element 0 of BUILDER ends up in lane 0 of TARGET.  */
      for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
	emit_insr (target, builder.elt (i));

      return true;
    }

  return false;
}
/* Subroutine of aarch64_sve_expand_vector_init.
   Works as follows:
   (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
   (b) Skip trailing elements from BUILDER, which are the same as
       element NELTS_REQD - 1.
   (c) Insert earlier elements in reverse order in TARGET using insr.  */

static void
aarch64_sve_expand_vector_init_insert_elems (rtx target,
					     const rtx_vector_builder &builder,
					     int nelts_reqd)
{
  machine_mode mode = GET_MODE (target);
  scalar_mode elem_mode = GET_MODE_INNER (mode);

  /* Broadcast the last significant element into every lane of TARGET
     via the vec_duplicate optab.  */
  struct expand_operand ops[2];
  enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
  gcc_assert (icode != CODE_FOR_nothing);
  create_output_operand (&ops[0], target, mode);
  create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
  expand_insn (icode, 2, ops);

  /* Trailing duplicates of the broadcast element are already in place;
     insert only the remaining leading elements, in reverse order so
     that element 0 of BUILDER ends up in lane 0 of TARGET.  */
  int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
  for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
    emit_insr (target, builder.elt (i));
}
/* Subroutine of aarch64_sve_expand_vector_init to handle the case
   when all trailing elements of BUILDER are the same.
   This works as follows:
   (a) Use the expand_insn interface to broadcast the last vector element
       in TARGET.
   (b) Insert the remaining elements in TARGET using insr.
   ??? The heuristic used is to do the above if the number of same
   trailing elements is at least 3/4 of the total number of elements,
   loosely based on the heuristic from mostly_zeros_p.  May need
   fine-tuning.  */

static bool
aarch64_sve_expand_vector_init_handle_trailing_same_elem
  (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
{
  /* Number of trailing elements equal to element NELTS_REQD - 1,
     counting backwards from the end (step -1).  */
  int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);

  if (ndups >= (3 * nelts_reqd) / 4)
    {
      /* Pass NELTS_REQD - NDUPS + 1 so that one representative of the
	 duplicated element remains and becomes the broadcast value.  */
      aarch64_sve_expand_vector_init_insert_elems (target, builder,
						   nelts_reqd - ndups + 1);
      return true;
    }

  return false;
}
/* Initialize register TARGET from BUILDER.  NELTS is the constant number
   of elements in BUILDER.

   The function tries to initialize TARGET from BUILDER if it fits one
   of the special cases outlined below.

   Failing that, the function divides BUILDER into two sub-vectors:
   v_even = even elements of BUILDER;
   v_odd = odd elements of BUILDER;

   and recursively calls itself with v_even and v_odd.

   if (recursive call succeeded for v_even or v_odd)
     TARGET = zip (v_even, v_odd)

   The function returns true if it managed to build TARGET from BUILDER
   with one of the special cases, false otherwise.

   Example: {a, 1, b, 2, c, 3, d, 4}

   The vector gets divided into:
   v_even = {a, b, c, d}
   v_odd = {1, 2, 3, 4}

   aarch64_sve_expand_vector_init (v_odd) hits case 1 and
   initializes tmp2 from the constant vector v_odd using emit_move_insn.

   aarch64_sve_expand_vector_init (v_even) fails since v_even contains
   4 elements, so we construct tmp1 from v_even using insr:
   tmp1 = dup(d)
   insr tmp1, c
   insr tmp1, b
   insr tmp1, a

   And finally:
   TARGET = zip (tmp1, tmp2)
   which sets TARGET to {a, 1, b, 2, c, 3, d, 4}.  */

static bool
aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
				int nelts, int nelts_reqd)
{
  machine_mode mode = GET_MODE (target);

  /* Case 1: Vector contains trailing constants.  */

  if (aarch64_sve_expand_vector_init_handle_trailing_constants
	(target, builder, nelts, nelts_reqd))
    return true;

  /* Case 2: Vector contains leading constants.  Reverse BUILDER, reuse
     the trailing-constants expander, then REV the result back.  */

  rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
  for (int i = 0; i < nelts_reqd; i++)
    rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
  rev_builder.finalize ();

  if (aarch64_sve_expand_vector_init_handle_trailing_constants
	(target, rev_builder, nelts, nelts_reqd))
    {
      emit_insn (gen_aarch64_sve_rev (mode, target, target));
      return true;
    }

  /* Case 3: Vector contains trailing same element.  */

  if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
	(target, builder, nelts_reqd))
    return true;

  /* Case 4: Vector contains leading same element.  Test
     NELTS_REQD == NELTS *before* calling the helper: the helper emits
     instructions into TARGET as a side effect, so calling it first and
     checking the condition afterwards would leave dead broadcast/INSR
     instructions in the stream whenever the condition fails.  The REV
     below is only correct when the whole vector is significant.  */

  if (nelts_reqd == nelts
      && aarch64_sve_expand_vector_init_handle_trailing_same_elem
	   (target, rev_builder, nelts_reqd))
    {
      emit_insn (gen_aarch64_sve_rev (mode, target, target));
      return true;
    }

  /* Avoid recursing below 4-elements.
     ??? The threshold 4 may need fine-tuning.  */

  if (nelts_reqd <= 4)
    return false;

  /* Case 5: De-interleave into even and odd halves and recurse.  */

  rtx_vector_builder v_even (mode, 1, nelts);
  rtx_vector_builder v_odd (mode, 1, nelts);

  for (int i = 0; i < nelts * 2; i += 2)
    {
      v_even.quick_push (builder.elt (i));
      v_odd.quick_push (builder.elt (i + 1));
    }

  v_even.finalize ();
  v_odd.finalize ();

  rtx tmp1 = gen_reg_rtx (mode);
  bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
						    nelts, nelts_reqd / 2);

  rtx tmp2 = gen_reg_rtx (mode);
  bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
						   nelts, nelts_reqd / 2);

  /* Only succeed if at least one half matched a special case;
     otherwise the caller's INSR fallback is no worse.  */
  if (!did_even_p && !did_odd_p)
    return false;

  /* Initialize whichever half didn't match a special case using INSR,
     then interleave the two halves with ZIP1.  */
  if (!did_even_p)
    aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);

  if (!did_odd_p)
    aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);

  rtvec v = gen_rtvec (2, tmp1, tmp2);
  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
  return true;
}
/* Initialize register TARGET from the elements in PARALLEL rtx VALS.
   Try the special-case expander first; if nothing matches, fall back
   to inserting every element individually with INSR.  */

void
aarch64_sve_expand_vector_init (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  int nelts = XVECLEN (vals, 0);

  rtx_vector_builder builder (mode, 1, nelts);
  for (int i = 0; i < nelts; i++)
    builder.quick_push (XVECEXP (vals, 0, i));
  builder.finalize ();

  /* Special-case handling only pays off for 4 or more elements.
     ??? This might not be optimal for vectors with large
     initializers like 16-element or above.  */
  bool done = nelts >= 4
	      && aarch64_sve_expand_vector_init (target, builder,
						 nelts, nelts);
  if (!done)
    aarch64_sve_expand_vector_init_insert_elems (target, builder, nelts);
}
static unsigned HOST_WIDE_INT
aarch64_shift_truncation_mask (machine_mode mode)
{
......
2019-06-03 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
PR target/88837
* gcc.target/aarch64/sve/init_1.c: New test.
* gcc.target/aarch64/sve/init_1_run.c: Likewise.
* gcc.target/aarch64/sve/init_2.c: Likewise.
* gcc.target/aarch64/sve/init_2_run.c: Likewise.
* gcc.target/aarch64/sve/init_3.c: Likewise.
* gcc.target/aarch64/sve/init_3_run.c: Likewise.
* gcc.target/aarch64/sve/init_4.c: Likewise.
* gcc.target/aarch64/sve/init_4_run.c: Likewise.
* gcc.target/aarch64/sve/init_5.c: Likewise.
* gcc.target/aarch64/sve/init_5_run.c: Likewise.
* gcc.target/aarch64/sve/init_6.c: Likewise.
* gcc.target/aarch64/sve/init_6_run.c: Likewise.
* gcc.target/aarch64/sve/init_7.c: Likewise.
* gcc.target/aarch64/sve/init_7_run.c: Likewise.
* gcc.target/aarch64/sve/init_8.c: Likewise.
* gcc.target/aarch64/sve/init_8_run.c: Likewise.
* gcc.target/aarch64/sve/init_9.c: Likewise.
* gcc.target/aarch64/sve/init_9_run.c: Likewise.
* gcc.target/aarch64/sve/init_10.c: Likewise.
* gcc.target/aarch64/sve/init_10_run.c: Likewise.
* gcc.target/aarch64/sve/init_11.c: Likewise.
* gcc.target/aarch64/sve/init_11_run.c: Likewise.
* gcc.target/aarch64/sve/init_12.c: Likewise.
* gcc.target/aarch64/sve/init_12_run.c: Likewise.
2019-06-03 Alejandro Martinez <alejandro.martinezvicente@arm.com>
PR tree-optimization/90681
......
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
/* Case 1.1: Trailing constants with stepped sequence. */
#include <stdint.h>
typedef int32_t vnx4si __attribute__((vector_size (32)));
/* Two variable leading lanes followed by the stepped constant sequence
   1..6 (case 1.1); expected to expand to INDEX plus two INSRs.  */
__attribute__((noipa))
vnx4si foo(int a, int b)
{
  return (vnx4si) { a, b, 1, 2, 3, 4, 5, 6 };
}
/*
foo:
.LFB0:
.cfi_startproc
ptrue p0.s, vl8
index z0.s, #1, #1
insr z0.s, w1
insr z0.s, w0
ret
*/
/* { dg-final { scan-assembler {\tindex\t(z[0-9]+\.s), #1, #1\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
/* Case 5.4: Interleaved repeating elements and non-repeating elements. */
#include <stdint.h>
typedef int32_t vnx4si __attribute__((vector_size (32)));
/* Interleaved repeating element f and non-repeating elements
   (case 5.4); exercises the even/odd split plus ZIP1 path.  */
__attribute__((noipa))
vnx4si foo(int a, int b, int c, int f)
{
  return (vnx4si) { a, f, b, f, c, f, c, f };
}
/*
foo:
.LFB0:
.cfi_startproc
mov z0.s, w2
mov z1.s, w3
insr z0.s, w1
ptrue p0.s, vl8
insr z0.s, w0
zip1 z0.s, z0.s, z1.s
ret
*/
/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tmov\t(z[0-9]+\.s), w2\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w0\n\tzip1\t\2, \2, \1} } } */
/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-O2 -msve-vector-bits=256" } */
#include "init_10.c"
/* Runtime check for init_10.c: every lane must match EXPECTED.  */
int main()
{
  int a = 10;
  int b = 11;
  int c = 12;
  int f = 13;
  vnx4si v = foo (a, b, c, f);
  int expected[] = { a, f, b, f, c, f, c, f };

  for (int i = 0; i < 8; i++)
    if (v[i] != expected[i])
      __builtin_abort ();

  return 0;
}
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
/* Case 5.5: Interleaved repeating elements and trailing same elements. */
#include <stdint.h>
typedef int32_t vnx4si __attribute__((vector_size (32)));
/* Interleaved repeating elements with trailing same elements
   (case 5.5); the odd half is a splat of f.  */
__attribute__((noipa))
vnx4si foo(int a, int b, int f)
{
  return (vnx4si) { a, f, b, f, b, f, b, f };
}
/*
foo:
.LFB0:
.cfi_startproc
mov z0.s, w1
mov z1.s, w2
insr z0.s, w0
ptrue p0.s, vl8
zip1 z0.s, z0.s, z1.s
ret
*/
/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w1\n\tmov\t(z[0-9]+\.s), w2\n\tinsr\t\1, w0\n.*\tzip1\t\1, \1, \2} } } */
/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-O2 -msve-vector-bits=256" } */
#include "init_11.c"
/* Runtime check for init_11.c: every lane must match EXPECTED.  */
int main()
{
  int a = 10;
  int b = 11;
  int f = 12;
  vnx4si v = foo (a, b, f);
  int expected[] = { a, f, b, f, b, f, b, f };

  for (int i = 0; i < 8; i++)
    if (v[i] != expected[i])
      __builtin_abort ();

  return 0;
}
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
/* Case 5.5: Interleaved repeating elements and trailing same elements. */
#include <stdint.h>
typedef int32_t vnx4si __attribute__((vector_size (32)));
/* Interleaved repeating elements with *leading* same elements in the
   even half (case 5.5 variant); the odd half is a splat of f.  */
__attribute__((noipa))
vnx4si foo(int a, int b, int f)
{
  return (vnx4si) { b, f, b, f, b, f, a, f };
}
/*
foo:
.LFB0:
.cfi_startproc
mov z0.s, w0
mov z1.s, w2
insr z0.s, w1
ptrue p0.s, vl8
insr z0.s, w1
insr z0.s, w1
zip1 z0.s, z0.s, z1.s
ret
*/
/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n\tmov\t(z[0-9]+\.s), w0\n.*\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tinsr\t\2, w1\n\tzip1\t\2, \2, \1} } } */
/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-O2 -msve-vector-bits=256" } */
#include "init_12.c"
/* Runtime check for init_12.c: every lane must match EXPECTED.  */
int main()
{
  int a = 10;
  int b = 11;
  int f = 12;
  vnx4si v = foo (a, b, f);
  int expected[] = { b, f, b, f, b, f, a, f };

  for (int i = 0; i < 8; i++)
    if (v[i] != expected[i])
      __builtin_abort ();

  return 0;
}
/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-O2 -msve-vector-bits=256" } */
#include "init_1.c"
/* Runtime check for init_1.c: every lane must match EXPECTED.  */
int main()
{
  int a = 10;
  int b = 11;
  vnx4si v = foo (a, b);
  int expected[] = { a, b, 1, 2, 3, 4, 5, 6 };

  for (int i = 0; i < 8; i++)
    if (v[i] != expected[i])
      __builtin_abort ();

  return 0;
}
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
/* Case 1.2: Trailing constants with repeating sequence. */
#include <stdint.h>
typedef int32_t vnx4si __attribute__((vector_size (32)));
/* Trailing constants forming a repeating {2, 3} sequence (case 1.2);
   expected to load the constant part from memory, then two INSRs.  */
__attribute__((noipa))
vnx4si foo(int a, int b)
{
  return (vnx4si) { a, b, 2, 3, 2, 3, 2, 3 };
}
/*
foo:
.LFB0:
.cfi_startproc
ptrue p0.s, vl8
adrp x2, .LANCHOR0
add x2, x2, :lo12:.LANCHOR0
ld1w z0.s, p0/z, [x2]
insr z0.s, w1
insr z0.s, w0
ret
*/
/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */
/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-O2 -msve-vector-bits=256" } */
#include "init_2.c"
/* Runtime check for init_2.c: every lane must match EXPECTED.  */
int main()
{
  int a = 10;
  int b = 11;
  vnx4si v = foo (a, b);
  int expected[] = { a, b, 2, 3, 2, 3, 2, 3 };

  for (int i = 0; i < 8; i++)
    if (v[i] != expected[i])
      __builtin_abort ();

  return 0;
}
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
/* Case 2.1: Leading constants with stepped sequence. */
#include <stdint.h>
typedef int32_t vnx4si __attribute__((vector_size (32)));
/* Leading stepped constants (case 2.1); expected to build the reversed
   vector with INDEX/INSR and then REV it.  */
__attribute__((noipa))
vnx4si foo(int a, int b)
{
  return (vnx4si) { 1, 2, 3, 4, 5, 6, a, b };
}
/*
foo:
.LFB0:
.cfi_startproc
ptrue p0.s, vl8
index z0.s, #6, #-1
insr z0.s, w0
insr z0.s, w1
rev z0.s, z0.s
ret
*/
/* { dg-final { scan-assembler {\tindex\t(z[0-9]+\.s), #6, #-1\n\tinsr\t\1, w0\n\tinsr\t\1, w1\n\trev\t\1, \1} } } */
/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-O2 -msve-vector-bits=256" } */
#include "init_3.c"
/* Runtime check for init_3.c: every lane must match EXPECTED.  */
int main()
{
  int a = 10;
  int b = 11;
  vnx4si v = foo (a, b);
  int expected[] = { 1, 2, 3, 4, 5, 6, a, b };

  for (int i = 0; i < 8; i++)
    if (v[i] != expected[i])
      __builtin_abort ();

  return 0;
}
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
/* Case 2.2: Leading constants with stepped sequence. */
#include <stdint.h>
typedef int32_t vnx4si __attribute__((vector_size (32)));
/* Leading repeating constants (case 2.2); expected to load the
   constant part, INSR the variables, then REV the whole vector.  */
__attribute__((noipa))
vnx4si foo(int a, int b)
{
  return (vnx4si) { 3, 2, 3, 2, 3, 2, b, a };
}
/*
foo:
.LFB0:
.cfi_startproc
ptrue p0.s, vl8
adrp x2, .LANCHOR0
add x2, x2, :lo12:.LANCHOR0
ld1w z0.s, p0/z, [x2]
insr z0.s, w1
insr z0.s, w0
rev z0.s, z0.s
ret
*/
/* { dg-final { scan-assembler {\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]\n\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */
/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-O2 -msve-vector-bits=256" } */
#include "init_4.c"
/* Runtime check for init_4.c: every lane must match EXPECTED.  */
int main()
{
  int a = 10;
  int b = 11;
  vnx4si v = foo (a, b);
  int expected[] = { 3, 2, 3, 2, 3, 2, b, a };

  for (int i = 0; i < 8; i++)
    if (v[i] != expected[i])
      __builtin_abort ();

  return 0;
}
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
/* Case 3: Trailing same element. */
#include <stdint.h>
typedef int32_t vnx4si __attribute__((vector_size (32)));
/* Trailing same element c (case 3); expected to broadcast c and then
   INSR b and a.  */
__attribute__((noipa))
vnx4si foo(int a, int b, int c)
{
  return (vnx4si) { a, b, c, c, c, c, c, c };
}
/*
foo:
.LFB0:
.cfi_startproc
mov z0.s, w2
ptrue p0.s, vl8
insr z0.s, w1
insr z0.s, w0
ret
*/
/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */
/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-O2 -msve-vector-bits=256" } */
#include "init_5.c"
/* Runtime check for init_5.c: every lane must match EXPECTED.  */
int main()
{
  int a = 10;
  int b = 11;
  int c = 12;
  vnx4si v = foo (a, b, c);
  int expected[] = { a, b, c, c, c, c, c, c };

  for (int i = 0; i < 8; i++)
    if (v[i] != expected[i])
      __builtin_abort ();

  return 0;
}
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
/* Case 3: Trailing same element. */
#include <stdint.h>
typedef int32_t vnx4si __attribute__((vector_size (32)));
/* Leading same element c (case 4); expected to build the reversed
   vector (broadcast plus INSRs) and then REV it.  */
__attribute__((noipa))
vnx4si foo(int a, int b, int c)
{
  return (vnx4si) { c, c, c, c, c, c, b, a };
}
/*
foo:
.LFB0:
.cfi_startproc
mov z0.s, w2
ptrue p0.s, vl8
insr z0.s, w1
insr z0.s, w0
rev z0.s, z0.s
ret
*/
/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w2\n.*\tinsr\t\1, w1\n\tinsr\t\1, w0\n\trev\t\1, \1} } } */
/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-O2 -msve-vector-bits=256" } */
#include "init_6.c"
/* Runtime check for init_6.c: every lane must match EXPECTED.  */
int main()
{
  int a = 10;
  int b = 11;
  int c = 12;
  vnx4si v = foo (a, b, c);
  int expected[] = { c, c, c, c, c, c, b, a };

  for (int i = 0; i < 8; i++)
    if (v[i] != expected[i])
      __builtin_abort ();

  return 0;
}
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
/* Case 5.1: All elements. */
#include <stdint.h>
typedef int32_t vnx4si __attribute__((vector_size (32)));
/* All eight lanes distinct and variable (case 5.1); the fallback path:
   broadcast the last element, then seven INSRs.  */
__attribute__((noipa))
vnx4si foo(int a, int b, int c, int d, int e, int f, int g, int h)
{
  return (vnx4si) { a, b, c, d, e, f, g, h };
}
/*
foo:
.LFB0:
.cfi_startproc
mov z0.s, w7
ptrue p0.s, vl8
insr z0.s, w6
insr z0.s, w5
insr z0.s, w4
insr z0.s, w3
insr z0.s, w2
insr z0.s, w1
insr z0.s, w0
ret
*/
/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w7\n.*\tinsr\t\1, w6\n\tinsr\t\1, w5\n\tinsr\t\1, w4\n\tinsr\t\1, w3\n\tinsr\t\1, w2\n\tinsr\t\1, w1\n\tinsr\t\1, w0} } } */
/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-O2 -msve-vector-bits=256" } */
#include "init_7.c"
/* Runtime check for init_7.c: every lane must match EXPECTED.  */
int main()
{
  int a = 10;
  int b = 11;
  int c = 12;
  int d = 13;
  int e = 14;
  int f = 15;
  int g = 16;
  int h = 17;
  vnx4si v = foo (a, b, c, d, e, f, g, h);
  int expected[] = { a, b, c, d, e, f, g, h };

  for (int i = 0; i < 8; i++)
    if (v[i] != expected[i])
      __builtin_abort ();

  return 0;
}
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
/* Case 5.2: Interleaved elements and constants. */
#include <stdint.h>
typedef int32_t vnx4si __attribute__((vector_size (32)));
/* Interleaved variables and constants (case 5.2); even half is built
   with INSRs, odd half is a constant load, joined with ZIP1.  */
__attribute__((noipa))
vnx4si foo(int a, int b, int c, int d)
{
  return (vnx4si) { a, 1, b, 2, c, 3, d, 4 };
}
/*
foo:
.LFB0:
.cfi_startproc
ptrue p0.s, vl8
mov z0.s, w3
adrp x3, .LANCHOR0
insr z0.s, w2
add x3, x3, :lo12:.LANCHOR0
insr z0.s, w1
ld1w z1.s, p0/z, [x3]
insr z0.s, w0
zip1 z0.s, z0.s, z1.s
ret
*/
/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w3\n\tadrp\t(x[0-9]+), \.LANCHOR0\n\tinsr\t\1, w2\n\tadd\t\2, \2, :lo12:\.LANCHOR0\n\tinsr\t\1, w1\n\tld1w\t(z[0-9]+\.s), p[0-9]+/z, \[\2\]\n\tinsr\t\1, w0\n\tzip1\t\1, \1, \3} } } */
/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-O2 -msve-vector-bits=256" } */
#include "init_8.c"
/* Runtime check for init_8.c: every lane must match EXPECTED.  */
int main()
{
  int a = 10;
  int b = 11;
  int c = 12;
  int d = 13;
  vnx4si v = foo (a, b, c, d);
  int expected[] = { a, 1, b, 2, c, 3, d, 4 };

  for (int i = 0; i < 8; i++)
    if (v[i] != expected[i])
      __builtin_abort ();

  return 0;
}
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
/* { dg-options "-O2 -fno-schedule-insns -msve-vector-bits=256 --save-temps" } */
/* Case 5.3: Repeated elements. */
#include <stdint.h>
typedef int32_t vnx4si __attribute__((vector_size (32)));
/* Two elements repeated alternately (case 5.3); both halves are
   splats, so the expected code is two MOVs and a ZIP1.  */
__attribute__((noipa))
vnx4si foo(int a, int b)
{
  return (vnx4si) { a, b, a, b, a, b, a, b };
}
/*
foo:
.LFB0:
.cfi_startproc
mov z0.s, w0
mov z1.s, w1
ptrue p0.s, vl8
zip1 z0.s, z0.s, z1.s
ret
*/
/* { dg-final { scan-assembler {\tmov\t(z[0-9]+\.s), w0\n\tmov\t(z[0-9]+\.s), w1\n.*\tzip1\t\1, \1, \2} } } */
/* { dg-do run { target aarch64_sve256_hw } } */
/* { dg-options "-O2 -msve-vector-bits=256" } */
#include "init_9.c"
/* Runtime check for init_9.c: every lane must match EXPECTED.  */
int main()
{
  int a = 10;
  int b = 11;
  vnx4si v = foo (a, b);
  int expected[] = { a, b, a, b, a, b, a, b };

  for (int i = 0; i < 8; i++)
    if (v[i] != expected[i])
      __builtin_abort ();

  return 0;
}
......@@ -96,6 +96,7 @@ public:
unsigned int encoded_nelts () const;
bool encoded_full_vector_p () const;
T elt (unsigned int) const;
unsigned int count_dups (int, int, int) const;
bool operator == (const Derived &) const;
bool operator != (const Derived &x) const { return !operator == (x); }
......@@ -223,6 +224,23 @@ vector_builder<T, Derived>::elt (unsigned int i) const
derived ()->step (prev, final));
}
/* Return the number of leading duplicate elements in the range
   [START:END:STEP].  The value is always at least 1.  */

template<typename T, typename Derived>
unsigned int
vector_builder<T, Derived>::count_dups (int start, int end, int step) const
{
  /* STEP may be negative (e.g. START = last element, END = -1,
     STEP = -1 to count backwards); END must be reachable from START
     in a whole number of steps.  */
  gcc_assert ((end - start) % step == 0);

  unsigned int ndups = 1;
  /* Walk from START towards END while each element compares equal to
     elt (start), using the derived class's equality predicate.  */
  for (int i = start + step;
       i != end && derived ()->equal_p (elt (i), elt (start));
       i += step)
    ndups++;
  return ndups;
}
/* Change the encoding to NPATTERNS patterns of NELTS_PER_PATTERN each,
but without changing the underlying vector. */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment