Commit 6c2833e7 by Kewen Lin

re PR tree-optimization/88497 (Improve Accumulation in Auto-Vectorized Code)

gcc/ChangeLog

2019-07-15  Kewen Lin  <linkw@gcc.gnu.org>

    PR tree-optimization/88497
    * tree-ssa-reassoc.c (reassociate_bb): Swap the positions of
    GIMPLE_BINARY_RHS check and gimple_visited_p check, call new
    function undistribute_bitref_for_vector.
    (undistribute_bitref_for_vector): New function.
    (cleanup_vinfo_map): Likewise.
    (sort_by_mach_mode): Likewise.

gcc/testsuite/ChangeLog

2019-07-15  Kewen Lin  <linkw@gcc.gnu.org>

    PR tree-optimization/88497
    * gcc.dg/tree-ssa/pr88497-1.c: New test.
    * gcc.dg/tree-ssa/pr88497-2.c: Likewise.
    * gcc.dg/tree-ssa/pr88497-3.c: Likewise.
    * gcc.dg/tree-ssa/pr88497-4.c: Likewise.
    * gcc.dg/tree-ssa/pr88497-5.c: Likewise.
    * gcc.dg/tree-ssa/pr88497-6.c: Likewise.
    * gcc.dg/tree-ssa/pr88497-7.c: Likewise.

From-SVN: r273490
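
In brief: the new undistribute_bitref_for_vector function in reassoc rewrites a chain that combines individual lanes of several same-type vectors into one vector-wide operation followed by a single set of lane extractions. A minimal sketch of the source-level effect (editorial illustration with hypothetical names; v2df matches the first test below):

typedef double v2df __attribute__ ((vector_size (16)));

/* Before: four lane reads (BIT_FIELD_REFs after gimplification).  */
double
sum_lanes (v2df a, v2df b)
{
  return a[0] + a[1] + b[0] + b[1];
}

/* After reassoc, conceptually: one vector add, two lane reads.  */
double
sum_lanes_undistributed (v2df a, v2df b)
{
  v2df t = a + b;
  return t[0] + t[1];
}

For floating-point chains this reassociation requires -ffast-math, as the tests below specify; the same undistribution also applies to MULT_EXPR, BIT_AND_EXPR, BIT_IOR_EXPR and BIT_XOR_EXPR chains.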

gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c:
/* { dg-do run } */
/* { dg-require-effective-target vect_double } */
/* { dg-require-effective-target vsx_hw { target { powerpc*-*-* } } } */
/* { dg-require-effective-target sse2_runtime { target { i?86-*-* x86_64-*-* } } } */
/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
/* { dg-additional-options "-mvsx" { target { powerpc*-*-* } } } */
/* { dg-additional-options "-msse2" { target { i?86-*-* x86_64-*-* } } } */
/* Test that reassoc can undistribute a summation of vector
   bit_field_refs.

   arg1 and arg2 are two arrays whose elements are of type vector
   double.  Assuming:
     A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3],
     B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3],
   and:
     V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3,
   reassoc transforms

     accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1]
                    + V3[0] + V3[1];

   into:

     T = V0 + V1 + V2 + V3;
     accumulator += T[0] + T[1];

   This needs fewer bit_field_refs: only two for a vector of 128 or
   more bits.  */
typedef double v2df __attribute__ ((vector_size (16)));

__attribute__ ((noinline)) double
test (double accumulator, v2df arg1[], v2df arg2[])
{
  v2df temp;
  temp = arg1[0] * arg2[0];
  accumulator += temp[0] + temp[1];
  temp = arg1[1] * arg2[1];
  accumulator += temp[0] + temp[1];
  temp = arg1[2] * arg2[2];
  accumulator += temp[0] + temp[1];
  temp = arg1[3] * arg2[3];
  accumulator += temp[0] + temp[1];
  return accumulator;
}

extern void abort (void);

int
main ()
{
  v2df v2[4] = {{1.0, 2.0}, {4.0, 8.0}, {1.0, 3.0}, {9.0, 27.0}};
  v2df v3[4] = {{1.0, 4.0}, {16.0, 64.0}, {1.0, 2.0}, {3.0, 4.0}};
  double acc = 100.0;
  double res = test (acc, v2, v3);
  if (res != 827.0)
    abort ();
  return 0;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* i?86-*-* x86_64-*-* } } } } */

gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c:
/* { dg-do compile } */
/* { dg-require-effective-target vect_float } */
/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */
/* { dg-require-effective-target sse2 { target { i?86-*-* x86_64-*-* } } } */
/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
/* { dg-additional-options "-maltivec" { target { powerpc*-*-* } } } */
/* { dg-additional-options "-msse2" { target { i?86-*-* x86_64-*-* } } } */
/* Test that reassoc can undistribute vector bit_field_refs over
   multiplication.

   v1, v2, v3 and v4 are of type vector float.  reassoc transforms

     accumulator *= v1[0] * v1[1] * v1[2] * v1[3]
                  * v2[0] * v2[1] * v2[2] * v2[3]
                  * v3[0] * v3[1] * v3[2] * v3[3]
                  * v4[0] * v4[1] * v4[2] * v4[3];

   into:

     T = v1 * v2 * v3 * v4;
     accumulator *= T[0] * T[1] * T[2] * T[3];

   This needs fewer bit_field_refs: only four for a vector of 128 or
   more bits.  */
typedef float v4sf __attribute__ ((vector_size (16)));

float
test (float accumulator, v4sf v1, v4sf v2, v4sf v3, v4sf v4)
{
  accumulator *= v1[0] * v1[1] * v1[2] * v1[3];
  accumulator *= v2[0] * v2[1] * v2[2] * v2[3];
  accumulator *= v3[0] * v3[1] * v3[2] * v3[3];
  accumulator *= v4[0] * v4[1] * v4[2] * v4[3];
  return accumulator;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* i?86-*-* x86_64-*-* } } } } */

gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c:
/* { dg-do compile } */
/* { dg-require-effective-target vect_int } */
/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */
/* { dg-require-effective-target sse2 { target { i?86-*-* x86_64-*-* } } } */
/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
/* { dg-additional-options "-maltivec" { target { powerpc*-*-* } } } */
/* { dg-additional-options "-msse2" { target { i?86-*-* x86_64-*-* } } } */
/* Test that reassoc can undistribute vector bit_field_refs over
   bitwise AND.

   v1, v2, v3 and v4 are of type vector int.  reassoc transforms

     accumulator &= v1[0] & v1[1] & v1[2] & v1[3]
                  & v2[0] & v2[1] & v2[2] & v2[3]
                  & v3[0] & v3[1] & v3[2] & v3[3]
                  & v4[0] & v4[1] & v4[2] & v4[3];

   into:

     T = v1 & v2 & v3 & v4;
     accumulator &= T[0] & T[1] & T[2] & T[3];

   This needs fewer bit_field_refs: only four for a vector of 128 or
   more bits.  */
typedef int v4si __attribute__ ((vector_size (16)));

int
test (int accumulator, v4si v1, v4si v2, v4si v3, v4si v4)
{
  accumulator &= v1[0] & v1[1] & v1[2] & v1[3];
  accumulator &= v2[0] & v2[1] & v2[2] & v2[3];
  accumulator &= v3[0] & v3[1] & v3[2] & v3[3];
  accumulator &= v4[0] & v4[1] & v4[2] & v4[3];
  return accumulator;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* i?86-*-* x86_64-*-* } } } } */

gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c:
/* { dg-do compile } */
/* { dg-require-effective-target vect_int } */
/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */
/* { dg-require-effective-target sse2 { target { i?86-*-* x86_64-*-* } } } */
/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
/* { dg-additional-options "-maltivec" { target { powerpc*-*-* } } } */
/* { dg-additional-options "-msse2" { target { i?86-*-* x86_64-*-* } } } */
/* Test that reassoc can undistribute vector bit_field_refs over
   bitwise IOR.

   v1, v2, v3 and v4 are of type vector int.  reassoc transforms

     accumulator |= v1[0] | v1[1] | v1[2] | v1[3]
                  | v2[0] | v2[1] | v2[2] | v2[3]
                  | v3[0] | v3[1] | v3[2] | v3[3]
                  | v4[0] | v4[1] | v4[2] | v4[3];

   into:

     T = v1 | v2 | v3 | v4;
     accumulator |= T[0] | T[1] | T[2] | T[3];

   This needs fewer bit_field_refs: only four for a vector of 128 or
   more bits.  */
typedef int v4si __attribute__ ((vector_size (16)));

int
test (int accumulator, v4si v1, v4si v2, v4si v3, v4si v4)
{
  accumulator |= v1[0] | v1[1] | v1[2] | v1[3];
  accumulator |= v2[0] | v2[1] | v2[2] | v2[3];
  accumulator |= v3[0] | v3[1] | v3[2] | v3[3];
  accumulator |= v4[0] | v4[1] | v4[2] | v4[3];
  return accumulator;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* i?86-*-* x86_64-*-* } } } } */

gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c:
/* { dg-do compile } */
/* { dg-require-effective-target vect_int } */
/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */
/* { dg-require-effective-target sse2 { target { i?86-*-* x86_64-*-* } } } */
/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
/* { dg-additional-options "-maltivec" { target { powerpc*-*-* } } } */
/* { dg-additional-options "-msse2" { target { i?86-*-* x86_64-*-* } } } */
/* Test that reassoc can undistribute vector bit_field_refs over
   bitwise XOR.

   v1, v2, v3 and v4 are of type vector int.  reassoc transforms

     accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3]
                  ^ v2[0] ^ v2[1] ^ v2[2] ^ v2[3]
                  ^ v3[0] ^ v3[1] ^ v3[2] ^ v3[3]
                  ^ v4[0] ^ v4[1] ^ v4[2] ^ v4[3];

   into:

     T = v1 ^ v2 ^ v3 ^ v4;
     accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3];

   This needs fewer bit_field_refs: only four for a vector of 128 or
   more bits.  */
typedef int v4si __attribute__ ((vector_size (16)));

int
test (int accumulator, v4si v1, v4si v2, v4si v3, v4si v4)
{
  accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3];
  accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3];
  accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3];
  accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3];
  return accumulator;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* i?86-*-* x86_64-*-* } } } } */

gcc/testsuite/gcc.dg/tree-ssa/pr88497-6.c:
/* { dg-do compile } */
/* { dg-require-effective-target avx512f } */
/* { dg-options "-O2 -mavx512f -ffast-math -fdump-tree-reassoc1" } */
/* Test that reassoc can undistribute vector bit_field_refs across
   multiple vector machine modes.

   v1, v2 are of type vector of 4 floats;
   v3, v4 are of type vector of 8 floats;
   v5, v6 are of type vector of 16 floats.

   reassoc transforms

     accumulator += v1[0] + v1[1] + v1[2] + v1[3]
                  + v2[0] + v2[1] + v2[2] + v2[3]
                  + v3[0] + v3[1] + v3[2] + v3[3]
                  + v3[4] + v3[5] + v3[6] + v3[7]
                  + v4[0] + v4[1] + v4[2] + v4[3]
                  + v4[4] + v4[5] + v4[6] + v4[7]
                  + v5[0] + v5[1] + v5[2] + v5[3]
                  + v5[4] + v5[5] + v5[6] + v5[7]
                  + v5[8] + v5[9] + v5[10] + v5[11]
                  + v5[12] + v5[13] + v5[14] + v5[15]
                  + v6[0] + v6[1] + v6[2] + v6[3]
                  + v6[4] + v6[5] + v6[6] + v6[7]
                  + v6[8] + v6[9] + v6[10] + v6[11]
                  + v6[12] + v6[13] + v6[14] + v6[15];

   into:

     T12 = v1 + v2;
     T34 = v3 + v4;
     T56 = v5 + v6;
     accumulator += T12[0] + T12[1] + T12[2] + T12[3];
     accumulator += T34[0] + T34[1] + T34[2] + T34[3];
     accumulator += T34[4] + T34[5] + T34[6] + T34[7];
     accumulator += T56[0] + T56[1] + T56[2] + T56[3];
     accumulator += T56[4] + T56[5] + T56[6] + T56[7];
     accumulator += T56[8] + T56[9] + T56[10] + T56[11];
     accumulator += T56[12] + T56[13] + T56[14] + T56[15];  */
typedef float v4sf __attribute__ ((vector_size (16)));
typedef float v8sf __attribute__ ((vector_size (32)));
typedef float v16sf __attribute__ ((vector_size (64)));

float
test (float accumulator, v4sf v1, v4sf v2, v8sf v3, v8sf v4, v16sf v5,
      v16sf v6)
{
  accumulator += v1[0] + v1[1] + v1[2] + v1[3];
  accumulator += v2[0] + v2[1] + v2[2] + v2[3];
  accumulator += v3[0] + v3[1] + v3[2] + v3[3];
  accumulator += v3[4] + v3[5] + v3[6] + v3[7];
  accumulator += v4[0] + v4[1] + v4[2] + v4[3];
  accumulator += v4[4] + v4[5] + v4[6] + v4[7];
  accumulator += v5[0] + v5[1] + v5[2] + v5[3];
  accumulator += v5[4] + v5[5] + v5[6] + v5[7];
  accumulator += v5[8] + v5[9] + v5[10] + v5[11];
  accumulator += v5[12] + v5[13] + v5[14] + v5[15];
  accumulator += v6[0] + v6[1] + v6[2] + v6[3];
  accumulator += v6[4] + v6[5] + v6[6] + v6[7];
  accumulator += v6[8] + v6[9] + v6[10] + v6[11];
  accumulator += v6[12] + v6[13] + v6[14] + v6[15];
  return accumulator;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 28 "reassoc1" } } */

gcc/testsuite/gcc.dg/tree-ssa/pr88497-7.c:
/* { dg-do run } */
/* { dg-require-effective-target avx512f_runtime } */
/* { dg-options "-O2 -mavx512f -ffast-math -fdump-tree-reassoc1" } */
/* Test that reassoc can undistribute vector bit_field_refs across
   multiple vector machine modes, bypassing any mode with only one
   candidate.

   v1, v2 are of type vector of 4 floats;
   v3 is of type vector of 8 floats;
   v5, v6 are of type vector of 16 floats.

   reassoc transforms

     accumulator += v1[0] + v1[1] + v1[2] + v1[3]
                  + v2[0] + v2[1] + v2[2] + v2[3]
                  + v3[0] + v3[1] + v3[2] + v3[3]
                  + v3[4] + v3[5] + v3[6] + v3[7]
                  + v5[0] + v5[1] + v5[2] + v5[3]
                  + v5[4] + v5[5] + v5[6] + v5[7]
                  + v5[8] + v5[9] + v5[10] + v5[11]
                  + v5[12] + v5[13] + v5[14] + v5[15]
                  + v6[0] + v6[1] + v6[2] + v6[3]
                  + v6[4] + v6[5] + v6[6] + v6[7]
                  + v6[8] + v6[9] + v6[10] + v6[11]
                  + v6[12] + v6[13] + v6[14] + v6[15];

   into:

     T12 = v1 + v2;
     T56 = v5 + v6;
     accumulator += T12[0] + T12[1] + T12[2] + T12[3];
     accumulator += v3[0] + v3[1] + v3[2] + v3[3];
     accumulator += v3[4] + v3[5] + v3[6] + v3[7];
     accumulator += T56[0] + T56[1] + T56[2] + T56[3];
     accumulator += T56[4] + T56[5] + T56[6] + T56[7];
     accumulator += T56[8] + T56[9] + T56[10] + T56[11];
     accumulator += T56[12] + T56[13] + T56[14] + T56[15];  */
typedef float v4sf __attribute__ ((vector_size (16)));
typedef float v8sf __attribute__ ((vector_size (32)));
typedef float v16sf __attribute__ ((vector_size (64)));

__attribute__ ((noinline)) float
test (float accumulator, v4sf v1, v4sf v2, v8sf v3, v16sf v5, v16sf v6)
{
  accumulator += v1[0] + v1[1] + v1[2] + v1[3];
  accumulator += v2[0] + v2[1] + v2[2] + v2[3];
  accumulator += v3[0] + v3[1] + v3[2] + v3[3];
  accumulator += v3[4] + v3[5] + v3[6] + v3[7];
  accumulator += v5[0] + v5[1] + v5[2] + v5[3];
  accumulator += v5[4] + v5[5] + v5[6] + v5[7];
  accumulator += v5[8] + v5[9] + v5[10] + v5[11];
  accumulator += v5[12] + v5[13] + v5[14] + v5[15];
  accumulator += v6[0] + v6[1] + v6[2] + v6[3];
  accumulator += v6[4] + v6[5] + v6[6] + v6[7];
  accumulator += v6[8] + v6[9] + v6[10] + v6[11];
  accumulator += v6[12] + v6[13] + v6[14] + v6[15];
  return accumulator;
}

extern void abort (void);

int
main ()
{
  v4sf v1 = {1.0, 2.0, 3.0, 4.0};
  v4sf v2 = {5.0, 6.0, 7.0, 8.0};
  v8sf v3 = {9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0};
  v16sf v5 = {17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0,
              25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0};
  v16sf v6 = {33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0,
              41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0};
  float acc = 24.0;
  double res = test (acc, v1, v2, v3, v5, v6);
  if (res != 1200.0)
    abort ();
  return 0;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 28 "reassoc1" } } */
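
To observe the transform directly (an editorial suggestion, not part of the commit): compile one of the tests with the detailed reassoc dump enabled and look for the statements the new code prints under TDF_DETAILS, e.g. on x86_64:

  gcc -O2 -ffast-math -c pr88497-1.c -fdump-tree-reassoc1-details
  grep -E "Generating (addition|bit_field_ref)" pr88497-1.c.*.reassoc1

The dump should then contain only two BIT_FIELD_REFs for the first test, which is exactly what its dg-final scan verifies.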

gcc/tree-ssa-reassoc.c:
@@ -1772,6 +1772,274 @@ undistribute_ops_list (enum tree_code opcode,
  return changed;
}

/* Pair to hold the information of one specific VECTOR_TYPE SSA_NAME:
   first: element index for each relevant BIT_FIELD_REF.
   second: the index of vec ops* for each relevant BIT_FIELD_REF.  */
typedef std::pair<unsigned, unsigned> v_info_elem;
typedef auto_vec<v_info_elem, 32> v_info;
typedef v_info *v_info_ptr;
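
/* Illustration (editorial, not part of the patch): for an addition
   chain such as
     s = x + V[1] + y + V[0];
   where the V[1] summand sits at position 1 of the ops vector and the
   V[0] summand at position 3, the v_info entry keyed by V holds the
   pairs { (1, 1), (0, 3) }, i.e. (element index, index into *ops).  */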

/* Comparison function for qsort on VECTOR SSA_NAME trees by machine mode.  */
static int
sort_by_mach_mode (const void *p_i, const void *p_j)
{
  const tree tr1 = *((const tree *) p_i);
  const tree tr2 = *((const tree *) p_j);
  unsigned int mode1 = TYPE_MODE (TREE_TYPE (tr1));
  unsigned int mode2 = TYPE_MODE (TREE_TYPE (tr2));
  if (mode1 > mode2)
    return 1;
  else if (mode1 < mode2)
    return -1;
  else
    return 0;
}
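
/* Design note (editorial): the comparator above only has to group
   candidates of the same machine mode next to each other; the
   relative order of the modes themselves and of entries within one
   mode does not matter, so an unstable qsort is sufficient.  */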

/* Cleanup hash map for VECTOR information.  */
static void
cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map)
{
  for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin ();
       it != info_map.end (); ++it)
    {
      v_info_ptr info = (*it).second;
      delete info;
      (*it).second = NULL;
    }
}

/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE.

     V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k]

   is transformed to

     Vs = (V1 + V2 + ... + Vn)
     Vs[0] + Vs[1] + ... + Vs[k]

   The basic steps are:

   1) Scan the addition chain *OPS for summands that are defined by a
      BIT_FIELD_REF on a VECTOR-typed SSA_NAME.  Record each satisfying
      summand in V_INFO_MAP, keyed by the VECTOR SSA_NAME.

   2) For each key (VECTOR SSA_NAME), validate that its BIT_FIELD_REFs
      are contiguous and cover the whole VECTOR without holes.  Collect
      the VECTORs that pass into a candidate list.

   3) Sort the candidate list by the machine mode of the VECTOR type.
      For each group of candidates with the same mode, build the
      addition statements for them and generate BIT_FIELD_REFs
      accordingly.

   TODO:
     The current implementation requires that each VECTOR is fully
     covered.  It could be extended to support partial coverage of
     adjacent elements; that would likely need some cost model to
     decide where the transformation is profitable.  */
static bool
undistribute_bitref_for_vector (enum tree_code opcode,
                                vec<operand_entry *> *ops, struct loop *loop)
{
  if (ops->length () <= 1)
    return false;

  if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR
      && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR)
    return false;

  hash_map<tree, v_info_ptr> v_info_map;
  operand_entry *oe1;
  unsigned i;

  /* Find the summands that come from a VECTOR BIT_FIELD_REF in the
     addition chain and record their information in the map.  */
  FOR_EACH_VEC_ELT (*ops, i, oe1)
    {
      enum tree_code dcode;
      gimple *oe1def;

      if (TREE_CODE (oe1->op) != SSA_NAME)
        continue;
      oe1def = SSA_NAME_DEF_STMT (oe1->op);
      if (!is_gimple_assign (oe1def))
        continue;
      dcode = gimple_assign_rhs_code (oe1def);
      if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop))
        continue;

      tree rhs = gimple_assign_rhs1 (oe1def);
      tree vec = TREE_OPERAND (rhs, 0);
      tree vec_type = TREE_TYPE (vec);

      if (TREE_CODE (vec) != SSA_NAME || !VECTOR_TYPE_P (vec_type))
        continue;

      /* Ignore it if the target machine can't support this VECTOR type.  */
      if (!VECTOR_MODE_P (TYPE_MODE (vec_type)))
        continue;

      /* Check for a constant number of vector elements; constrain the
         BIT_FIELD_REF offset and size.  */
      if (!TYPE_VECTOR_SUBPARTS (vec_type).is_constant ())
        continue;

      tree elem_type = TREE_TYPE (vec_type);
      unsigned HOST_WIDE_INT elem_size
        = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
      if (maybe_ne (bit_field_size (rhs), elem_size))
        continue;

      unsigned idx;
      if (!constant_multiple_p (bit_field_offset (rhs), elem_size, &idx))
        continue;

      /* Ignore it if the target machine can't support this type of VECTOR
         operation.  */
      optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector);
      if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing)
        continue;

      bool existed;
      v_info_ptr &info = v_info_map.get_or_insert (vec, &existed);
      if (!existed)
        info = new v_info;
      info->safe_push (std::make_pair (idx, i));
    }
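
  /* State after the scan (editorial note): v_info_map now maps each
     candidate VECTOR SSA_NAME to the list of its referenced lanes,
     together with the position of each lane's summand in *OPS.  */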

  /* We need at least two VECTORs to combine.  */
  if (v_info_map.elements () <= 1)
    {
      cleanup_vinfo_map (v_info_map);
      return false;
    }

  /* Verify all VECTOR candidates by checking two conditions:
       1) the sorted offsets are adjacent, with no holes;
       2) they fill the whole VECTOR perfectly.
     Add the valid candidates to a vector for further handling.  */
  auto_vec<tree> valid_vecs (v_info_map.elements ());
  for (hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin ();
       it != v_info_map.end (); ++it)
    {
      tree cand_vec = (*it).first;
      v_info_ptr cand_info = (*it).second;
      unsigned int num_elems = VECTOR_CST_NELTS (cand_vec).to_constant ();
      if (cand_info->length () != num_elems)
        continue;
      sbitmap holes = sbitmap_alloc (num_elems);
      bitmap_ones (holes);
      bool valid = true;
      v_info_elem *curr;
      FOR_EACH_VEC_ELT (*cand_info, i, curr)
        {
          if (!bitmap_bit_p (holes, curr->first))
            {
              valid = false;
              break;
            }
          else
            bitmap_clear_bit (holes, curr->first);
        }
      if (valid && bitmap_empty_p (holes))
        valid_vecs.quick_push (cand_vec);
      sbitmap_free (holes);
    }
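
  /* Illustration (editorial, not part of the patch): a V4SF candidate
     reaches the bitmap check only with exactly four recorded entries,
     whose element indices must form a permutation of {0, 1, 2, 3}.
     For example {1, 0, 3, 2} clears every bit of HOLES and passes,
     while {0, 0, 1, 2} trips the bitmap_bit_p test on the duplicated
     0 (and would also leave bit 3 set for bitmap_empty_p to catch).  */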

  /* We need at least two valid VECTORs to combine.  */
  if (valid_vecs.length () <= 1)
    {
      cleanup_vinfo_map (v_info_map);
      return false;
    }

  valid_vecs.qsort (sort_by_mach_mode);
  /* Go through the candidates in machine-mode order; skip any mode
     that ends up with only a single candidate.  */
  for (unsigned i = 0; i < valid_vecs.length () - 1; ++i)
    {
      tree tvec = valid_vecs[i];
      enum machine_mode mode = TYPE_MODE (TREE_TYPE (tvec));

      /* Skip modes with only a single candidate.  */
      if (TYPE_MODE (TREE_TYPE (valid_vecs[i + 1])) != mode)
        continue;

      unsigned int idx, j;
      gimple *sum = NULL;
      v_info_ptr info_ptr;
      tree sum_vec = tvec;
      v_info_elem *elem;

      /* Build the sum for all candidates with the same mode.  */
      do
        {
          sum = build_and_add_sum (TREE_TYPE (sum_vec), sum_vec,
                                   valid_vecs[i + 1], opcode);
          sum_vec = gimple_get_lhs (sum);
          info_ptr = *(v_info_map.get (valid_vecs[i + 1]));
          /* Update the related ops of the current candidate VECTOR.  */
          FOR_EACH_VEC_ELT (*info_ptr, j, elem)
            {
              idx = elem->second;
              gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
              /* Mark the statement visited so the op's definition
                 gets DCEd later.  */
              gimple_set_visited (def, true);
              if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR
                  || opcode == BIT_IOR_EXPR)
                (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op));
              else if (opcode == MULT_EXPR)
                (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op));
              else
                {
                  gcc_assert (opcode == BIT_AND_EXPR);
                  (*ops)[idx]->op
                    = build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op));
                }
              (*ops)[idx]->rank = 0;
            }
          if (dump_file && (dump_flags & TDF_DETAILS))
            {
              fprintf (dump_file, "Generating addition -> ");
              print_gimple_stmt (dump_file, sum, 0);
            }
          i++;
        }
      while ((i < valid_vecs.length () - 1)
             && TYPE_MODE (TREE_TYPE (valid_vecs[i + 1])) == mode);

      /* Referring to the first valid VECTOR with this mode, generate the
         corresponding BIT_FIELD_REF statements.  */
      info_ptr = *(v_info_map.get (tvec));
      gcc_assert (sum);
      tree elem_type = TREE_TYPE (TREE_TYPE (tvec));
      FOR_EACH_VEC_ELT (*info_ptr, j, elem)
        {
          idx = elem->second;
          tree dst = make_ssa_name (elem_type);
          gimple *gs = gimple_build_assign (
            dst, BIT_FIELD_REF,
            build3 (BIT_FIELD_REF, elem_type, sum_vec, TYPE_SIZE (elem_type),
                    bitsize_int (elem->first
                                 * tree_to_uhwi (TYPE_SIZE (elem_type)))));
          insert_stmt_after (gs, sum);
          gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
          /* Mark the statement visited so the op's definition gets
             DCEd later.  */
          gimple_set_visited (def, true);
          (*ops)[idx]->op = gimple_assign_lhs (gs);
          (*ops)[idx]->rank = get_rank ((*ops)[idx]->op);
          if (dump_file && (dump_flags & TDF_DETAILS))
            {
              fprintf (dump_file, "Generating bit_field_ref -> ");
              print_gimple_stmt (dump_file, gs, 0);
            }
        }
    }

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "undistributing bit_field_ref for vector done.\n");

  cleanup_vinfo_map (v_info_map);

  return true;
}
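
/* Illustration (editorial, not part of the patch): for four v4sf
   candidates combined under PLUS_EXPR, the mode loop above effectively
   builds
     t1 = v1 + v2;  t2 = t1 + v3;  t3 = t2 + v4;
   then emits t3[0], t3[1], t3[2] and t3[3] to replace the sixteen
   original lane reads, while the displaced operands are neutralized
   with the identity element of the operation (0 for PLUS/XOR/IOR,
   1 for MULT, all-ones for AND).  */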

/* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison
   expression, examine the other OPS to see if any of them are comparisons
   of the same values, which we may be able to combine or eliminate.
@@ -5879,11 +6147,6 @@ reassociate_bb (basic_block bb)
          tree lhs, rhs1, rhs2;
          enum tree_code rhs_code = gimple_assign_rhs_code (stmt);

          /* If this is not a gimple binary expression, there is
             nothing for us to do with it.  */
          if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
            continue;

          /* If this was part of an already processed statement,
             we don't need to touch it again.  */
          if (gimple_visited_p (stmt))
@@ -5910,6 +6173,11 @@ reassociate_bb (basic_block bb)
              continue;
            }

          /* If this is not a gimple binary expression, there is
             nothing for us to do with it.  */
          if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
            continue;

          lhs = gimple_assign_lhs (stmt);
          rhs1 = gimple_assign_rhs1 (stmt);
          rhs2 = gimple_assign_rhs2 (stmt);
@@ -5948,7 +6216,12 @@ reassociate_bb (basic_block bb)
                  ops.qsort (sort_by_operand_rank);
                  optimize_ops_list (rhs_code, &ops);
                }

              if (undistribute_bitref_for_vector (rhs_code, &ops,
                                                  loop_containing_stmt (stmt)))
                {
                  ops.qsort (sort_by_operand_rank);
                  optimize_ops_list (rhs_code, &ops);
                }

              if (rhs_code == PLUS_EXPR
                  && transform_add_to_multiply (&ops))
                ops.qsort (sort_by_operand_rank);
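
As with the undistribute_ops_list call a few lines above it, a successful undistribution changes the operand list, so the chain is re-sorted by rank and optimize_ops_list is run again before the remaining transforms.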