Commit 6c2833e7 by Kewen Lin

re PR tree-optimization/88497 (Improve Accumulation in Auto-Vectorized Code)

gcc/ChangeLog

2019-07-15  Kewen Lin  <linkw@gcc.gnu.org>

    PR tree-optimization/88497
    * tree-ssa-reassoc.c (reassociate_bb): Swap the positions of
    GIMPLE_BINARY_RHS check and gimple_visited_p check, call new
    function undistribute_bitref_for_vector.
    (undistribute_bitref_for_vector): New function.
    (cleanup_vinfo_map): Likewise.
    (sort_by_mach_mode): Likewise.

gcc/testsuite/ChangeLog

2019-07-15  Kewen Lin  <linkw@gcc.gnu.org>

    PR tree-optimization/88497
    * gcc.dg/tree-ssa/pr88497-1.c: New test.
    * gcc.dg/tree-ssa/pr88497-2.c: Likewise.
    * gcc.dg/tree-ssa/pr88497-3.c: Likewise.
    * gcc.dg/tree-ssa/pr88497-4.c: Likewise.
    * gcc.dg/tree-ssa/pr88497-5.c: Likewise.
    * gcc.dg/tree-ssa/pr88497-6.c: Likewise.
    * gcc.dg/tree-ssa/pr88497-7.c: Likewise.

From-SVN: r273490
parent 3126c241
2019-07-15 Kewen Lin <linkw@gcc.gnu.org>
PR tree-optimization/88497
* tree-ssa-reassoc.c (reassociate_bb): Swap the positions of
GIMPLE_BINARY_RHS check and gimple_visited_p check, call new
function undistribute_bitref_for_vector.
(undistribute_bitref_for_vector): New function.
(cleanup_vinfo_map): Likewise.
(sort_by_mach_mode): Likewise.
2019-07-14 Uroš Bizjak <ubizjak@gmail.com>
* config/i386/i386.md (nonmemory_szext_operand): New mode attribute.
......
2019-07-15 Kewen Lin <linkw@gcc.gnu.org>
PR tree-optimization/88497
* gcc.dg/tree-ssa/pr88497-1.c: New test.
* gcc.dg/tree-ssa/pr88497-2.c: Likewise.
* gcc.dg/tree-ssa/pr88497-3.c: Likewise.
* gcc.dg/tree-ssa/pr88497-4.c: Likewise.
* gcc.dg/tree-ssa/pr88497-5.c: Likewise.
* gcc.dg/tree-ssa/pr88497-6.c: Likewise.
* gcc.dg/tree-ssa/pr88497-7.c: Likewise.
2019-07-14 Jerry DeLisle <jvdelisle@gcc.gnu.org>
PR fortran/87233
......
/* { dg-do run } */
/* { dg-require-effective-target vect_double } */
/* { dg-require-effective-target vsx_hw { target { powerpc*-*-* } } } */
/* { dg-require-effective-target sse2_runtime { target { i?86-*-* x86_64-*-* } } } */
/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
/* { dg-additional-options "-mvsx" { target { powerpc*-*-* } } } */
/* { dg-additional-options "-msse2" { target { i?86-*-* x86_64-*-* } } } */
/* To test reassoc can undistribute vector bit_field_ref summation.
arg1 and arg2 are two arrays whose elements of type vector double.
Assuming:
A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3],
B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3],
Then:
V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3,
reassoc transforms
accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1]
+ V3[0] + V3[1];
into:
T = V0 + V1 + V2 + V3
accumulator += T[0] + T[1];
Fewer bit_field_refs, only two for 128 or more bits vector. */
/* Vector of two doubles (16 bytes).  */
typedef double v2df __attribute__ ((vector_size (16)));
/* Add to ACCUMULATOR both lanes of each element-wise product
   arg1[i] * arg2[i], for i = 0..3, and return the result.
   Declared noinline; presumably so the body is optimized on its own
   rather than folded with the constant inputs in main -- confirm.  */
__attribute__ ((noinline)) double
test (double accumulator, v2df arg1[], v2df arg2[])
{
v2df temp;
/* Each product is reduced lane by lane.  These per-lane reads are the
   vector element accesses that the header comment expects reassoc to
   merge, so the statement shape must stay exactly as written.  */
temp = arg1[0] * arg2[0];
accumulator += temp[0] + temp[1];
temp = arg1[1] * arg2[1];
accumulator += temp[0] + temp[1];
temp = arg1[2] * arg2[2];
accumulator += temp[0] + temp[1];
temp = arg1[3] * arg2[3];
accumulator += temp[0] + temp[1];
return accumulator;
}
extern void abort (void);

/* Runtime driver: feed fixed inputs to test () and abort unless the
   result is exactly the expected total.  The lane products sum to
   9 + 576 + 7 + 135 = 727, and the starting accumulator is 100.  */
int
main ()
{
  v2df lhs[4] = {{1.0, 2.0}, {4.0, 8.0}, {1.0, 3.0}, {9.0, 27.0}};
  v2df rhs[4] = {{1.0, 4.0}, {16.0, 64.0}, {1.0, 2.0}, {3.0, 4.0}};
  const double expected = 827.0;
  double start = 100.0;
  double got = test (start, lhs, rhs);

  if (!(got == expected))
    abort ();
  return 0;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* i?86-*-* x86_64-*-* } } } } */
/* { dg-do compile } */
/* { dg-require-effective-target vect_float } */
/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */
/* { dg-require-effective-target sse2 { target { i?86-*-* x86_64-*-* } } } */
/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
/* { dg-additional-options "-maltivec" { target { powerpc*-*-* } } } */
/* { dg-additional-options "-msse2" { target { i?86-*-* x86_64-*-* } } } */
/* To test reassoc can undistribute vector bit_field_ref on multiplication.
v1, v2, v3, v4 of type vector float.
reassoc transforms
accumulator *= v1[0] * v1[1] * v1[2] * v1[3] *
v2[0] * v2[1] * v2[2] * v2[3] *
v3[0] * v3[1] * v3[2] * v3[3] *
v4[0] * v4[1] * v4[2] * v4[3] ;
into:
T = v1 * v2 * v3 * v4;
accumulator *= T[0] * T[1] * T[2] * T[3];
Fewer bit_field_refs, only four for 128 or more bits vector. */
/* Vector of four floats (16 bytes).  */
typedef float v4sf __attribute__ ((vector_size (16)));
/* Multiply ACCUMULATOR by every lane of v1, v2, v3 and v4 and return
   the product.  The four statements are deliberately written as
   per-lane element reads: they are the pattern the header comment
   expects reassoc to rewrite, so their shape must not change.  */
float
test (float accumulator, v4sf v1, v4sf v2, v4sf v3, v4sf v4)
{
accumulator *= v1[0] * v1[1] * v1[2] * v1[3];
accumulator *= v2[0] * v2[1] * v2[2] * v2[3];
accumulator *= v3[0] * v3[1] * v3[2] * v3[3];
accumulator *= v4[0] * v4[1] * v4[2] * v4[3];
return accumulator;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* i?86-*-* x86_64-*-* } } } } */
/* { dg-do compile } */
/* { dg-require-effective-target vect_int } */
/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */
/* { dg-require-effective-target sse2 { target { i?86-*-* x86_64-*-* } } } */
/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
/* { dg-additional-options "-maltivec" { target { powerpc*-*-* } } } */
/* { dg-additional-options "-msse2" { target { i?86-*-* x86_64-*-* } } } */
/* To test reassoc can undistribute vector bit_field_ref on bitwise AND.
v1, v2, v3, v4 of type vector int.
reassoc transforms
accumulator &= v1[0] & v1[1] & v1[2] & v1[3] &
v2[0] & v2[1] & v2[2] & v2[3] &
v3[0] & v3[1] & v3[2] & v3[3] &
v4[0] & v4[1] & v4[2] & v4[3] ;
into:
T = v1 & v2 & v3 & v4;
accumulator &= T[0] & T[1] & T[2] & T[3];
Fewer bit_field_refs, only four for 128 or more bits vector. */
/* Vector of four ints (16 bytes).  */
typedef int v4si __attribute__ ((vector_size (16)));
/* Bitwise-AND ACCUMULATOR with every lane of v1, v2, v3 and v4 and
   return the result.  The per-lane element reads are the pattern the
   header comment expects reassoc to rewrite, so their shape must not
   change.  */
int
test (int accumulator, v4si v1, v4si v2, v4si v3, v4si v4)
{
accumulator &= v1[0] & v1[1] & v1[2] & v1[3];
accumulator &= v2[0] & v2[1] & v2[2] & v2[3];
accumulator &= v3[0] & v3[1] & v3[2] & v3[3];
accumulator &= v4[0] & v4[1] & v4[2] & v4[3];
return accumulator;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* i?86-*-* x86_64-*-* } } } } */
/* { dg-do compile } */
/* { dg-require-effective-target vect_int } */
/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */
/* { dg-require-effective-target sse2 { target { i?86-*-* x86_64-*-* } } } */
/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
/* { dg-additional-options "-maltivec" { target { powerpc*-*-* } } } */
/* { dg-additional-options "-msse2" { target { i?86-*-* x86_64-*-* } } } */
/* To test reassoc can undistribute vector bit_field_ref on bitwise IOR.
v1, v2, v3, v4 of type vector int.
reassoc transforms
accumulator |= v1[0] | v1[1] | v1[2] | v1[3] |
v2[0] | v2[1] | v2[2] | v2[3] |
v3[0] | v3[1] | v3[2] | v3[3] |
v4[0] | v4[1] | v4[2] | v4[3] ;
into:
T = v1 | v2 | v3 | v4;
accumulator |= T[0] | T[1] | T[2] | T[3];
Fewer bit_field_refs, only four for 128 or more bits vector. */
/* Vector of four ints (16 bytes).  */
typedef int v4si __attribute__ ((vector_size (16)));
/* Bitwise-OR ACCUMULATOR with every lane of v1, v2, v3 and v4 and
   return the result.  The per-lane element reads are the pattern the
   header comment expects reassoc to rewrite, so their shape must not
   change.  */
int
test (int accumulator, v4si v1, v4si v2, v4si v3, v4si v4)
{
accumulator |= v1[0] | v1[1] | v1[2] | v1[3];
accumulator |= v2[0] | v2[1] | v2[2] | v2[3];
accumulator |= v3[0] | v3[1] | v3[2] | v3[3];
accumulator |= v4[0] | v4[1] | v4[2] | v4[3];
return accumulator;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* i?86-*-* x86_64-*-* } } } } */
/* { dg-do compile } */
/* { dg-require-effective-target vect_int } */
/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */
/* { dg-require-effective-target sse2 { target { i?86-*-* x86_64-*-* } } } */
/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
/* { dg-additional-options "-maltivec" { target { powerpc*-*-* } } } */
/* { dg-additional-options "-msse2" { target { i?86-*-* x86_64-*-* } } } */
/* To test reassoc can undistribute vector bit_field_ref on bitwise XOR.
v1, v2, v3, v4 of type vector int.
reassoc transforms
accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^
v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^
v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^
v4[0] ^ v4[1] ^ v4[2] ^ v4[3] ;
into:
T = v1 ^ v2 ^ v3 ^ v4;
accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3];
Fewer bit_field_refs, only four for 128 or more bits vector. */
/* Vector of four ints (16 bytes).  */
typedef int v4si __attribute__ ((vector_size (16)));
/* Bitwise-XOR ACCUMULATOR with every lane of v1, v2, v3 and v4 and
   return the result.  The per-lane element reads are the pattern the
   header comment expects reassoc to rewrite, so their shape must not
   change.  */
int
test (int accumulator, v4si v1, v4si v2, v4si v3, v4si v4)
{
accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3];
accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3];
accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3];
accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3];
return accumulator;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* i?86-*-* x86_64-*-* } } } } */
/* { dg-do compile } */
/* { dg-require-effective-target avx512f } */
/* { dg-options "-O2 -mavx512f -ffast-math -fdump-tree-reassoc1" } */
/* To test reassoc can undistribute vector bit_field_ref on multiple
vector machine modes.
v1, v2 of type vector 4 x float
v3, v4 of type vector 8 x float
v5, v6 of type vector 16 x float
reassoc transforms
accumulator += v1[0] + v1[1] + v1[2] + v1[3] +
v2[0] + v2[1] + v2[2] + v2[3] +
v3[0] + v3[1] + v3[2] + v3[3] +
v3[4] + v3[5] + v3[6] + v3[7] +
v4[0] + v4[1] + v4[2] + v4[3] +
v4[4] + v4[5] + v4[6] + v4[7] +
v5[0] + v5[1] + v5[2] + v5[3] +
v5[4] + v5[5] + v5[6] + v5[7] +
v5[8] + v5[9] + v5[10] + v5[11] +
v5[12] + v5[13] + v5[14] + v5[15] +
v6[0] + v6[1] + v6[2] + v6[3] +
v6[4] + v6[5] + v6[6] + v6[7] +
v6[8] + v6[9] + v6[10] + v6[11] +
v6[12] + v6[13] + v6[14] + v6[15] ;
into:
T12 = v1 + v2;
T34 = v3 + v4;
T56 = v5 + v6;
accumulator += T12[0] + T12[1] + T12[2] + T12[3] +
T34[0] + T34[1] + T34[2] + T34[3] +
T34[4] + T34[5] + T34[6] + T34[7] +
T56[0] + T56[1] + T56[2] + T56[3] +
T56[4] + T56[5] + T56[6] + T56[7] +
T56[8] + T56[9] + T56[10] + T56[11] +
T56[12] + T56[13] + T56[14] + T56[15] ; */
/* Float vectors of 4, 8 and 16 lanes (16, 32 and 64 bytes).  */
typedef float v4sf __attribute__((vector_size(16)));
typedef float v8sf __attribute__((vector_size(32)));
typedef float v16sf __attribute__((vector_size(64)));
/* Add every lane of v1..v6 to ACCUMULATOR and return the sum.
   The arguments come in three vector widths, two vectors per width
   (v1/v2, v3/v4, v5/v6).  The lanes are read four at a time per
   statement; this per-lane shape is the pattern the header comment
   expects reassoc to rewrite, so it must not change.  */
float
test (float accumulator, v4sf v1, v4sf v2, v8sf v3, v8sf v4, v16sf v5, v16sf v6)
{
/* 4-lane vectors.  */
accumulator += v1[0] + v1[1] + v1[2] + v1[3];
accumulator += v2[0] + v2[1] + v2[2] + v2[3];
/* 8-lane vectors.  */
accumulator += v3[0] + v3[1] + v3[2] + v3[3];
accumulator += v3[4] + v3[5] + v3[6] + v3[7];
accumulator += v4[0] + v4[1] + v4[2] + v4[3];
accumulator += v4[4] + v4[5] + v4[6] + v4[7];
/* 16-lane vectors.  */
accumulator += v5[0] + v5[1] + v5[2] + v5[3];
accumulator += v5[4] + v5[5] + v5[6] + v5[7];
accumulator += v5[8] + v5[9] + v5[10] + v5[11];
accumulator += v5[12] + v5[13] + v5[14] + v5[15];
accumulator += v6[0] + v6[1] + v6[2] + v6[3];
accumulator += v6[4] + v6[5] + v6[6] + v6[7];
accumulator += v6[8] + v6[9] + v6[10] + v6[11];
accumulator += v6[12] + v6[13] + v6[14] + v6[15];
return accumulator;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 28 "reassoc1" } } */
/* { dg-do run } */
/* { dg-require-effective-target avx512f_runtime } */
/* { dg-options "-O2 -mavx512f -ffast-math -fdump-tree-reassoc1" } */
/* To test reassoc can undistribute vector bit_field_ref on multiple
vector machine modes, bypass those modes with only one candidate.
v1, v2 of type vector 4 x float
v3 of type vector 8 x float
v5, v6 of type vector 16 x float
reassoc transforms
accumulator += v1[0] + v1[1] + v1[2] + v1[3] +
v2[0] + v2[1] + v2[2] + v2[3] +
v3[0] + v3[1] + v3[2] + v3[3] +
v3[4] + v3[5] + v3[6] + v3[7] +
v5[0] + v5[1] + v5[2] + v5[3] +
v5[4] + v5[5] + v5[6] + v5[7] +
v5[8] + v5[9] + v5[10] + v5[11] +
v5[12] + v5[13] + v5[14] + v5[15] +
v6[0] + v6[1] + v6[2] + v6[3] +
v6[4] + v6[5] + v6[6] + v6[7] +
v6[8] + v6[9] + v6[10] + v6[11] +
v6[12] + v6[13] + v6[14] + v6[15] ;
into:
T12 = v1 + v2;
T56 = v5 + v6;
accumulator += T12[0] + T12[1] + T12[2] + T12[3] +
v3[0] + v3[1] + v3[2] + v3[3] +
v3[4] + v3[5] + v3[6] + v3[7] +
T56[0] + T56[1] + T56[2] + T56[3] +
T56[4] + T56[5] + T56[6] + T56[7] +
T56[8] + T56[9] + T56[10] + T56[11] +
T56[12] + T56[13] + T56[14] + T56[15] ; */
/* Float vectors of 4, 8 and 16 lanes (16, 32 and 64 bytes).  */
typedef float v4sf __attribute__((vector_size(16)));
typedef float v8sf __attribute__((vector_size(32)));
typedef float v16sf __attribute__((vector_size(64)));
/* Add every lane of v1, v2, v3, v5 and v6 to ACCUMULATOR and return
   the sum.  Unlike the 4-lane and 16-lane widths, the 8-lane width has
   only one vector (v3); the header comment describes that mode as one
   reassoc should bypass.  The per-lane statement shape is the pattern
   under test and must not change.  Declared noinline; presumably so
   the body is optimized on its own rather than folded with the
   constant inputs in main -- confirm.  */
__attribute__ ((noinline))
float test(float accumulator, v4sf v1, v4sf v2, v8sf v3, v16sf v5, v16sf v6) {
/* 4-lane vectors.  */
accumulator += v1[0] + v1[1] + v1[2] + v1[3];
accumulator += v2[0] + v2[1] + v2[2] + v2[3];
/* The only 8-lane vector.  */
accumulator += v3[0] + v3[1] + v3[2] + v3[3];
accumulator += v3[4] + v3[5] + v3[6] + v3[7];
/* 16-lane vectors.  */
accumulator += v5[0] + v5[1] + v5[2] + v5[3];
accumulator += v5[4] + v5[5] + v5[6] + v5[7];
accumulator += v5[8] + v5[9] + v5[10] + v5[11];
accumulator += v5[12] + v5[13] + v5[14] + v5[15];
accumulator += v6[0] + v6[1] + v6[2] + v6[3];
accumulator += v6[4] + v6[5] + v6[6] + v6[7];
accumulator += v6[8] + v6[9] + v6[10] + v6[11];
accumulator += v6[12] + v6[13] + v6[14] + v6[15];
return accumulator;
}
extern void abort (void);

/* Runtime driver: the five vectors hold the values 1.0 through 48.0,
   whose sum is 1176; with the starting accumulator of 24 the expected
   result is exactly 1200.  Abort on any other value.  */
int
main ()
{
  v4sf quad_a = {1.0, 2.0, 3.0, 4.0 };
  v4sf quad_b = {5.0, 6.0, 7.0, 8.0 };
  v8sf oct = {9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0 };
  v16sf hex_a = {17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0,
		 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0};
  v16sf hex_b = {33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0,
		 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0};
  const double expected = 1200.0;
  float start = 24.0;
  double got = test (start, quad_a, quad_b, oct, hex_a, hex_b);

  if (!(got == expected))
    abort();
  return 0;
}
/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 28 "reassoc1" } } */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment