Commit df10b6d4 by Michael Meissner Committed by Michael Meissner

PR target/48258, improve vector reduction on power7

From-SVN: r172981
parent 35a5db04
2011-04-26 Michael Meissner <meissner@linux.vnet.ibm.com>
PR target/48258
* config/rs6000/vector.md (UNSPEC_REDUC): New unspec for vector
reduction.
(VEC_reduc): New code iterator and splitters for vector reduction.
(VEC_reduc_name): Ditto.
(VEC_reduc_rtx): Ditto.
(reduc_<VEC_reduc_name>_v2df): Vector reduction expanders for VSX.
(reduc_<VEC_reduc_name>_v4sf): Ditto.
* config/rs6000/rs6000.c (rs6000_expand_vector_extract): Add
support for extracting SF on VSX.
* config/rs6000/vsx.md (vsx_xscvspdp_scalar2): New insn for
generating xscvspdp.
(vsx_extract_v4sf): New insn to extract SF from V4SF vector.
(vsx_reduc_<VEC_reduc_name>_v2df): New insns and splitters for
double add, minimum, maximum vector reduction.
(vsx_reduc_<VEC_reduc_name>_v4sf): Ditto.
(vsx_reduc_<VEC_reduc_name>_v2df_scalar): New combiner insn to
optimize double vector reduction.
(vsx_reduc_<VEC_reduc_name>_v4sf_scalar): Ditto.
2011-04-26 Joseph Myers <joseph@codesourcery.com>
* config/fr30/fr30.h (inhibit_libc): Don't define.
......
......@@ -5463,12 +5463,22 @@ rs6000_expand_vector_extract (rtx target, rtx vec, int elt)
enum machine_mode inner_mode = GET_MODE_INNER (mode);
rtx mem;
if (VECTOR_MEM_VSX_P (mode) && (mode == V2DFmode || mode == V2DImode))
if (VECTOR_MEM_VSX_P (mode))
{
rtx (*extract_func) (rtx, rtx, rtx)
= ((mode == V2DFmode) ? gen_vsx_extract_v2df : gen_vsx_extract_v2di);
emit_insn (extract_func (target, vec, GEN_INT (elt)));
return;
switch (mode)
{
default:
break;
case V2DFmode:
emit_insn (gen_vsx_extract_v2df (target, vec, GEN_INT (elt)));
return;
case V2DImode:
emit_insn (gen_vsx_extract_v2di (target, vec, GEN_INT (elt)));
return;
case V4SFmode:
emit_insn (gen_vsx_extract_v4sf (target, vec, GEN_INT (elt)));
return;
}
}
/* Allocate mode-sized buffer. */
......
......@@ -74,7 +74,19 @@
(V2DF "V2DI")])
;; constants for unspec
(define_c_enum "unspec" [UNSPEC_PREDICATE])
(define_c_enum "unspec" [UNSPEC_PREDICATE
UNSPEC_REDUC])
;; Vector reduction code iterators
;; VEC_reduc instantiates each reduction pattern once for each of the RTL
;; codes plus, smin and smax.
(define_code_iterator VEC_reduc [plus smin smax])
;; Name fragment substituted into the reduction pattern names
;; (reduc_<VEC_reduc_name>_v2df, reduc_<VEC_reduc_name>_v4sf, ...).  Note that
;; plus maps to "splus", not "plus".
(define_code_attr VEC_reduc_name [(plus "splus")
(smin "smin")
(smax "smax")])
;; Name fragment used by the reduction splitters to build the generator
;; calls gen_<VEC_reduc_rtx>v2df3, gen_<VEC_reduc_rtx>v4sf3 and
;; gen_<VEC_reduc_rtx>df3.  Note that plus maps to "add".
(define_code_attr VEC_reduc_rtx [(plus "add")
(smin "smin")
(smax "smax")])
;; Vector move instructions.
......@@ -991,6 +1003,41 @@
"TARGET_ALTIVEC"
"")
;; Vector reduction expanders for VSX
;;
;; V2DF reduction, instantiated for each code in VEC_reduc (so the pattern
;; names are reduc_splus_v2df, reduc_smin_v2df and reduc_smax_v2df).
;; The emitted RTL applies the reduction code to two V2DF values: operand 1
;; with its two DF elements swapped (the vec_concat selects element 1 first,
;; then element 0) and operand 1 itself.  A V2DF scratch is clobbered so that
;; a temporary is available later.
;; The preparation statements are empty: the expander only emits this RTL.
;; It is enabled when VECTOR_UNIT_VSX_P (V2DFmode) holds.
(define_expand "reduc_<VEC_reduc_name>_v2df"
[(parallel [(set (match_operand:V2DF 0 "vfloat_operand" "")
(VEC_reduc:V2DF
(vec_concat:V2DF
(vec_select:DF
(match_operand:V2DF 1 "vfloat_operand" "")
(parallel [(const_int 1)]))
(vec_select:DF
(match_dup 1)
(parallel [(const_int 0)])))
(match_dup 1)))
(clobber (match_scratch:V2DF 2 ""))])]
"VECTOR_UNIT_VSX_P (V2DFmode)"
"")
; The (VEC_reduc:V4SF
; (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
; (op1))
;
; is to allow us to use a code iterator, but not completely list all of the
; vector rotates, etc. to prevent canonicalization
;
; V4SF reduction, instantiated for each code in VEC_reduc (pattern names
; reduc_splus_v4sf, reduc_smin_v4sf and reduc_smax_v4sf).  The unspec is the
; first operand of the reduction code and the real input, operand 1, is the
; second.  Two V4SF scratch registers are clobbered so that temporaries are
; available later.  The preparation statements are empty: the expander only
; emits this RTL.  It is enabled when VECTOR_UNIT_VSX_P (V4SFmode) holds.
(define_expand "reduc_<VEC_reduc_name>_v4sf"
[(parallel [(set (match_operand:V4SF 0 "vfloat_operand" "")
(VEC_reduc:V4SF
(unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
(match_operand:V4SF 1 "vfloat_operand" "")))
(clobber (match_scratch:V4SF 2 ""))
(clobber (match_scratch:V4SF 3 ""))])]
"VECTOR_UNIT_VSX_P (V4SFmode)"
"")
;;; Expanders for vector insn patterns shared between the SPE and TARGET_PAIRED systems.
(define_expand "absv2sf2"
......
......@@ -829,6 +829,15 @@
"xscvdpsp %x0,%x1"
[(set_attr "type" "fp")])
;; Same as vsx_xscvspdp, but use SF as the type
;; Operand 1 is a V4SF value in a VSX register (constraint "wa"); operand 0
;; is the SF result in a register matching constraint "f".  The conversion is
;; represented as an UNSPEC_VSX_CVSPDP unspec and is output as a single
;; xscvspdp instruction.
;; NOTE(review): the insn condition tests VECTOR_UNIT_VSX_P (DFmode) although
;; the input operand is V4SF -- confirm that DFmode is the intended mode.
(define_insn "vsx_xscvspdp_scalar2"
[(set (match_operand:SF 0 "vsx_register_operand" "=f")
(unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
UNSPEC_VSX_CVSPDP))]
"VECTOR_UNIT_VSX_P (DFmode)"
"xscvspdp %x0,%x1"
[(set_attr "type" "fp")])
;; Convert from 64-bit to 32-bit types
;; Note, favor the Altivec registers since the usual use of these instructions
;; is in vector converts and we need to use the Altivec vperm instruction.
......@@ -1039,6 +1048,43 @@
[(set_attr "type" "fpload")
(set_attr "length" "4")])
;; Extract a SF element from V4SF
;;
;; Operand 1 is the V4SF source, operand 2 the constant element number and
;; operand 0 the SF result.
;;
;; Alternatives:
;;   1. Constraint "O" on operand 2: output directly as one xscvspdp of
;;      operand 1 (length 4); the scratch is unused ("X").
;;   2. Constraint "i" on operand 2: output as "#" and split (length 8); the
;;      V4SF scratch, operand 3, is tied to operand 0 (constraint "0").
;;
;; Splitter:
;;   - Element number 0: apply vsx_xscvspdp_scalar2 to operand 1 directly.
;;   - Otherwise: first emit vsx_xxsldwi_v4sf with operand 1 as both inputs
;;     and the element number as the last (shift) operand, writing to
;;     operand 3 (or to a new pseudo when operand 3 is still a SCRATCH);
;;     then apply vsx_xscvspdp_scalar2 to that temporary.
(define_insn_and_split "vsx_extract_v4sf"
[(set (match_operand:SF 0 "vsx_register_operand" "=f,f")
(vec_select:SF
(match_operand:V4SF 1 "vsx_register_operand" "wa,wa")
(parallel [(match_operand:QI 2 "u5bit_cint_operand" "O,i")])))
(clobber (match_scratch:V4SF 3 "=X,0"))]
"VECTOR_UNIT_VSX_P (V4SFmode)"
"@
xscvspdp %x0,%x1
#"
""
[(const_int 0)]
"
{
rtx op0 = operands[0];
rtx op1 = operands[1];
rtx op2 = operands[2];
rtx op3 = operands[3];
rtx tmp;
HOST_WIDE_INT ele = INTVAL (op2);
if (ele == 0)
tmp = op1;
else
{
if (GET_CODE (op3) == SCRATCH)
op3 = gen_reg_rtx (V4SFmode);
emit_insn (gen_vsx_xxsldwi_v4sf (op3, op1, op1, op2));
tmp = op3;
}
emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp));
DONE;
}"
[(set_attr "length" "4,8")
(set_attr "type" "fp")])
;; General double word oriented permute, allow the other vector types for
;; optimizing the permute instruction.
(define_insn "vsx_xxpermdi_<mode>"
......@@ -1150,3 +1196,153 @@
"VECTOR_MEM_VSX_P (<MODE>mode)"
"xxsldwi %x0,%x1,%x2,%3"
[(set_attr "type" "vecperm")])
;; Vector reduction insns and splitters
;;
;; V2DF reduction, instantiated for each code in VEC_reduc.  The RTL has the
;; same shape as the one emitted by the reduc_<VEC_reduc_name>_v2df expander:
;; the reduction code applied to operand 1 with its elements swapped and to
;; operand 1 itself.
;;
;; The insn is always output as "#" and split into two insns (length 8):
;;   1. vsx_xxsldwi_v2df of operand 1 with itself, shift operand 2, into a
;;      temporary;
;;   2. <VEC_reduc_rtx>v2df3 (addv2df3/sminv2df3/smaxv2df3) combining the
;;      temporary with operand 1 into operand 0.
;; The temporary is operand 2, or a new pseudo when operand 2 is still a
;; SCRATCH.
;;
;; Alternatives: in the first two the scratch is tied to operand 0
;; (constraint "0") and operand 0 is earlyclobber ("&"); in the last two the
;; scratch is a separate earlyclobber register.
(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df"
[(set (match_operand:V2DF 0 "vfloat_operand" "=&wd,&?wa,wd,?wa")
(VEC_reduc:V2DF
(vec_concat:V2DF
(vec_select:DF
(match_operand:V2DF 1 "vfloat_operand" "wd,wa,wd,wa")
(parallel [(const_int 1)]))
(vec_select:DF
(match_dup 1)
(parallel [(const_int 0)])))
(match_dup 1)))
(clobber (match_scratch:V2DF 2 "=0,0,&wd,&wa"))]
"VECTOR_UNIT_VSX_P (V2DFmode)"
"#"
""
[(const_int 0)]
"
{
rtx tmp = (GET_CODE (operands[2]) == SCRATCH)
? gen_reg_rtx (V2DFmode)
: operands[2];
emit_insn (gen_vsx_xxsldwi_v2df (tmp, operands[1], operands[1], const2_rtx));
emit_insn (gen_<VEC_reduc_rtx>v2df3 (operands[0], tmp, operands[1]));
DONE;
}"
[(set_attr "length" "8")
(set_attr "type" "veccomplex")])
;; V4SF reduction, instantiated for each code in VEC_reduc.  The RTL has the
;; same shape as the one emitted by the reduc_<VEC_reduc_name>_v4sf expander:
;; the reduction code applied to an UNSPEC_REDUC unspec and operand 1.
;;
;; The insn is always output as "#" and split into four insns (length 16),
;; where OP is <VEC_reduc_rtx>v4sf3 (addv4sf3/sminv4sf3/smaxv4sf3):
;;   1. tmp2     = vsx_xxsldwi_v4sf (op1, op1, 2)
;;   2. tmp3     = OP (tmp2, op1)
;;   3. tmp4     = vsx_xxsldwi_v4sf (tmp3, tmp3, 3)
;;   4. operand0 = OP (tmp4, tmp3)
;;
;; When new pseudos can still be created the three temporaries are fresh
;; registers.  Otherwise tmp2 and tmp3 are the scratch operands 2 and 3, and
;; tmp4 reuses operand 2, whose value from step 1 is no longer needed after
;; step 2.  Both scratches are earlyclobber ("&").
(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf"
[(set (match_operand:V4SF 0 "vfloat_operand" "=wf,?wa")
(VEC_reduc:V4SF
(unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
(match_operand:V4SF 1 "vfloat_operand" "wf,wa")))
(clobber (match_scratch:V4SF 2 "=&wf,&wa"))
(clobber (match_scratch:V4SF 3 "=&wf,&wa"))]
"VECTOR_UNIT_VSX_P (V4SFmode)"
"#"
""
[(const_int 0)]
"
{
rtx op0 = operands[0];
rtx op1 = operands[1];
rtx tmp2, tmp3, tmp4;
if (can_create_pseudo_p ())
{
tmp2 = gen_reg_rtx (V4SFmode);
tmp3 = gen_reg_rtx (V4SFmode);
tmp4 = gen_reg_rtx (V4SFmode);
}
else
{
tmp2 = operands[2];
tmp3 = operands[3];
tmp4 = tmp2;
}
emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx));
emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1));
emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3)));
emit_insn (gen_<VEC_reduc_rtx>v4sf3 (op0, tmp4, tmp3));
DONE;
}"
[(set_attr "length" "16")
(set_attr "type" "veccomplex")])
;; Combiner patterns for the vector reduction patterns that know we can get
;; to the top element of the V2DF array without doing an extract.
;; Matches a vec_select of element 1 from the V2DF reduction RTL (the
;; reduction code applied to operand 1 with its elements swapped and to
;; operand 1 itself), producing a DF result in operand 0.
;;
;; The insn is always output as "#" and split into two insns (length 8):
;;   1. vsx_extract_v2df of element 1 of operand 1 into `lo' (operand 2, or a
;;      new DF pseudo when operand 2 is still a SCRATCH);
;;   2. the scalar <VEC_reduc_rtx>df3 (adddf3/smindf3/smaxdf3) of `hi' and
;;      `lo' into operand 0, where `hi' is gen_highpart (DFmode, operand 1),
;;      i.e. the other element is accessed in place rather than extracted.
;;
;; Alternatives: in the first two the scratch is tied to operand 0
;; (constraint "0") and operand 0 is earlyclobber ("&"); in the last two the
;; scratch is a separate earlyclobber register.
(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df_scalar"
[(set (match_operand:DF 0 "vfloat_operand" "=&ws,&?wa,ws,?wa")
(vec_select:DF
(VEC_reduc:V2DF
(vec_concat:V2DF
(vec_select:DF
(match_operand:V2DF 1 "vfloat_operand" "wd,wa,wd,wa")
(parallel [(const_int 1)]))
(vec_select:DF
(match_dup 1)
(parallel [(const_int 0)])))
(match_dup 1))
(parallel [(const_int 1)])))
(clobber (match_scratch:DF 2 "=0,0,&wd,&wa"))]
"VECTOR_UNIT_VSX_P (V2DFmode)"
"#"
""
[(const_int 0)]
"
{
rtx hi = gen_highpart (DFmode, operands[1]);
rtx lo = (GET_CODE (operands[2]) == SCRATCH)
? gen_reg_rtx (DFmode)
: operands[2];
emit_insn (gen_vsx_extract_v2df (lo, operands[1], const1_rtx));
emit_insn (gen_<VEC_reduc_rtx>df3 (operands[0], hi, lo));
DONE;
}"
[(set_attr "length" "8")
(set_attr "type" "veccomplex")])
;; Matches a vec_select of element 3 from the V4SF reduction RTL (the
;; reduction code applied to an UNSPEC_REDUC unspec and operand 1), producing
;; an SF result in operand 0.
;;
;; The insn is always output as "#" and split into five insns (length 20),
;; where OP is <VEC_reduc_rtx>v4sf3 (addv4sf3/sminv4sf3/smaxv4sf3):
;;   1. tmp2     = vsx_xxsldwi_v4sf (op1, op1, 2)
;;   2. tmp3     = OP (tmp2, op1)
;;   3. tmp4     = vsx_xxsldwi_v4sf (tmp3, tmp3, 3)
;;   4. tmp5     = OP (tmp4, tmp3)
;;   5. operand0 = vsx_xscvspdp_scalar2 (tmp5)
;;
;; When new pseudos can still be created the four temporaries are fresh
;; registers.  Otherwise tmp2 and tmp3 are scratch operands 2 and 3, tmp4
;; reuses operand 2, and tmp5 is scratch operand 4, which is tied to
;; operand 0 (constraint "0").
;;
;; NOTE(review): the pattern selects element 3, but step 5 converts tmp5
;; without shifting any particular element into place.  This looks safe
;; because after steps 1-4 every element of tmp5 should hold the complete
;; reduction -- confirm.
(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf_scalar"
[(set (match_operand:SF 0 "vfloat_operand" "=f,?f")
(vec_select:SF
(VEC_reduc:V4SF
(unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
(match_operand:V4SF 1 "vfloat_operand" "wf,wa"))
(parallel [(const_int 3)])))
(clobber (match_scratch:V4SF 2 "=&wf,&wa"))
(clobber (match_scratch:V4SF 3 "=&wf,&wa"))
(clobber (match_scratch:V4SF 4 "=0,0"))]
"VECTOR_UNIT_VSX_P (V4SFmode)"
"#"
""
[(const_int 0)]
"
{
rtx op0 = operands[0];
rtx op1 = operands[1];
rtx tmp2, tmp3, tmp4, tmp5;
if (can_create_pseudo_p ())
{
tmp2 = gen_reg_rtx (V4SFmode);
tmp3 = gen_reg_rtx (V4SFmode);
tmp4 = gen_reg_rtx (V4SFmode);
tmp5 = gen_reg_rtx (V4SFmode);
}
else
{
tmp2 = operands[2];
tmp3 = operands[3];
tmp4 = tmp2;
tmp5 = operands[4];
}
emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx));
emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1));
emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3)));
emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp5, tmp4, tmp3));
emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp5));
DONE;
}"
[(set_attr "length" "20")
(set_attr "type" "veccomplex")])
2011-03-23 Michael Meissner <meissner@linux.vnet.ibm.com>
PR target/48258
* gcc.target/powerpc/pr48258-1.c: New file.
* gcc.target/powerpc/pr48258-2.c: Ditto.
2011-04-26 Xinliang David Li <davidxl@google.com>
* gcc.dg/uninit-suppress.c: New test.
* gcc.dg/uninit-suppress.c: New test.
......
/* { dg-do compile } */
/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
/* { dg-require-effective-target powerpc_vsx_ok } */
/* { dg-options "-O3 -mcpu=power7 -mabi=altivec -ffast-math -fno-unroll-loops" } */
/* { dg-final { scan-assembler-times "xvaddsp" 3 } } */
/* { dg-final { scan-assembler-times "xvminsp" 3 } } */
/* { dg-final { scan-assembler-times "xvmaxsp" 3 } } */
/* { dg-final { scan-assembler-times "xxsldwi" 6 } } */
/* { dg-final { scan-assembler-times "xscvspdp" 3 } } */
/* { dg-final { scan-assembler-not "stvewx" } } */
/* { dg-final { scan-assembler-not "stvx" } } */
/* { dg-final { scan-assembler-not "stxvd2x" } } */
/* { dg-final { scan-assembler-not "stxvw4x" } } */
#include <stddef.h>
#ifndef SIZE
#define SIZE 1024
#endif
float values[SIZE] __attribute__((__aligned__(32)));
/* Return the sum of all SIZE elements of the global float array `values',
   accumulated in index order starting from 0.0f.

   This file is a compile-only test (dg-do compile) whose dg-final directives
   scan the generated assembly, so the loop is kept in this simple form.  */
float
vector_sum (void)
{
size_t i;
float sum = 0.0f;
for (i = 0; i < SIZE; i++)
sum += values[i];
return sum;
}
/* Return the minimum of all SIZE elements of the global float array
   `values', computed with __builtin_fminf.

   The running minimum is seeded with values[0] and the loop still starts at
   index 0, so element 0 is compared with itself once; this does not change
   the result.  This file is a compile-only test whose dg-final directives
   scan the generated assembly, so the loop is kept in this simple form.  */
float
vector_min (void)
{
size_t i;
float min = values[0];
for (i = 0; i < SIZE; i++)
min = __builtin_fminf (min, values[i]);
return min;
}
/* Return the maximum of all SIZE elements of the global float array
   `values', computed with __builtin_fmaxf.

   The running maximum is seeded with values[0] and the loop still starts at
   index 0, so element 0 is compared with itself once; this does not change
   the result.  This file is a compile-only test whose dg-final directives
   scan the generated assembly, so the loop is kept in this simple form.  */
float
vector_max (void)
{
size_t i;
float max = values[0];
for (i = 0; i < SIZE; i++)
max = __builtin_fmaxf (max, values[i]);
return max;
}
/* { dg-do compile } */
/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
/* { dg-require-effective-target powerpc_vsx_ok } */
/* { dg-options "-O3 -mcpu=power7 -mabi=altivec -ffast-math -fno-unroll-loops" } */
/* { dg-final { scan-assembler-times "xvadddp" 1 } } */
/* { dg-final { scan-assembler-times "xvmindp" 1 } } */
/* { dg-final { scan-assembler-times "xvmaxdp" 1 } } */
/* { dg-final { scan-assembler-times "xsadddp" 1 } } */
/* { dg-final { scan-assembler-times "xsmindp" 1 } } */
/* { dg-final { scan-assembler-times "xsmaxdp" 1 } } */
/* { dg-final { scan-assembler-not "xxsldwi" } } */
/* { dg-final { scan-assembler-not "stvx" } } */
/* { dg-final { scan-assembler-not "stxvd2x" } } */
/* { dg-final { scan-assembler-not "stxvw4x" } } */
#include <stddef.h>
#ifndef SIZE
#define SIZE 1024
#endif
double values[SIZE] __attribute__((__aligned__(32)));
/* Return the sum of all SIZE elements of the global double array `values',
   accumulated in index order starting from 0.0.

   This file is a compile-only test (dg-do compile) whose dg-final directives
   scan the generated assembly, so the loop is kept in this simple form.  */
double
vector_sum (void)
{
size_t i;
double sum = 0.0;
for (i = 0; i < SIZE; i++)
sum += values[i];
return sum;
}
/* Return the minimum of all SIZE elements of the global double array
   `values', computed with __builtin_fmin.

   The running minimum is seeded with values[0] and the loop still starts at
   index 0, so element 0 is compared with itself once; this does not change
   the result.  This file is a compile-only test whose dg-final directives
   scan the generated assembly, so the loop is kept in this simple form.  */
double
vector_min (void)
{
size_t i;
double min = values[0];
for (i = 0; i < SIZE; i++)
min = __builtin_fmin (min, values[i]);
return min;
}
/* Return the maximum of all SIZE elements of the global double array
   `values', computed with __builtin_fmax.

   The running maximum is seeded with values[0] and the loop still starts at
   index 0, so element 0 is compared with itself once; this does not change
   the result.  This file is a compile-only test whose dg-final directives
   scan the generated assembly, so the loop is kept in this simple form.  */
double
vector_max (void)
{
size_t i;
double max = values[0];
for (i = 0; i < SIZE; i++)
max = __builtin_fmax (max, values[i]);
return max;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment