Commit a520f3c3 by Jakub Jelinek Committed by Jakub Jelinek

sse.md (reduc_umin_v8hi): New pattern.

	* config/i386/sse.md (reduc_umin_v8hi): New pattern.
	* config/i386/i386.c (ix86_build_const_vector): Handle
	also V32QI, V16QI, V16HI and V8HI modes.
	(emit_reduc_half): New function.
	(ix86_expand_reduc): Use phminposuw insn for V8HImode UMIN.
	Use emit_reduc_half helper function.

	* gcc.target/i386/sse4_1-phminposuw-2.c: New test.
	* gcc.target/i386/sse4_1-phminposuw-3.c: New test.
	* gcc.target/i386/avx-vphminposuw-2.c: New test.
	* gcc.target/i386/avx-vphminposuw-3.c: New test.

From-SVN: r179929
parent 35f5b1c1
2011-10-13 Jakub Jelinek <jakub@redhat.com>
* config/i386/sse.md (reduc_umin_v8hi): New pattern.
* config/i386/i386.c (ix86_build_const_vector): Handle
also V32QI, V16QI, V16HI and V8HI modes.
(emit_reduc_half): New function.
(ix86_expand_reduc): Use phminposuw insn for V8HImode UMIN.
Use emit_reduc_half helper function.
2011-10-13 Lawrence Crowl <crowl@google.com>
Diego Novillo <dnovillo@google.com>
......@@ -17008,6 +17008,10 @@ ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
switch (mode)
{
case V32QImode:
case V16QImode:
case V16HImode:
case V8HImode:
case V8SImode:
case V4SImode:
case V4DImode:
......@@ -33250,72 +33254,100 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
}
}
/* Expand a vector reduction. FN is the binary pattern to reduce;
DEST is the destination; IN is the input vector. */
/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
The upper bits of DEST are undefined, though they shouldn't cause
exceptions (some bits from src or all zeros are ok). */
void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
static void
emit_reduc_half (rtx dest, rtx src, int i)
{
rtx tmp1, tmp2, tmp3, tmp4, tmp5;
enum machine_mode mode = GET_MODE (in);
int i;
tmp1 = gen_reg_rtx (mode);
tmp2 = gen_reg_rtx (mode);
tmp3 = gen_reg_rtx (mode);
switch (mode)
rtx tem;
switch (GET_MODE (src))
{
case V4SFmode:
emit_insn (gen_sse_movhlps (tmp1, in, in));
emit_insn (fn (tmp2, tmp1, in));
emit_insn (gen_sse_shufps_v4sf (tmp3, tmp2, tmp2,
const1_rtx, const1_rtx,
GEN_INT (1+4), GEN_INT (1+4)));
if (i == 128)
tem = gen_sse_movhlps (dest, src, src);
else
tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
GEN_INT (1 + 4), GEN_INT (1 + 4));
break;
case V2DFmode:
tem = gen_vec_interleave_highv2df (dest, src, src);
break;
case V16QImode:
case V8HImode:
case V4SImode:
case V2DImode:
tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
gen_lowpart (V1TImode, src),
GEN_INT (i / 2));
break;
case V8SFmode:
tmp4 = gen_reg_rtx (mode);
tmp5 = gen_reg_rtx (mode);
emit_insn (gen_avx_vperm2f128v8sf3 (tmp4, in, in, const1_rtx));
emit_insn (fn (tmp5, tmp4, in));
emit_insn (gen_avx_shufps256 (tmp1, tmp5, tmp5, GEN_INT (2+12)));
emit_insn (fn (tmp2, tmp1, tmp5));
emit_insn (gen_avx_shufps256 (tmp3, tmp2, tmp2, const1_rtx));
if (i == 256)
tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
else
tem = gen_avx_shufps256 (dest, src, src,
GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
break;
case V4DFmode:
emit_insn (gen_avx_vperm2f128v4df3 (tmp1, in, in, const1_rtx));
emit_insn (fn (tmp2, tmp1, in));
emit_insn (gen_avx_shufpd256 (tmp3, tmp2, tmp2, const1_rtx));
if (i == 256)
tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
else
tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
break;
case V32QImode:
case V16HImode:
case V8SImode:
case V4DImode:
emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, tmp1),
gen_lowpart (V4DImode, in),
gen_lowpart (V4DImode, in),
const1_rtx));
tmp4 = in;
tmp5 = tmp1;
for (i = 64; i >= GET_MODE_BITSIZE (GET_MODE_INNER (mode)); i >>= 1)
{
if (i != 64)
{
tmp2 = gen_reg_rtx (mode);
tmp3 = gen_reg_rtx (mode);
}
emit_insn (fn (tmp2, tmp4, tmp5));
emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, tmp3),
gen_lowpart (V2TImode, tmp2),
GEN_INT (i)));
tmp4 = tmp2;
tmp5 = tmp3;
}
if (i == 256)
tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
gen_lowpart (V4DImode, src),
gen_lowpart (V4DImode, src),
const1_rtx);
else
tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
gen_lowpart (V2TImode, src),
GEN_INT (i / 2));
break;
default:
gcc_unreachable ();
}
emit_insn (fn (dest, tmp2, tmp3));
emit_insn (tem);
}
/* Emit RTL that reduces the vector IN into DEST.  FN generates the
   binary operation being reduced (destination first, then the two
   inputs).  Each step combines the current vector with a copy of
   itself whose upper half has been moved down by emit_reduc_half,
   halving the width still to be reduced.  */

void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  enum machine_mode mode = GET_MODE (in);
  rtx acc = in;
  int elt_bits;
  int width;

  /* With SSE4.1 a V8HImode unsigned-min reduction is a single
     phminposuw insn.  */
  if (TARGET_SSE4_1 && mode == V8HImode && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  elt_bits = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
  width = GET_MODE_BITSIZE (mode);

  while (width > elt_bits)
    {
      rtx upper = gen_reg_rtx (mode);
      rtx result;

      emit_reduc_half (upper, acc, width);

      /* The final step (two elements left) writes DEST directly;
	 every earlier step goes through a fresh pseudo.  */
      result = (width == elt_bits * 2) ? dest : gen_reg_rtx (mode);

      emit_insn (fn (result, upper, acc));
      acc = result;
      width >>= 1;
    }
}
/* Target hook for scalar_mode_supported_p. */
......@@ -1303,6 +1303,16 @@
DONE;
})
;; Unsigned-minimum reduction of the V8HI vector in operand 1, with the
;; result placed in operand 0.  Only enabled when TARGET_SSE4_1 holds.
(define_expand "reduc_umin_v8hi"
[(umin:V8HI
(match_operand:V8HI 0 "register_operand" "")
(match_operand:V8HI 1 "register_operand" ""))]
"TARGET_SSE4_1"
{
/* All of the expansion is delegated to ix86_expand_reduc, which gets
   gen_uminv8hi3 as the binary operation, operands[0] as the
   destination and operands[1] as the input vector.  */
ix86_expand_reduc (gen_uminv8hi3, operands[0], operands[1]);
DONE;
})
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Parallel floating point comparisons
......
2011-10-13 Jakub Jelinek <jakub@redhat.com>
* gcc.target/i386/sse4_1-phminposuw-2.c: New test.
* gcc.target/i386/sse4_1-phminposuw-3.c: New test.
* gcc.target/i386/avx-vphminposuw-2.c: New test.
* gcc.target/i386/avx-vphminposuw-3.c: New test.
2011-10-13 H.J. Lu <hongjiu.lu@intel.com>
* gcc.target/i386/pr50712.c: Check ia32 instead of ilp32.
......
/* { dg-do run } */
/* { dg-require-effective-target avx } */
/* { dg-options "-O3 -mavx -mno-avx2" } */
#define CHECK_H "avx-check.h"
#define TEST avx_test
#include "sse4_1-phminposuw-2.c"
/* { dg-do compile } */
/* { dg-options "-O3 -mavx -mno-avx2" } */
#include "avx-vphminposuw-2.c"
/* { dg-final { scan-assembler "vphminposuw\[^\n\r\]*xmm" } } */
/* { dg-do run } */
/* { dg-require-effective-target sse4 } */
/* { dg-options "-O3 -msse4.1 -mno-avx2" } */
#ifndef CHECK_H
#define CHECK_H "sse4_1-check.h"
#endif
#ifndef TEST
#define TEST sse4_1_test
#endif
#include CHECK_H
/* Called by TEST when a reduction returns the wrong value.  */
extern void abort (void);
/* Number of elements in each input array.  */
#define N 1024
/* Signed input array read by vecsmax and vecsmin.  c and e are not
   referenced in the visible part of this file.  */
short a[N], c, e;
/* Unsigned input array read by vecumax and vecumin.  d and f are not
   referenced in the visible part of this file.  */
unsigned short b[N], d, f;
/* Return the largest element of the signed array a[0..N-1].  The
   running maximum starts at -32768 so that any element can replace
   it.  Marked noinline so the loop stays in its own function.  */
__attribute__((noinline)) short
vecsmax (void)
{
int i;
short r = -32768;
for (i = 0; i < N; ++i)
if (r < a[i]) r = a[i];
return r;
}
/* Return the largest element of the unsigned array b[0..N-1].  The
   running maximum starts at 0 so that any element can replace it.
   Marked noinline so the loop stays in its own function.  */
__attribute__((noinline)) unsigned short
vecumax (void)
{
int i;
unsigned short r = 0;
for (i = 0; i < N; ++i)
if (r < b[i]) r = b[i];
return r;
}
/* Return the smallest element of the signed array a[0..N-1].  The
   running minimum starts at 32767 so that any element can replace
   it.  Marked noinline so the loop stays in its own function.  */
__attribute__((noinline)) short
vecsmin (void)
{
int i;
short r = 32767;
for (i = 0; i < N; ++i)
if (r > a[i]) r = a[i];
return r;
}
/* Return the smallest element of the unsigned array b[0..N-1].  The
   running minimum starts at 65535 so that any element can replace
   it.  Marked noinline so the loop stays in its own function.
   NOTE(review): presumably this unsigned-min loop is the one the
   phminposuw scan-assembler tests are aimed at -- confirm.  */
__attribute__((noinline)) unsigned short
vecumin (void)
{
int i;
unsigned short r = 65535;
for (i = 0; i < N; ++i)
if (r > b[i]) r = b[i];
return r;
}
/* Test body; TEST is a macro that defaults to sse4_1_test.  Fills a[]
   with an ascending signed ramp and b[] with an ascending ramp centred
   on 32768, plants one maximum and one minimum outside the ramp in
   each array, and aborts unless all four reductions return the planted
   values.  */
static void
TEST (void)
{
  int idx = 0;

  while (idx < N)
    {
      a[idx] = idx - N / 2;
      b[idx] = idx + 32768 - N / 2;
      idx++;
    }

  /* Extremes lying outside the ramp values written above.  */
  a[N / 3] = N;
  a[2 * N / 3] = -N;
  b[N / 5] = 32768 + N;
  b[4 * N / 5] = 32768 - N;

  if (vecsmax () != N || vecsmin () != -N
      || vecumax () != 32768 + N || vecumin () != 32768 - N)
    abort ();
}
/* { dg-do compile } */
/* { dg-options "-O3 -msse4.1 -mno-avx2" } */
#include "sse4_1-phminposuw-2.c"
/* { dg-final { scan-assembler "phminposuw\[^\n\r\]*xmm" } } */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment