Commit 6bc0ff89 by Richard Henderson (committed by Richard Henderson)

Add rtx costs for sse integer ops

From-SVN: r188785
parent b1f75d76
2012-06-19 Richard Henderson <rth@redhat.com>
* config/i386/i386.c (ix86_rtx_costs): Handle CONST_VECTOR, and
integral vector modes.
2012-06-19  Uros Bizjak  <ubizjak@gmail.com>
	* config/i386/i386.md (FRNDINT_ROUNDING): New int iterator.
......
@@ -31990,13 +31990,16 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
 	  break;
 	case 0:
 	case -1:
-	  /* Start with (MEM (SYMBOL_REF)), since that's where
-	     it'll probably end up.  Add a penalty for size.  */
-	  *total = (COSTS_N_INSNS (1)
-		    + (flag_pic != 0 && !TARGET_64BIT)
-		    + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
 	  break;
 	}
+      /* FALLTHRU */
+    case CONST_VECTOR:
+      /* Start with (MEM (SYMBOL_REF)), since that's where
+	 it'll probably end up.  Add a penalty for size.  */
+      *total = (COSTS_N_INSNS (1)
+		+ (flag_pic != 0 && !TARGET_64BIT)
+		+ (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
       return true;

     case ZERO_EXTEND:
@@ -32016,8 +32019,9 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
       return false;

     case ASHIFT:
-      if (CONST_INT_P (XEXP (x, 1))
-	  && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
+      if (SCALAR_INT_MODE_P (mode)
+	  && GET_MODE_SIZE (mode) < UNITS_PER_WORD
+	  && CONST_INT_P (XEXP (x, 1)))
 	{
 	  HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
 	  if (value == 1)
@@ -32038,7 +32042,15 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
     case ASHIFTRT:
     case LSHIFTRT:
     case ROTATERT:
-      if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
+      if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+	{
+	  /* ??? Should be SSE vector operation cost.  */
+	  /* At least for published AMD latencies, this really is the same
+	     as the latency for a simple fpu operation like fabs.  */
+	  *total = cost->fabs;
+	  return false;
+	}
+      if (GET_MODE_SIZE (mode) < UNITS_PER_WORD)
 	{
 	  if (CONST_INT_P (XEXP (x, 1)))
 	    {
@@ -32107,6 +32119,16 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
 	  *total = cost->fmul;
 	  return false;
 	}
+      else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+	{
+	  /* Without sse4.1, we don't have PMULLD; it's emulated with 7
+	     insns, including two PMULUDQ.  */
+	  if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
+	    *total = cost->fmul * 2 + cost->fabs * 5;
+	  else
+	    *total = cost->fmul;
+	  return false;
+	}
       else
 	{
 	  rtx op0 = XEXP (x, 0);
@@ -32171,7 +32193,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
     case PLUS:
       if (GET_MODE_CLASS (mode) == MODE_INT
-	  && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
+	  && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
 	{
 	  if (GET_CODE (XEXP (x, 0)) == PLUS
 	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
...@@ -32271,6 +32293,14 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total, ...@@ -32271,6 +32293,14 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
/* FALLTHRU */ /* FALLTHRU */
case NOT: case NOT:
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
{
/* ??? Should be SSE vector operation cost. */
/* At least for published AMD latencies, this really is the same
as the latency for a simple fpu operation like fabs. */
*total = cost->fabs;
return false;
}
if (!TARGET_64BIT && mode == DImode) if (!TARGET_64BIT && mode == DImode)
*total = cost->add * 2; *total = cost->add * 2;
else else
@@ -32331,7 +32361,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
       /* ??? Assume all of these vector manipulation patterns are
 	 recognizable.  In which case they all pretty much have the
 	 same cost.  */
-      *total = COSTS_N_INSNS (1);
+      *total = cost->fabs;
       return true;

     default:
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.