Commit 2f62165d by Ganesh Gopalasubramanian Committed by Ganesh Gopalasubramanian

Enable TARGET_LOOP_UNROLL_ADJUST for bdver3/bdver4

From-SVN: r206110
parent f317df4f
2013-12-19 Ganesh Gopalasubramanian <Ganesh.Gopalasubramanian@amd.com>
* config/i386/i386.c: Include cfgloop.h.
(ix86_loop_memcount): New function.
(ix86_loop_unroll_adjust): New function.
(TARGET_LOOP_UNROLL_ADJUST): Define.
* config/i386/i386.h
(TARGET_ADJUST_UNROLL): Define.
* config/i386/x86-tune.def
(X86_TUNE_ADJUST_UNROLL): Define.
2013-12-19 Marek Polacek <polacek@redhat.com>
* config/i386/i386.c (ix86_parse_stringop_strategy_string): Remove
......@@ -64,6 +64,7 @@ along with GCC; see the file COPYING3. If not see
#include "is-a.h"
#include "gimple.h"
#include "gimplify.h"
#include "cfgloop.h"
#include "dwarf2.h"
#include "df.h"
#include "tm-constrs.h"
......@@ -44014,6 +44015,64 @@ ix86_simd_clone_usable (struct cgraph_node *node)
}
}
/* for_each_rtx callback: if *X is a memory reference, add its weight
   to the running total pointed to by MEM_COUNT.  A reference whose
   mode spans more than four machine words is weighted as two
   references; any other memory reference is weighted as one.
   The total is used to pick the unrolling factor for bdver3 and
   bdver4.  Always returns 0.  */

static int
ix86_loop_memcount (rtx *x, unsigned *mem_count)
{
  rtx ref = *x;
  unsigned int words;

  /* Nothing to count for null sub-expressions or non-MEM rtxes.  */
  if (ref == NULL_RTX || !MEM_P (ref))
    return 0;

  words = GET_MODE_SIZE (GET_MODE (ref)) / UNITS_PER_WORD;
  *mem_count += (words > 4) ? 2 : 1;

  return 0;
}
/* Implement TARGET_LOOP_UNROLL_ADJUST.

   Adjust the unroll factor based on the hardware capabilities.  For
   example, bdver3 has a loop buffer which makes unrolling of smaller
   loops less important.  The number of memory references in the loop
   body (as weighted by ix86_loop_memcount) is used as the heuristic,
   with a budget of 32 references for the unrolled loop.

   NUNROLL is the unroll factor chosen by the caller and LOOP is the
   loop being considered.  Returns NUNROLL unchanged when
   TARGET_ADJUST_UNROLL is off, when the loop has no memory references,
   or when it has more than 32.  Otherwise returns 32 / mem_count,
   but never more than NUNROLL: this hook only reduces the factor.  */

static unsigned
ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
{
  basic_block *bbs;
  rtx insn;
  rtx stop;
  unsigned i;
  unsigned mem_count = 0;
  unsigned adjusted;

  if (!TARGET_ADJUST_UNROLL)
    return nunroll;

  /* Count the number of memory references within the loop body.  */
  bbs = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      /* Iterate up to and including BB_END.  Using BB_END itself as
	 the sentinel would skip the last insn of every block.  */
      stop = NEXT_INSN (BB_END (bbs[i]));
      for (insn = BB_HEAD (bbs[i]); insn && insn != stop;
	   insn = NEXT_INSN (insn))
	if (NONDEBUG_INSN_P (insn))
	  for_each_rtx (&insn, (rtx_function) ix86_loop_memcount, &mem_count);
    }
  /* get_loop_body's result is released by the caller.  */
  free (bbs);

  if (mem_count && mem_count <= 32)
    {
      /* Do not exceed the factor the caller already settled on.  */
      adjusted = 32 / mem_count;
      return adjusted < nunroll ? adjusted : nunroll;
    }

  return nunroll;
}
/* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
static bool
......@@ -44499,6 +44558,9 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif
/* Use ix86_loop_unroll_adjust as the loop-unroll-adjust target hook.  */
#undef TARGET_LOOP_UNROLL_ADJUST
#define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class
......@@ -443,6 +443,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE]
#define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \
ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS]
/* Tuning test: reads the X86_TUNE_ADJUST_UNROLL entry of
   ix86_tune_features.  */
#define TARGET_ADJUST_UNROLL \
ix86_tune_features[X86_TUNE_ADJUST_UNROLL]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
......
......@@ -503,3 +503,9 @@ DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0)
arithmetic to 32bit via PROMOTE_MODE macro. This code generation scheme
is usually used for RISC targets. */
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0)
/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
on hardware capabilities. Bdver3 hardware has a loop buffer which makes
unrolling small loops less important. For such architectures we adjust
the unroll factor so that the unrolled loop fits the loop buffer. */
DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment