Commit 14b52538 by Changpeng Fang Committed by Changpeng Fang

Bobcat (btver1) Enablement

	* config.gcc (i[34567]86-*-linux* | ...): Add btver1.
	  (case ${target}): Add btver1.
	* config/i386/driver-i386.c (host_detect_local_cpu): Let
	  -march=native recognize btver1 processors.
	* config/i386/i386-c.c (ix86_target_macros_internal): Add
	  btver1 def_and_undef
	* config/i386/i386.c (struct processor_costs btver1_cost): New
	  btver1 cost table.
	  (m_BTVER1): New definition.
	  ( m_AMD_MULTIPLE): Includes m_BTVER1.
	  (initial_ix86_tune_features): Add btver1 tune.
	  (processor_target_table): Add btver1 entry.
	  (static const char *const cpu_names): Add btver1 entry.
	  (software_prefetching_beneficial_p): Add btver1.
	  (ix86_option_override_internal): Add btver1 instruction sets.
	  (ix86_issue_rate): Add btver1.
	  (ix86_adjust_cost): Add btver1.
	* config/i386/i386.h (TARGET_BTVER1): New definition.
	  (enum target_cpu_default): Add TARGET_CPU_DEFAULT_btver1.
	  (enum processor_type): Add PROCESSOR_BTVER1.
	* config/i386/i386.md (define_attr "cpu"): Add btver1.

From-SVN: r168556
parent a6098a28
2011-01-06 Changpeng Fang <changpeng.fang@amd.com>
Bobcat (btver1) Enablement
* config.gcc (i[34567]86-*-linux* | ...): Add btver1.
(case ${target}): Add btver1.
* config/i386/driver-i386.c (host_detect_local_cpu): Let
-march=native recognize btver1 processors.
* config/i386/i386-c.c (ix86_target_macros_internal): Add
btver1 def_and_undef
* config/i386/i386.c (struct processor_costs btver1_cost): New
btver1 cost table.
(m_BTVER1): New definition.
( m_AMD_MULTIPLE): Includes m_BTVER1.
(initial_ix86_tune_features): Add btver1 tune.
(processor_target_table): Add btver1 entry.
(static const char *const cpu_names): Add btver1 entry.
(software_prefetching_beneficial_p): Add btver1.
(ix86_option_override_internal): Add btver1 instruction sets.
(ix86_issue_rate): Add btver1.
(ix86_adjust_cost): Add btver1.
* config/i386/i386.h (TARGET_BTVER1): New definition.
(enum target_cpu_default): Add TARGET_CPU_DEFAULT_btver1.
(enum processor_type): Add PROCESSOR_BTVER1.
* config/i386/i386.md (define_attr "cpu"): Add btver1.
2011-01-06 Rainer Orth <ro@CeBiTec.Uni-Bielefeld.DE>
PR target/43309
......
......@@ -1237,7 +1237,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | i
need_64bit_hwint=yes
need_64bit_isa=yes
case X"${with_cpu}" in
Xgeneric|Xatom|Xcore2|Xcorei7|Xcorei7-avx|Xnocona|Xx86-64|Xbdver1|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx|Xathlon64-sse3|Xk8-sse3|Xopteron-sse3)
Xgeneric|Xatom|Xcore2|Xcorei7|Xcorei7-avx|Xnocona|Xx86-64|Xbdver1|Xbtver1|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx|Xathlon64-sse3|Xk8-sse3|Xopteron-sse3)
;;
X)
if test x$with_cpu_64 = x; then
......@@ -1246,7 +1246,7 @@ i[34567]86-*-linux* | i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | i
;;
*)
echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2
echo "generic atom core2 corei7 corei7-avx nocona x86-64 bdver1 amdfam10 barcelona k8 opteron athlon64 athlon-fx athlon64-sse3 k8-sse3 opteron-sse3" 1>&2
echo "generic atom core2 corei7 corei7-avx nocona x86-64 bdver1 btver1 amdfam10 barcelona k8 opteron athlon64 athlon-fx athlon64-sse3 k8-sse3 opteron-sse3" 1>&2
exit 1
;;
esac
......@@ -1375,7 +1375,7 @@ i[34567]86-*-solaris2*)
need_64bit_isa=yes
use_gcc_stdint=wrap
case X"${with_cpu}" in
Xgeneric|Xatom|Xcore2|Xcorei7|Xcorei7-avx|Xnocona|Xx86-64|Xbdver1|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx|Xathlon64-sse3|Xk8-sse3|Xopteron-sse3)
Xgeneric|Xatom|Xcore2|Xcorei7|Xcorei7-avx|Xnocona|Xx86-64|Xbdver1|Xbtver1|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx|Xathlon64-sse3|Xk8-sse3|Xopteron-sse3)
;;
X)
if test x$with_cpu_64 = x; then
......@@ -1384,7 +1384,7 @@ i[34567]86-*-solaris2*)
;;
*)
echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2
echo "generic atom core2 corei7 corei7-avx nocona x86-64 bdver1 amdfam10 barcelona k8 opteron athlon64 athlon-fx athlon64-sse3 k8-sse3 opteron-sse3" 1>&2
echo "generic atom core2 corei7 corei7-avx nocona x86-64 bdver1 btver1 amdfam10 barcelona k8 opteron athlon64 athlon-fx athlon64-sse3 k8-sse3 opteron-sse3" 1>&2
exit 1
;;
esac
......@@ -1455,7 +1455,7 @@ i[34567]86-*-mingw* | x86_64-*-mingw*)
if test x$enable_targets = xall; then
tm_defines="${tm_defines} TARGET_BI_ARCH=1"
case X"${with_cpu}" in
Xgeneric|Xatom|Xcore2|Xcorei7|Xcorei7-avx|Xnocona|Xx86-64|Xbdver1|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx|Xathlon64-sse3|Xk8-sse3|Xopteron-sse3)
Xgeneric|Xatom|Xcore2|Xcorei7|Xcorei7-avx|Xnocona|Xx86-64|Xbdver1|Xbtver1|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx|Xathlon64-sse3|Xk8-sse3|Xopteron-sse3)
;;
X)
if test x$with_cpu_64 = x; then
......@@ -1464,7 +1464,7 @@ i[34567]86-*-mingw* | x86_64-*-mingw*)
;;
*)
echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2
echo "generic atom core2 corei7 Xcorei7-avx nocona x86-64 bdver1 amdfam10 barcelona k8 opteron athlon64 athlon-fx athlon64-sse3 k8-sse3 opteron-sse3" 1>&2
echo "generic atom core2 corei7 Xcorei7-avx nocona x86-64 bdver1 btver1 amdfam10 barcelona k8 opteron athlon64 athlon-fx athlon64-sse3 k8-sse3 opteron-sse3" 1>&2
exit 1
;;
esac
......@@ -2798,6 +2798,10 @@ case ${target} in
arch=bdver1
cpu=bdver1
;;
btver1-*)
arch=btver1
cpu=btver1
;;
amdfam10-*|barcelona-*)
arch=amdfam10
cpu=amdfam10
......@@ -2887,6 +2891,10 @@ case ${target} in
arch=bdver1
cpu=bdver1
;;
btver1-*)
arch=btver1
cpu=btver1
;;
amdfam10-*|barcelona-*)
arch=amdfam10
cpu=amdfam10
......@@ -3303,7 +3311,7 @@ case "${target}" in
;;
"" | x86-64 | generic | native \
| k8 | k8-sse3 | athlon64 | athlon64-sse3 | opteron \
| opteron-sse3 | athlon-fx | bdver1 | amdfam10 \
| opteron-sse3 | athlon-fx | bdver1 | btver1 | amdfam10 \
| barcelona | nocona | core2 | corei7 | corei7-avx | atom)
# OK
;;
......
......@@ -500,6 +500,8 @@ const char *host_detect_local_cpu (int argc, const char **argv)
processor = PROCESSOR_GEODE;
else if (has_xop)
processor = PROCESSOR_BDVER1;
else if (has_sse4a && has_ssse3)
processor = PROCESSOR_BTVER1;
else if (has_sse4a)
processor = PROCESSOR_AMDFAM10;
else if (has_sse2 || has_longmode)
......@@ -647,6 +649,9 @@ const char *host_detect_local_cpu (int argc, const char **argv)
case PROCESSOR_BDVER1:
cpu = "bdver1";
break;
case PROCESSOR_BTVER1:
cpu = "btver1";
break;
default:
/* Use something reasonable. */
......
......@@ -110,6 +110,10 @@ ix86_target_macros_internal (int isa_flag,
def_or_undef (parse_in, "__bdver1");
def_or_undef (parse_in, "__bdver1__");
break;
case PROCESSOR_BTVER1:
def_or_undef (parse_in, "__btver1");
def_or_undef (parse_in, "__btver1__");
break;
case PROCESSOR_PENTIUM4:
def_or_undef (parse_in, "__pentium4");
def_or_undef (parse_in, "__pentium4__");
......@@ -194,6 +198,9 @@ ix86_target_macros_internal (int isa_flag,
case PROCESSOR_BDVER1:
def_or_undef (parse_in, "__tune_bdver1__");
break;
case PROCESSOR_BTVER1:
def_or_undef (parse_in, "__tune_btver1__");
break;
case PROCESSOR_PENTIUM4:
def_or_undef (parse_in, "__tune_pentium4__");
break;
......
......@@ -1231,6 +1231,88 @@ struct processor_costs bdver1_cost = {
1, /* cond_not_taken_branch_cost. */
};
struct processor_costs btver1_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 3}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 5}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
/* On K8:
MOVD reg64, xmmreg Double FSTORE 4
MOVD reg32, xmmreg Double FSTORE 4
On AMDFAM10:
MOVD reg64, xmmreg Double FADD 3
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
/* BTVER1 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
{{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
{{libcall, {{8, loop}, {24, unrolled_loop},
{2048, rep_prefix_4_byte}, {-1, libcall}}},
{libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
4, /* scalar_stmt_cost. */
2, /* scalar load_cost. */
2, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
2, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
2, /* vec_store_cost. */
2, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static const
struct processor_costs pentium4_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
......@@ -1624,7 +1706,8 @@ const struct processor_costs *ix86_cost = &pentium_cost;
#define m_ATHLON_K8 (m_K8 | m_ATHLON)
#define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
#define m_BDVER1 (1<<PROCESSOR_BDVER1)
#define m_AMD_MULTIPLE (m_K8 | m_ATHLON | m_AMDFAM10 | m_BDVER1)
#define m_BTVER1 (1<<PROCESSOR_BTVER1)
#define m_AMD_MULTIPLE (m_K8 | m_ATHLON | m_AMDFAM10 | m_BDVER1 | m_BTVER1)
#define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
#define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
......@@ -1670,8 +1753,8 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
~m_386,
/* X86_TUNE_USE_SAHF */
m_ATOM | m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER1 | m_PENT4
| m_NOCONA | m_CORE2I7 | m_GENERIC,
m_ATOM | m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER1 | m_BTVER1
| m_PENT4 | m_NOCONA | m_CORE2I7 | m_GENERIC,
/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
partial dependencies. */
......@@ -1777,7 +1860,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
| m_AMDFAM10 | m_BDVER1,
/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
m_AMDFAM10 | m_BDVER1 | m_COREI7,
m_AMDFAM10 | m_BDVER1 | m_BTVER1 | m_COREI7,
/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
m_BDVER1 | m_COREI7,
......@@ -1855,11 +1938,11 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
vector path on AMD machines. */
m_K8 | m_CORE2I7_64 | m_GENERIC64 | m_AMDFAM10 | m_BDVER1,
m_K8 | m_CORE2I7_64 | m_GENERIC64 | m_AMDFAM10 | m_BDVER1 | m_BTVER1,
/* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
machines. */
m_K8 | m_CORE2I7_64 | m_GENERIC64 | m_AMDFAM10 | m_BDVER1,
m_K8 | m_CORE2I7_64 | m_GENERIC64 | m_AMDFAM10 | m_BDVER1 | m_BTVER1,
/* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
than a MOV. */
......@@ -2451,6 +2534,7 @@ static const struct ptt processor_target_table[PROCESSOR_max] =
{&generic64_cost, 16, 10, 16, 10, 16},
{&amdfam10_cost, 32, 24, 32, 7, 32},
{&bdver1_cost, 32, 24, 32, 7, 32},
{&btver1_cost, 32, 24, 32, 7, 32},
{&atom_cost, 16, 7, 16, 7, 16}
};
......@@ -2479,7 +2563,8 @@ static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
"athlon-4",
"k8",
"amdfam10",
"bdver1"
"bdver1",
"btver1"
};
/* Return true if a red-zone is in use. */
......@@ -3075,6 +3160,7 @@ software_prefetching_beneficial_p (void)
case PROCESSOR_ATHLON:
case PROCESSOR_K8:
case PROCESSOR_AMDFAM10:
case PROCESSOR_BTVER1:
return true;
default:
......@@ -3260,6 +3346,9 @@ ix86_option_override_internal (bool main_args_p)
| PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
| PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
| PTA_XOP | PTA_LWP},
{"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16},
{"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
0 /* flags are only used for -march switch. */ },
{"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
......@@ -22177,6 +22266,7 @@ ix86_issue_rate (void)
case PROCESSOR_GENERIC32:
case PROCESSOR_GENERIC64:
case PROCESSOR_BDVER1:
case PROCESSOR_BTVER1:
return 3;
default:
......@@ -22364,6 +22454,7 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
case PROCESSOR_K8:
case PROCESSOR_AMDFAM10:
case PROCESSOR_BDVER1:
case PROCESSOR_BTVER1:
case PROCESSOR_ATOM:
case PROCESSOR_GENERIC32:
case PROCESSOR_GENERIC64:
......@@ -251,6 +251,7 @@ extern const struct processor_costs ix86_size_cost;
#define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
#define TARGET_BDVER1 (ix86_tune == PROCESSOR_BDVER1)
#define TARGET_BTVER1 (ix86_tune == PROCESSOR_BTVER1)
#define TARGET_ATOM (ix86_tune == PROCESSOR_ATOM)
/* Feature tests against the various tunings. */
......@@ -600,6 +601,7 @@ enum target_cpu_default
TARGET_CPU_DEFAULT_k8,
TARGET_CPU_DEFAULT_amdfam10,
TARGET_CPU_DEFAULT_bdver1,
TARGET_CPU_DEFAULT_btver1,
TARGET_CPU_DEFAULT_max
};
......@@ -2060,6 +2062,7 @@ enum processor_type
PROCESSOR_GENERIC64,
PROCESSOR_AMDFAM10,
PROCESSOR_BDVER1,
PROCESSOR_BTVER1,
PROCESSOR_ATOM,
PROCESSOR_max
};
......
......@@ -360,7 +360,7 @@
;; Processor type.
(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,corei7,
atom,generic64,amdfam10,bdver1"
atom,generic64,amdfam10,bdver1,btver1"
(const (symbol_ref "ix86_schedule")))
;; A basic instruction type. Refinements due to arguments to be
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment