Commit 64766e8d by Jan Hubicka (2017-10-11)

config.gcc (i386, x86_64): Add extra objects.



	* config.gcc (i386, x86_64): Add extra objects.
	* i386/i386-protos.h (ix86_rip_relative_addr_p): Declare.
	(ix86_min_insn_size): Declare.
	(ix86_issue_rate): Declare.
	(ix86_adjust_cost): Declare.
	(ia32_multipass_dfa_lookahead): Declare.
	(ix86_macro_fusion_p): Declare.
	(ix86_macro_fusion_pair_p): Declare.
	(ix86_bd_has_dispatch): Declare.
	(ix86_bd_do_dispatch): Declare.
	(ix86_core2i7_init_hooks): Declare.
	(ix86_atom_sched_reorder): Declare.
	* i386/i386.c: Move all CPU cost tables to x86-tune-costs.h.
	(COSTS_N_BYTES): Move to x86-tune-costs.h.
	(DUMMY_STRINGOP_ALGS): Move to x86-tune-costs.h.
	(rip_relative_addr_p): Rename to ...
	(ix86_rip_relative_addr_p): ... this one; export.
	(memory_address_length): Update.
	(ix86_issue_rate): Move to x86-tune-sched.c.
	(ix86_flags_dependent): Move to x86-tune-sched.c.
	(ix86_agi_dependent): Move to x86-tune-sched.c.
	(exact_dependency_1): Move to x86-tune-sched.c.
	(exact_store_load_dependency): Move to x86-tune-sched.c.
	(ix86_adjust_cost): Move to x86-tune-sched.c.
	(ia32_multipass_dfa_lookahead): Move to x86-tune-sched.c.
	(ix86_macro_fusion_p): Move to x86-tune-sched.c.
	(ix86_macro_fusion_pair_p): Move to x86-tune-sched.c.
	(do_reorder_for_imul): Move to x86-tune-sched-atom.c.
	(swap_top_of_ready_list): Move to x86-tune-sched-atom.c.
	(ix86_sched_reorder): Move to x86-tune-sched-atom.c.
	(core2i7_first_cycle_multipass_init): Move to x86-tune-sched-core.c.
	(core2i7_dfa_post_advance_cycle): Move to x86-tune-sched-core.c.
	(min_insn_size): Rename to ...
	(ix86_min_insn_size): ... this one; export.
	(core2i7_first_cycle_multipass_begin): Move to x86-tune-sched-core.c.
	(core2i7_first_cycle_multipass_issue): Move to x86-tune-sched-core.c.
	(core2i7_first_cycle_multipass_backtrack): Move to x86-tune-sched-core.c.
	(core2i7_first_cycle_multipass_end): Move to x86-tune-sched-core.c.
	(core2i7_first_cycle_multipass_fini): Move to x86-tune-sched-core.c.
	(ix86_sched_init_global): Break up logic to ix86_core2i7_init_hooks.
	(ix86_avoid_jump_mispredicts): Update.
	(TARGET_SCHED_DISPATCH): Move to x86-tune-sched-bd.c.
	(TARGET_SCHED_DISPATCH_DO): Move to x86-tune-sched-bd.c.
	(TARGET_SCHED_REORDER): Move to x86-tune-sched-bd.c.
	(DISPATCH_WINDOW_SIZE): Move to x86-tune-sched-bd.c.
	(MAX_DISPATCH_WINDOWS): Move to x86-tune-sched-bd.c.
	(MAX_INSN): Move to x86-tune-sched-bd.c.
	(MAX_IMM): Move to x86-tune-sched-bd.c.
	(MAX_IMM_SIZE): Move to x86-tune-sched-bd.c.
	(MAX_IMM_32): Move to x86-tune-sched-bd.c.
	(MAX_IMM_64): Move to x86-tune-sched-bd.c.
	(MAX_LOAD): Move to x86-tune-sched-bd.c.
	(MAX_STORE): Move to x86-tune-sched-bd.c.
	(BIG): Move to x86-tune-sched-bd.c.
	(enum dispatch_group): Move to x86-tune-sched-bd.c.
	(enum insn_path): Move to x86-tune-sched-bd.c.
	(get_mem_group): Move to x86-tune-sched-bd.c.
	(is_cmp): Move to x86-tune-sched-bd.c.
	(dispatch_violation): Move to x86-tune-sched-bd.c.
	(is_branch): Move to x86-tune-sched-bd.c.
	(is_prefetch): Move to x86-tune-sched-bd.c.
	(init_window): Move to x86-tune-sched-bd.c.
	(allocate_window): Move to x86-tune-sched-bd.c.
	(init_dispatch_sched): Move to x86-tune-sched-bd.c.
	(is_end_basic_block): Move to x86-tune-sched-bd.c.
	(process_end_window): Move to x86-tune-sched-bd.c.
	(allocate_next_window): Move to x86-tune-sched-bd.c.
	(find_constant): Move to x86-tune-sched-bd.c.
	(get_num_immediates): Move to x86-tune-sched-bd.c.
	(has_immediate): Move to x86-tune-sched-bd.c.
	(get_insn_path): Move to x86-tune-sched-bd.c.
	(get_insn_group): Move to x86-tune-sched-bd.c.
	(count_num_restricted): Move to x86-tune-sched-bd.c.
	(fits_dispatch_window): Move to x86-tune-sched-bd.c.
	(add_insn_window): Move to x86-tune-sched-bd.c.
	(add_to_dispatch_window): Move to x86-tune-sched-bd.c.
	(debug_dispatch_window_file): Move to x86-tune-sched-bd.c.
	(debug_dispatch_window): Move to x86-tune-sched-bd.c.
	(debug_insn_dispatch_info_file): Move to x86-tune-sched-bd.c.
	(debug_ready_dispatch): Move to x86-tune-sched-bd.c.
	(do_dispatch): Move to x86-tune-sched-bd.c.
	(has_dispatch): Move to x86-tune-sched-bd.c.
	* i386/t-i386: Add new object files.
	* i386/x86-tune-costs.h: New file.
	* i386/x86-tune-sched-atom.c: New file.
	* i386/x86-tune-sched-bd.c: New file.
	* i386/x86-tune-sched-core.c: New file.
	* i386/x86-tune-sched.c: New file.

From-SVN: r253646
config.gcc:
@@ -360,6 +360,7 @@ i[34567]86-*-*)
 	cpu_type=i386
 	c_target_objs="i386-c.o"
 	cxx_target_objs="i386-c.o"
+	extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o"
 	extra_options="${extra_options} fused-madd.opt"
 	extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
 		       pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h
@@ -384,6 +385,7 @@ x86_64-*-*)
 	c_target_objs="i386-c.o"
 	cxx_target_objs="i386-c.o"
 	extra_options="${extra_options} fused-madd.opt"
+	extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o"
 	extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h
 		       pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
i386/i386-protos.h:
@@ -27,6 +27,7 @@ extern bool ix86_handle_option (struct gcc_options *opts,
 extern bool ix86_target_stack_probe (void);
 extern bool ix86_can_use_return_insn_p (void);
 extern void ix86_setup_frame_addresses (void);
+extern bool ix86_rip_relative_addr_p (struct ix86_address *parts);
 extern HOST_WIDE_INT ix86_initial_elimination_offset (int, int);
 extern void ix86_expand_prologue (void);
@@ -314,6 +315,21 @@ extern enum attr_cpu ix86_schedule;
 extern const char * ix86_output_call_insn (rtx_insn *insn, rtx call_op);
 extern bool ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
 						 machine_mode mode);
+extern int ix86_min_insn_size (rtx_insn *);
+extern int ix86_issue_rate (void);
+extern int ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn,
+			     int cost, unsigned int);
+extern int ia32_multipass_dfa_lookahead (void);
+extern bool ix86_macro_fusion_p (void);
+extern bool ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp);
+extern bool ix86_bd_has_dispatch (rtx_insn *insn, int action);
+extern void ix86_bd_do_dispatch (rtx_insn *insn, int mode);
+extern void ix86_core2i7_init_hooks (void);
+extern int ix86_atom_sched_reorder (FILE *, int, rtx_insn **, int *, int);
 #ifdef RTX_CODE
 /* Target data for multipass lookahead scheduling.
i386/i386.c: source diff omitted (too large to display inline); see the ChangeLog entry above for the full list of moved and renamed functions.
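
The prototypes added to i386-protos.h above exist so that i386.c can keep its target-hook table while the hook bodies move into the new x86-tune-sched*.c files. A minimal sketch of that pairing, using the standard GCC target-macro wiring (illustrative, not verbatim patch text):

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p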
i386/t-i386:
@@ -24,6 +24,22 @@ i386-c.o: $(srcdir)/config/i386/i386-c.c
 	$(COMPILE) $<
 	$(POSTCOMPILE)
+
+x86-tune-sched.o: $(srcdir)/config/i386/x86-tune-sched.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+
+x86-tune-sched-bd.o: $(srcdir)/config/i386/x86-tune-sched-bd.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+
+x86-tune-sched-atom.o: $(srcdir)/config/i386/x86-tune-sched-atom.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
+
+x86-tune-sched-core.o: $(srcdir)/config/i386/x86-tune-sched-core.c
+	$(COMPILE) $<
+	$(POSTCOMPILE)
 i386.o: i386-builtin-types.inc
 i386-builtin-types.inc: s-i386-bt ; @true

i386/x86-tune-costs.h (new file):
/* Processor costs (relative to an add) */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
#define COSTS_N_BYTES(N) ((N) * 2)
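/* A worked example, using the assumption stated above that
   COSTS_N_INSNS (N) is (N) * 4: COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1),
   so a 2-byte add counts as exactly one average instruction when tuning
   for size, and a 3-byte insn (COSTS_N_BYTES (3) == 6) as 1.5.  */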
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
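/* How to read the string-operation tables below: each *_memcpy/*_memset
   array holds two stringop_algs descriptors, [0] for 32-bit and [1] for
   64-bit code; DUMMY_STRINGOP_ALGS fills the 64-bit slot for CPUs tuned
   only in 32-bit mode.  A descriptor pairs a fallback algorithm for
   blocks of unknown size with {max, alg, noalign} entries tried in
   order.  A paraphrased sketch of the type (the authoritative
   definition lives in i386.h):

   struct stringop_algs
   {
     const enum stringop_alg unknown_size;
     const struct stringop_strategy {
       const int max;
       const enum stringop_alg alg;
       int noalign;
     } size [MAX_STRINGOP_ALGS];
   };  */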
static stringop_algs ix86_size_memcpy[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
static stringop_algs ix86_size_memset[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
const
struct processor_costs ix86_size_cost = {/* costs for tuning for size */
COSTS_N_BYTES (2), /* cost of an add instruction */
COSTS_N_BYTES (3), /* cost of a lea instruction */
COSTS_N_BYTES (2), /* variable shift costs */
COSTS_N_BYTES (3), /* constant shift costs */
{COSTS_N_BYTES (3), /* cost of starting multiply for QI */
COSTS_N_BYTES (3), /* HI */
COSTS_N_BYTES (3), /* SI */
COSTS_N_BYTES (3), /* DI */
COSTS_N_BYTES (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
COSTS_N_BYTES (3), /* HI */
COSTS_N_BYTES (3), /* SI */
COSTS_N_BYTES (3), /* DI */
COSTS_N_BYTES (5)}, /* other */
COSTS_N_BYTES (3), /* cost of movsx */
COSTS_N_BYTES (3), /* cost of movzx */
0, /* "large" insn */
2, /* MOVE_RATIO */
2, /* cost for loading QImode using movzbl */
{2, 2, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 2, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 2}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{2, 2, 2}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
3, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{3, 3}, /* cost of storing MMX registers
in SImode and DImode */
3, /* cost of moving SSE register */
{3, 3, 3}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{3, 3, 3}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
0, /* size of l1 cache */
0, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
COSTS_N_BYTES (2), /* cost of FMUL instruction. */
COSTS_N_BYTES (2), /* cost of FDIV instruction. */
COSTS_N_BYTES (2), /* cost of FABS instruction. */
COSTS_N_BYTES (2), /* cost of FCHS instruction. */
COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
ix86_size_memcpy,
ix86_size_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
1, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
1, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
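/* An assumption for orientation, not part of this patch: i386.c reaches
   the active table through cost pointers such as ix86_cost, so once -Os
   selects ix86_size_cost, the add cost above is read as ix86_cost->add.  */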
/* Processor costs (relative to an add) */
static stringop_algs i386_memcpy[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs i386_memset[2] = {
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs i386_cost = { /* 386 specific costs */
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (3), /* variable shift costs */
COSTS_N_INSNS (2), /* constant shift costs */
{COSTS_N_INSNS (6), /* cost of starting multiply for QI */
COSTS_N_INSNS (6), /* HI */
COSTS_N_INSNS (6), /* SI */
COSTS_N_INSNS (6), /* DI */
COSTS_N_INSNS (6)}, /* other */
COSTS_N_INSNS (1), /* cost of multiply per each bit set */
{COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
COSTS_N_INSNS (23), /* HI */
COSTS_N_INSNS (23), /* SI */
COSTS_N_INSNS (23), /* DI */
COSTS_N_INSNS (23)}, /* other */
COSTS_N_INSNS (3), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
15, /* "large" insn */
3, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{8, 8, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{8, 8, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 8}, /* cost of loading MMX registers
in SImode and DImode */
{4, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 8, 16}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 8, 16}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
0, /* size of l1 cache */
0, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (27), /* cost of FMUL instruction. */
COSTS_N_INSNS (88), /* cost of FDIV instruction. */
COSTS_N_INSNS (22), /* cost of FABS instruction. */
COSTS_N_INSNS (24), /* cost of FCHS instruction. */
COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
i386_memcpy,
i386_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs i486_memcpy[2] = {
{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs i486_memset[2] = {
{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs i486_cost = { /* 486 specific costs */
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (3), /* variable shift costs */
COSTS_N_INSNS (2), /* constant shift costs */
{COSTS_N_INSNS (12), /* cost of starting multiply for QI */
COSTS_N_INSNS (12), /* HI */
COSTS_N_INSNS (12), /* SI */
COSTS_N_INSNS (12), /* DI */
COSTS_N_INSNS (12)}, /* other */
1, /* cost of multiply per each bit set */
{COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
COSTS_N_INSNS (40), /* HI */
COSTS_N_INSNS (40), /* SI */
COSTS_N_INSNS (40), /* DI */
COSTS_N_INSNS (40)}, /* other */
COSTS_N_INSNS (3), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
15, /* "large" insn */
3, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{8, 8, 8}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{8, 8, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 8}, /* cost of loading MMX registers
in SImode and DImode */
{4, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 8, 16}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 8, 16}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
4, /* size of l1 cache. 486 has 8kB cache
shared for code and data, so 4kB is
not really precise. */
4, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (16), /* cost of FMUL instruction. */
COSTS_N_INSNS (73), /* cost of FDIV instruction. */
COSTS_N_INSNS (3), /* cost of FABS instruction. */
COSTS_N_INSNS (3), /* cost of FCHS instruction. */
COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
i486_memcpy,
i486_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs pentium_memcpy[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs pentium_memset[2] = {
{libcall, {{-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs pentium_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (4), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (11), /* cost of starting multiply for QI */
COSTS_N_INSNS (11), /* HI */
COSTS_N_INSNS (11), /* SI */
COSTS_N_INSNS (11), /* DI */
COSTS_N_INSNS (11)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
COSTS_N_INSNS (25), /* HI */
COSTS_N_INSNS (25), /* SI */
COSTS_N_INSNS (25), /* DI */
COSTS_N_INSNS (25)}, /* other */
COSTS_N_INSNS (3), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
8, /* "large" insn */
6, /* MOVE_RATIO */
6, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
8, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 8, 16}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 8, 16}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
8, /* size of l1 cache. */
8, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (3), /* cost of FMUL instruction. */
COSTS_N_INSNS (39), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentium_memcpy,
pentium_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static const
struct processor_costs lakemont_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (11), /* cost of starting multiply for QI */
COSTS_N_INSNS (11), /* HI */
COSTS_N_INSNS (11), /* SI */
COSTS_N_INSNS (11), /* DI */
COSTS_N_INSNS (11)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
COSTS_N_INSNS (25), /* HI */
COSTS_N_INSNS (25), /* SI */
COSTS_N_INSNS (25), /* DI */
COSTS_N_INSNS (25)}, /* other */
COSTS_N_INSNS (3), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
6, /* cost for loading QImode using movzbl */
{2, 4, 2}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 4, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
8, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 8, 16}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 8, 16}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
8, /* size of l1 cache. */
8, /* size of l2 cache */
0, /* size of prefetch block */
0, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (3), /* cost of FMUL instruction. */
COSTS_N_INSNS (39), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentium_memcpy,
pentium_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
(we ensure the alignment). For small blocks inline loop is still a
noticeable win, for bigger blocks either rep movsl or rep movsb is
way to go. Rep movsb has apparently more expensive startup time in CPU,
but after 4K the difference is down in the noise. */
static stringop_algs pentiumpro_memcpy[2] = {
{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
{8192, rep_prefix_4_byte, false},
{-1, rep_prefix_1_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs pentiumpro_memset[2] = {
{rep_prefix_4_byte, {{1024, unrolled_loop, false},
{8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
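/* A worked lookup against pentiumpro_memcpy above (roughly how the
   expander consumes these entries): a known 500-byte 32-bit memcpy takes
   the first entry whose max covers the size, so 128 < 500 <= 1024 picks
   unrolled_loop; 5000 bytes falls in (1024, 8192] and uses rep movsl
   (rep_prefix_4_byte); max == -1 marks the unbounded catch-all.  */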
static const
struct processor_costs pentiumpro_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (4), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (4), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (4)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
COSTS_N_INSNS (17), /* HI */
COSTS_N_INSNS (17), /* SI */
COSTS_N_INSNS (17), /* DI */
COSTS_N_INSNS (17)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
6, /* MOVE_RATIO */
2, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 2, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{2, 2}, /* cost of loading MMX registers
in SImode and DImode */
{2, 2}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{2, 2, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{2, 2, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
8, /* size of l1 cache. */
256, /* size of l2 cache */
32, /* size of prefetch block */
6, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (5), /* cost of FMUL instruction. */
COSTS_N_INSNS (56), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentiumpro_memcpy,
pentiumpro_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs geode_memcpy[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs geode_memset[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs geode_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (2), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (7), /* SI */
COSTS_N_INSNS (7), /* DI */
COSTS_N_INSNS (7)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
COSTS_N_INSNS (23), /* HI */
COSTS_N_INSNS (39), /* SI */
COSTS_N_INSNS (39), /* DI */
COSTS_N_INSNS (39)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
4, /* MOVE_RATIO */
1, /* cost for loading QImode using movzbl */
{1, 1, 1}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{1, 1, 1}, /* cost of storing integer registers */
1, /* cost of reg,reg fld/fst */
{1, 1, 1}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 6, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{2, 2}, /* cost of loading MMX registers
in SImode and DImode */
{2, 2}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{2, 2, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{2, 2, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
64, /* size of l1 cache. */
128, /* size of l2 cache. */
32, /* size of prefetch block */
1, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (11), /* cost of FMUL instruction. */
COSTS_N_INSNS (47), /* cost of FDIV instruction. */
COSTS_N_INSNS (1), /* cost of FABS instruction. */
COSTS_N_INSNS (1), /* cost of FCHS instruction. */
COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
geode_memcpy,
geode_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs k6_memcpy[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs k6_memset[2] = {
{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs k6_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (3), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (3), /* DI */
COSTS_N_INSNS (3)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (18), /* HI */
COSTS_N_INSNS (18), /* SI */
COSTS_N_INSNS (18), /* DI */
COSTS_N_INSNS (18)}, /* other */
COSTS_N_INSNS (2), /* cost of movsx */
COSTS_N_INSNS (2), /* cost of movzx */
8, /* "large" insn */
4, /* MOVE_RATIO */
3, /* cost for loading QImode using movzbl */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 3, 2}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{6, 6, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 4}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{2, 2}, /* cost of loading MMX registers
in SImode and DImode */
{2, 2}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{2, 2, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{2, 2, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
6, /* MMX or SSE register to integer */
32, /* size of l1 cache. */
32, /* size of l2 cache. Some models
have integrated l2 cache, but
optimizing for k6 is not important
enough to worry about that. */
32, /* size of prefetch block */
1, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (2), /* cost of FMUL instruction. */
COSTS_N_INSNS (56), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
k6_memcpy,
k6_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* For some reason, Athlon deals better with REP prefix (relative to loops)
compared to K8. Alignment becomes important after 8 bytes for memcpy and
128 bytes for memset. */
static stringop_algs athlon_memcpy[2] = {
{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs athlon_memset[2] = {
{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs athlon_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (5), /* cost of starting multiply for QI */
COSTS_N_INSNS (5), /* HI */
COSTS_N_INSNS (5), /* SI */
COSTS_N_INSNS (5), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 4}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 6}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 5}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
64, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
5, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (24), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
athlon_memcpy,
athlon_memset,
1, /* scalar_stmt_cost. */
1, /* scalar load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* K8 has optimized REP instruction for medium sized blocks, but for very
small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
static stringop_algs k8_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs k8_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs k8_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 3, 6}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 5}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches */
3, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
k8_memcpy,
k8_memset,
4, /* scalar_stmt_cost. */
2, /* scalar load_cost. */
2, /* scalar_store_cost. */
5, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
2, /* vec_align_load_cost. */
3, /* vec_unalign_load_cost. */
3, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
2, /* cond_not_taken_branch_cost. */
};
/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall can
do nontemporary accesses and beat inline considerably. */
static stringop_algs amdfam10_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs amdfam10_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
struct processor_costs amdfam10_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 3}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 5}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
/* On K8:
MOVD reg64, xmmreg Double FSTORE 4
MOVD reg32, xmmreg Double FSTORE 4
On AMDFAM10:
MOVD reg64, xmmreg Double FADD 3
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
amdfam10_memcpy,
amdfam10_memset,
4, /* scalar_stmt_cost. */
2, /* scalar load_cost. */
2, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
2, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
2, /* vec_store_cost. */
2, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* BDVER1 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
static stringop_algs bdver1_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs bdver1_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs bdver1_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (4), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (4), /* SI */
COSTS_N_INSNS (6), /* DI */
COSTS_N_INSNS (6)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{5, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{5, 5, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 4}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 4}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 4}, /* cost of storing SSE registers
in SImode, DImode and TImode */
2, /* MMX or SSE register to integer */
/* On K8:
MOVD reg64, xmmreg Double FSTORE 4
MOVD reg32, xmmreg Double FSTORE 4
On AMDFAM10:
MOVD reg64, xmmreg Double FADD 3
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (6), /* cost of FMUL instruction. */
COSTS_N_INSNS (42), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
bdver1_memcpy,
bdver1_memset,
6, /* scalar_stmt_cost. */
4, /* scalar load_cost. */
4, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
4, /* vec_align_load_cost. */
4, /* vec_unalign_load_cost. */
4, /* vec_store_cost. */
4, /* cond_taken_branch_cost. */
2, /* cond_not_taken_branch_cost. */
};
/* BDVER2 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
static stringop_algs bdver2_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs bdver2_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs bdver2_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (4), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (4), /* SI */
COSTS_N_INSNS (6), /* DI */
COSTS_N_INSNS (6)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{5, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{5, 5, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 4}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 4}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 4}, /* cost of storing SSE registers
in SImode, DImode and TImode */
2, /* MMX or SSE register to integer */
/* On K8:
MOVD reg64, xmmreg Double FSTORE 4
MOVD reg32, xmmreg Double FSTORE 4
On AMDFAM10:
MOVD reg64, xmmreg Double FADD 3
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (6), /* cost of FMUL instruction. */
COSTS_N_INSNS (42), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
bdver2_memcpy,
bdver2_memset,
6, /* scalar_stmt_cost. */
4, /* scalar load_cost. */
4, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
4, /* vec_align_load_cost. */
4, /* vec_unalign_load_cost. */
4, /* vec_store_cost. */
4, /* cond_taken_branch_cost. */
2, /* cond_not_taken_branch_cost. */
};
/* BDVER3 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
static stringop_algs bdver3_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs bdver3_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
struct processor_costs bdver3_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (4), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (4), /* SI */
COSTS_N_INSNS (6), /* DI */
COSTS_N_INSNS (6)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{5, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{5, 5, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 4}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 4}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 4}, /* cost of storing SSE registers
in SImode, DImode and TImode */
2, /* MMX or SSE register to integer */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued. We set number of simultaneous prefetches
to a large constant to reflect this (it probably is not a good idea not
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (6), /* cost of FMUL instruction. */
COSTS_N_INSNS (42), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
bdver3_memcpy,
bdver3_memset,
6, /* scalar_stmt_cost. */
4, /* scalar load_cost. */
4, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
4, /* vec_align_load_cost. */
4, /* vec_unalign_load_cost. */
4, /* vec_store_cost. */
4, /* cond_taken_branch_cost. */
2, /* cond_not_taken_branch_cost. */
};
/* BDVER4 has optimized REP instruction for medium sized blocks, but for
very small blocks it is better to use loop. For large blocks, libcall
can do nontemporary accesses and beat inline considerably. */
static stringop_algs bdver4_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs bdver4_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
struct processor_costs bdver4_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (4), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (4), /* SI */
COSTS_N_INSNS (6), /* DI */
COSTS_N_INSNS (6)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{5, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{5, 5, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{4, 4}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 4}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 4}, /* cost of storing SSE registers
in SImode, DImode and TImode */
2, /* MMX or SSE register to integer */
16, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued.  We set the number of simultaneous prefetches
to a large constant to reflect this (it is probably not a good idea to
leave the number of prefetches entirely unlimited, as their execution also
takes some time).  */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (6), /* cost of FMUL instruction. */
COSTS_N_INSNS (42), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
bdver4_memcpy,
bdver4_memset,
6, /* scalar_stmt_cost. */
4, /* scalar_load_cost. */
4, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
4, /* vec_align_load_cost. */
4, /* vec_unalign_load_cost. */
4, /* vec_store_cost. */
4, /* cond_taken_branch_cost. */
2, /* cond_not_taken_branch_cost. */
};
/* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
very small blocks it is better to use a loop.  For large blocks, a libcall
can do nontemporal accesses and beat inline code considerably.  */
static stringop_algs znver1_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs znver1_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
struct processor_costs znver1_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction. */
COSTS_N_INSNS (1), /* cost of a lea instruction. */
COSTS_N_INSNS (1), /* variable shift costs. */
COSTS_N_INSNS (1), /* constant shift costs. */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
COSTS_N_INSNS (3), /* HI. */
COSTS_N_INSNS (3), /* SI. */
COSTS_N_INSNS (4), /* DI. */
COSTS_N_INSNS (4)}, /* other. */
0, /* cost of multiply per each bit
set. */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
COSTS_N_INSNS (35), /* HI. */
COSTS_N_INSNS (51), /* SI. */
COSTS_N_INSNS (83), /* DI. */
COSTS_N_INSNS (83)}, /* other. */
COSTS_N_INSNS (1), /* cost of movsx. */
COSTS_N_INSNS (1), /* cost of movzx. */
8, /* "large" insn. */
9, /* MOVE_RATIO. */
4, /* cost for loading QImode using
movzbl. */
{5, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer
registers. */
2, /* cost of reg,reg fld/fst. */
{5, 5, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode. */
{4, 4, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode. */
2, /* cost of moving MMX register. */
{4, 4}, /* cost of loading MMX registers
in SImode and DImode. */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode. */
2, /* cost of moving SSE register. */
{4, 4, 4}, /* cost of loading SSE registers
in SImode, DImode and TImode. */
{4, 4, 4}, /* cost of storing SSE registers
in SImode, DImode and TImode. */
2, /* MMX or SSE register to integer. */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block. */
/* New AMD processors never drop prefetches; if they cannot be performed
immediately, they are queued.  We set the number of simultaneous prefetches
to a large constant to reflect this (it is probably not a good idea to
leave the number of prefetches entirely unlimited, as their execution also
takes some time).  */
100, /* number of parallel prefetches. */
3, /* Branch cost. */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (6), /* cost of FMUL instruction. */
COSTS_N_INSNS (42), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
/* Zen can execute 4 integer operations per cycle.  FP operations take 3
cycles, and it can execute 2 integer additions and 2 multiplications, so
reassociation may make sense up to a width of 6.  SPEC2k6 benchmarks
suggest that 4 works better than 6, probably due to register pressure.
Integer vector operations are handled by the FP unit, which can execute
3 vector plus/minus operations per cycle but only one multiply.  This is
adjusted in ix86_reassociation_width.  */
4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
znver1_memcpy,
znver1_memset,
6, /* scalar_stmt_cost. */
4, /* scalar_load_cost. */
4, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
4, /* vec_align_load_cost. */
4, /* vec_unalign_load_cost. */
4, /* vec_store_cost. */
4, /* cond_taken_branch_cost. */
2, /* cond_not_taken_branch_cost. */
};
/* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
very small blocks it is better to use a loop.  For large blocks, a libcall
can do nontemporal accesses and beat inline code considerably.  */
static stringop_algs btver1_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs btver1_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs btver1_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 3}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 5}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
/* On K8:
MOVD reg64, xmmreg Double FSTORE 4
MOVD reg32, xmmreg Double FSTORE 4
On AMDFAM10:
MOVD reg64, xmmreg Double FADD 3
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
btver1_memcpy,
btver1_memset,
4, /* scalar_stmt_cost. */
2, /* scalar_load_cost. */
2, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
2, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
2, /* vec_store_cost. */
2, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs btver2_memcpy[2] = {
{libcall, {{6, loop, false}, {14, unrolled_loop, false},
{-1, rep_prefix_4_byte, false}}},
{libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs btver2_memset[2] = {
{libcall, {{8, loop, false}, {24, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
const struct processor_costs btver2_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (2), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (5)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
COSTS_N_INSNS (35), /* HI */
COSTS_N_INSNS (51), /* SI */
COSTS_N_INSNS (83), /* DI */
COSTS_N_INSNS (83)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{3, 4, 3}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{3, 4, 3}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{4, 4, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{3, 3}, /* cost of loading MMX registers
in SImode and DImode */
{4, 4}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{4, 4, 3}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{4, 4, 5}, /* cost of storing SSE registers
in SImode, DImode and TImode */
3, /* MMX or SSE register to integer */
/* On K8:
MOVD reg64, xmmreg Double FSTORE 4
MOVD reg32, xmmreg Double FSTORE 4
On AMDFAM10:
MOVD reg64, xmmreg Double FADD 3
1/1 1/1
MOVD reg32, xmmreg Double FADD 3
1/1 1/1 */
32, /* size of l1 cache. */
2048, /* size of l2 cache. */
64, /* size of prefetch block */
100, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
btver2_memcpy,
btver2_memset,
4, /* scalar_stmt_cost. */
2, /* scalar_load_cost. */
2, /* scalar_store_cost. */
6, /* vec_stmt_cost. */
0, /* vec_to_scalar_cost. */
2, /* scalar_to_vec_cost. */
2, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
2, /* vec_store_cost. */
2, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs pentium4_memcpy[2] = {
{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
DUMMY_STRINGOP_ALGS};
static stringop_algs pentium4_memset[2] = {
{libcall, {{6, loop_1_byte, false}, {48, loop, false},
{20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
DUMMY_STRINGOP_ALGS};
static const
struct processor_costs pentium4_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (3), /* cost of a lea instruction */
COSTS_N_INSNS (4), /* variable shift costs */
COSTS_N_INSNS (4), /* constant shift costs */
{COSTS_N_INSNS (15), /* cost of starting multiply for QI */
COSTS_N_INSNS (15), /* HI */
COSTS_N_INSNS (15), /* SI */
COSTS_N_INSNS (15), /* DI */
COSTS_N_INSNS (15)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
COSTS_N_INSNS (56), /* HI */
COSTS_N_INSNS (56), /* SI */
COSTS_N_INSNS (56), /* DI */
COSTS_N_INSNS (56)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
16, /* "large" insn */
6, /* MOVE_RATIO */
2, /* cost for loading QImode using movzbl */
{4, 5, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{2, 3, 2}, /* cost of storing integer registers */
2, /* cost of reg,reg fld/fst */
{2, 2, 6}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 6}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{2, 2}, /* cost of loading MMX registers
in SImode and DImode */
{2, 2}, /* cost of storing MMX registers
in SImode and DImode */
12, /* cost of moving SSE register */
{12, 12, 12}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{2, 2, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
10, /* MMX or SSE register to integer */
8, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
2, /* Branch cost */
COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (7), /* cost of FMUL instruction. */
COSTS_N_INSNS (43), /* cost of FDIV instruction. */
COSTS_N_INSNS (2), /* cost of FABS instruction. */
COSTS_N_INSNS (2), /* cost of FCHS instruction. */
COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
pentium4_memcpy,
pentium4_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs nocona_memcpy[2] = {
{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
{100000, unrolled_loop, false}, {-1, libcall, false}}}};
static stringop_algs nocona_memset[2] = {
{libcall, {{6, loop_1_byte, false}, {48, loop, false},
{20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {64, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs nocona_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1), /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (10), /* cost of starting multiply for QI */
COSTS_N_INSNS (10), /* HI */
COSTS_N_INSNS (10), /* SI */
COSTS_N_INSNS (10), /* DI */
COSTS_N_INSNS (10)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
COSTS_N_INSNS (66), /* HI */
COSTS_N_INSNS (66), /* SI */
COSTS_N_INSNS (66), /* DI */
COSTS_N_INSNS (66)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
16, /* "large" insn */
17, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
3, /* cost of reg,reg fld/fst */
{12, 12, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 4}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
6, /* cost of moving MMX register */
{12, 12}, /* cost of loading MMX registers
in SImode and DImode */
{12, 12}, /* cost of storing MMX registers
in SImode and DImode */
6, /* cost of moving SSE register */
{12, 12, 12}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{12, 12, 12}, /* cost of storing SSE registers
in SImode, DImode and TImode */
8, /* MMX or SSE register to integer */
8, /* size of l1 cache. */
1024, /* size of l2 cache. */
64, /* size of prefetch block */
8, /* number of parallel prefetches */
1, /* Branch cost */
COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (8), /* cost of FMUL instruction. */
COSTS_N_INSNS (40), /* cost of FDIV instruction. */
COSTS_N_INSNS (3), /* cost of FABS instruction. */
COSTS_N_INSNS (3), /* cost of FCHS instruction. */
COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
nocona_memcpy,
nocona_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs atom_memcpy[2] = {
{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static stringop_algs atom_memset[2] = {
{libcall, {{8, loop, false}, {15, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {32, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs atom_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (2)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{8, 8, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{8, 8, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
3, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (8), /* cost of FMUL instruction. */
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
atom_memcpy,
atom_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs slm_memcpy[2] = {
{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static stringop_algs slm_memset[2] = {
{libcall, {{8, loop, false}, {15, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {32, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs slm_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (3), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (2)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{8, 8, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{8, 8, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
3, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (8), /* cost of FMUL instruction. */
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
slm_memcpy,
slm_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
4, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
static stringop_algs intel_memcpy[2] = {
{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
{libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static stringop_algs intel_memset[2] = {
{libcall, {{8, loop, false}, {15, unrolled_loop, false},
{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
{libcall, {{24, loop, false}, {32, unrolled_loop, false},
{8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
static const
struct processor_costs intel_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (3), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (2)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{8, 8, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{8, 8, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
32, /* size of l1 cache. */
256, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
3, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (8), /* cost of FMUL instruction. */
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
intel_memcpy,
intel_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
4, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* Generic should produce code tuned for Core-i7 (and newer chips)
and btver1 (and newer chips). */
static stringop_algs generic_memcpy[2] = {
{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
{libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
static stringop_algs generic_memset[2] = {
{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
{-1, libcall, false}}},
{libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
{-1, libcall, false}}}};
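/* How such tables are read (a sketch; the authoritative selection logic
is decide_alg in i386.c): each table provides a 32-bit and a 64-bit
variant, the leading algorithm applies when the block size is unknown at
compile time, and each {max, alg, noalign} entry covers known sizes up
to MAX bytes, with -1 meaning any larger size.  With generic_memcpy
above, for example, a known 100-byte memcpy in 64-bit code picks
rep_prefix_8_byte (32 < 100 <= 8192), while an unknown-size copy falls
back to libcall.  */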
static const
struct processor_costs generic_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
/* On all chips taken into consideration, lea is 2 cycles or more.  With
a higher cost, however, our current implementation of synth_mult results
in the use of unnecessary temporary registers, causing regressions on
several SPECfp benchmarks.  */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (2)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{8, 8, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{8, 8, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
32, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
/* Benchmarks show large regressions on the K8 sixtrack benchmark when this
value is increased to the perhaps more appropriate value of 5.  */
3, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (8), /* cost of FMUL instruction. */
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
generic_memcpy,
generic_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* core_cost should produce code tuned for the Core family of CPUs. */
static stringop_algs core_memcpy[2] = {
{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
{libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
{-1, libcall, false}}}};
static stringop_algs core_memset[2] = {
{libcall, {{6, loop_1_byte, true},
{24, loop, true},
{8192, rep_prefix_4_byte, true},
{-1, libcall, false}}},
{libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
{-1, libcall, false}}}};
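/* Unlike the preceding tables, the core tables set the third (noalign)
field to true for most entries, presumably because on Core-class chips
the rep prefix handles misaligned blocks well enough that the usual
alignment prologue is not worth its size.  */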
static const
struct processor_costs core_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
/* On all chips taken into consideration, lea is 2 cycles or more.  With
a higher cost, however, our current implementation of synth_mult results
in the use of unnecessary temporary registers, causing regressions on
several SPECfp benchmarks.  */
COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
COSTS_N_INSNS (1), /* variable shift costs */
COSTS_N_INSNS (1), /* constant shift costs */
{COSTS_N_INSNS (3), /* cost of starting multiply for QI */
COSTS_N_INSNS (4), /* HI */
COSTS_N_INSNS (3), /* SI */
COSTS_N_INSNS (4), /* DI */
COSTS_N_INSNS (2)}, /* other */
0, /* cost of multiply per each bit set */
{COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
COSTS_N_INSNS (26), /* HI */
COSTS_N_INSNS (42), /* SI */
COSTS_N_INSNS (74), /* DI */
COSTS_N_INSNS (74)}, /* other */
COSTS_N_INSNS (1), /* cost of movsx */
COSTS_N_INSNS (1), /* cost of movzx */
8, /* "large" insn */
17, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
{4, 4, 4}, /* cost of loading integer registers
in QImode, HImode and SImode.
Relative to reg-reg move (2). */
{4, 4, 4}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{12, 12, 12}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{6, 6, 8}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
2, /* cost of moving MMX register */
{8, 8}, /* cost of loading MMX registers
in SImode and DImode */
{8, 8}, /* cost of storing MMX registers
in SImode and DImode */
2, /* cost of moving SSE register */
{8, 8, 8}, /* cost of loading SSE registers
in SImode, DImode and TImode */
{8, 8, 8}, /* cost of storing SSE registers
in SImode, DImode and TImode */
5, /* MMX or SSE register to integer */
64, /* size of l1 cache. */
512, /* size of l2 cache. */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
/* FIXME perhaps more appropriate value is 5. */
3, /* Branch cost */
COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (8), /* cost of FMUL instruction. */
COSTS_N_INSNS (20), /* cost of FDIV instruction. */
COSTS_N_INSNS (8), /* cost of FABS instruction. */
COSTS_N_INSNS (8), /* cost of FCHS instruction. */
COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
core_memcpy,
core_memset,
1, /* scalar_stmt_cost. */
1, /* scalar_load_cost. */
1, /* scalar_store_cost. */
1, /* vec_stmt_cost. */
1, /* vec_to_scalar_cost. */
1, /* scalar_to_vec_cost. */
1, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
1, /* vec_store_cost. */
3, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
/* Scheduler hooks for IA-32 which implement atom+ specific logic.
Copyright (C) 1988-2017 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "recog.h"
#include "target.h"
#include "rtl-iter.h"
#include "regset.h"
#include "sched-int.h"
/* Try to reorder the ready list to take advantage of Atom pipelined IMUL
execution.  It is applied if
(1) an IMUL instruction is on the top of the list;
(2) there is exactly one producer of an independent IMUL instruction in
the ready list.
Return the index of the IMUL producer if it was found, and -1 otherwise.  */
static int
do_reorder_for_imul (rtx_insn **ready, int n_ready)
{
rtx_insn *insn;
rtx set, insn1, insn2;
sd_iterator_def sd_it;
dep_t dep;
int index = -1;
int i;
if (!TARGET_BONNELL)
return index;
/* Check that IMUL instruction is on the top of ready list. */
insn = ready[n_ready - 1];
set = single_set (insn);
if (!set)
return index;
if (!(GET_CODE (SET_SRC (set)) == MULT
&& GET_MODE (SET_SRC (set)) == SImode))
return index;
/* Search for producer of independent IMUL instruction. */
for (i = n_ready - 2; i >= 0; i--)
{
insn = ready[i];
if (!NONDEBUG_INSN_P (insn))
continue;
/* Skip IMUL instruction. */
insn2 = PATTERN (insn);
if (GET_CODE (insn2) == PARALLEL)
insn2 = XVECEXP (insn2, 0, 0);
if (GET_CODE (insn2) == SET
&& GET_CODE (SET_SRC (insn2)) == MULT
&& GET_MODE (SET_SRC (insn2)) == SImode)
continue;
FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
{
rtx con;
con = DEP_CON (dep);
if (!NONDEBUG_INSN_P (con))
continue;
insn1 = PATTERN (con);
if (GET_CODE (insn1) == PARALLEL)
insn1 = XVECEXP (insn1, 0, 0);
if (GET_CODE (insn1) == SET
&& GET_CODE (SET_SRC (insn1)) == MULT
&& GET_MODE (SET_SRC (insn1)) == SImode)
{
sd_iterator_def sd_it1;
dep_t dep1;
/* Check that the IMUL consumer has no producer other than INSN.  */
index = i;
FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
{
rtx pro;
pro = DEP_PRO (dep1);
if (!NONDEBUG_INSN_P (pro))
continue;
if (pro != insn)
index = -1;
}
if (index >= 0)
break;
}
}
if (index >= 0)
break;
}
return index;
}
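/* Worked example (illustration only): given a ready list { A, P, IMUL }
whose top insn is an SImode multiply, where P is not itself a multiply
and one of P's consumers is another SImode multiply whose sole producer
is P, the function returns the index of P.  The caller
(ix86_atom_sched_reorder below) then rotates P to the top so the
dependent multiply becomes ready as early as possible.  */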
/* Try to find the best candidate for the top of the ready list if two
insns have the same priority: the candidate whose dependees were
scheduled earlier is better.  Applied for Silvermont only.
Return true if the top 2 insns must be interchanged.  */
static bool
swap_top_of_ready_list (rtx_insn **ready, int n_ready)
{
rtx_insn *top = ready[n_ready - 1];
rtx_insn *next = ready[n_ready - 2];
rtx set;
sd_iterator_def sd_it;
dep_t dep;
int clock1 = -1;
int clock2 = -1;
#define INSN_TICK(INSN) (HID (INSN)->tick)
if (!TARGET_SILVERMONT && !TARGET_INTEL)
return false;
if (!NONDEBUG_INSN_P (top))
return false;
if (!NONJUMP_INSN_P (top))
return false;
if (!NONDEBUG_INSN_P (next))
return false;
if (!NONJUMP_INSN_P (next))
return false;
set = single_set (top);
if (!set)
return false;
set = single_set (next);
if (!set)
return false;
if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
{
if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
return false;
/* Determine the winner more precisely.  */
FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
{
rtx pro;
pro = DEP_PRO (dep);
if (!NONDEBUG_INSN_P (pro))
continue;
if (INSN_TICK (pro) > clock1)
clock1 = INSN_TICK (pro);
}
FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
{
rtx pro;
pro = DEP_PRO (dep);
if (!NONDEBUG_INSN_P (pro))
continue;
if (INSN_TICK (pro) > clock2)
clock2 = INSN_TICK (pro);
}
if (clock1 == clock2)
{
/* Determine winner - load must win. */
enum attr_memory memory1, memory2;
memory1 = get_attr_memory (top);
memory2 = get_attr_memory (next);
if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
return true;
}
return (bool) (clock2 < clock1);
}
return false;
#undef INSN_TICK
}
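/* Worked example (illustration only): if TOP and NEXT have equal
priority, the latest producer of TOP was issued at tick 5 and the latest
producer of NEXT at tick 3, then clock2 < clock1 and the two are
swapped, since NEXT's inputs were ready earlier.  On a tick tie, a load
in the second slot wins the swap.  */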
/* Perform possible reordering of the ready list for Atom/Silvermont only.
Return the issue rate.  */
int
ix86_atom_sched_reorder (FILE *dump, int sched_verbose, rtx_insn **ready,
int *pn_ready, int clock_var)
{
int issue_rate = -1;
int n_ready = *pn_ready;
int i;
rtx_insn *insn;
int index = -1;
/* Set up issue rate. */
issue_rate = ix86_issue_rate ();
/* Do reordering for BONNELL/SILVERMONT only. */
if (!TARGET_BONNELL && !TARGET_SILVERMONT && !TARGET_INTEL)
return issue_rate;
/* Nothing to do if ready list contains only 1 instruction. */
if (n_ready <= 1)
return issue_rate;
/* Do reordering for the post-reload scheduler only. */
if (!reload_completed)
return issue_rate;
if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
{
if (sched_verbose > 1)
fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
INSN_UID (ready[index]));
/* Put IMUL producer (ready[index]) at the top of ready list. */
insn = ready[index];
for (i = index; i < n_ready - 1; i++)
ready[i] = ready[i + 1];
ready[n_ready - 1] = insn;
return issue_rate;
}
/* Skip selective scheduling since HID is not populated in it. */
if (clock_var != 0
&& !sel_sched_p ()
&& swap_top_of_ready_list (ready, n_ready))
{
if (sched_verbose > 1)
fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
/* Swap 2 top elements of ready list. */
insn = ready[n_ready - 1];
ready[n_ready - 1] = ready[n_ready - 2];
ready[n_ready - 2] = insn;
}
return issue_rate;
}
/* Scheduler hooks for IA-32 which implement bdver1-4 specific logic.
Copyright (C) 1988-2017 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "recog.h"
#include "target.h"
#include "rtl-iter.h"
#include "regset.h"
#include "sched-int.h"
/* The size of the dispatch window is the total number of bytes of
object code allowed in a window. */
#define DISPATCH_WINDOW_SIZE 16
/* Number of dispatch windows considered for scheduling. */
#define MAX_DISPATCH_WINDOWS 3
/* Maximum number of instructions in a window. */
#define MAX_INSN 4
/* Maximum number of immediate operands in a window. */
#define MAX_IMM 4
/* Maximum number of immediate bits allowed in a window. */
#define MAX_IMM_SIZE 128
/* Maximum number of 32 bit immediates allowed in a window. */
#define MAX_IMM_32 4
/* Maximum number of 64 bit immediates allowed in a window. */
#define MAX_IMM_64 2
/* Maximum total of loads or prefetches allowed in a window. */
#define MAX_LOAD 2
/* Maximum total of stores allowed in a window. */
#define MAX_STORE 1
#undef BIG
#define BIG 100
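/* Read together (an illustrative summary of the limits above): one
window may hold at most 4 instructions totalling 16 bytes of object
code, with at most 2 loads or prefetches and 1 store, and at most 4
immediate operands of which no more than 4 may be 32 bit and no more
than 2 may be 64 bit.  */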
/* Dispatch groups.  Instructions that affect the mix in a dispatch window. */
enum dispatch_group {
disp_no_group = 0,
disp_load,
disp_store,
disp_load_store,
disp_prefetch,
disp_imm,
disp_imm_32,
disp_imm_64,
disp_branch,
disp_cmp,
disp_jcc,
disp_last
};
/* Number of allowable groups in a dispatch window.  It is an array
indexed by the dispatch_group enum.  100 is used as a big number
because the count of these kinds of operations has no effect on the
dispatch window, but we need them for other reasons in the table.  */
static unsigned int num_allowable_groups[disp_last] = {
0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
};
char group_name[disp_last + 1][16] = {
"disp_no_group", "disp_load", "disp_store", "disp_load_store",
"disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
"disp_branch", "disp_cmp", "disp_jcc", "disp_last"
};
/* Instruction path. */
enum insn_path {
no_path = 0,
path_single, /* Single micro op. */
path_double, /* Double micro op. */
path_multi, /* Instructions with more than 2 micro ops. */
last_path
};
/* sched_insn_info defines a window to the instructions scheduled in
the basic block. It contains a pointer to the insn_info table and
the instruction scheduled.
Windows are allocated for each basic block and are linked
together. */
typedef struct sched_insn_info_s {
rtx insn;
enum dispatch_group group;
enum insn_path path;
int byte_len;
int imm_bytes;
} sched_insn_info;
/* Linked list of dispatch windows.  This is a two-way list of
dispatch windows of a basic block.  It contains information about
the number of uops in the window and the total number of
instructions and of bytes in the object code for this dispatch
window.  */
typedef struct dispatch_windows_s {
int num_insn; /* Number of insns in the window. */
int num_uops; /* Number of uops in the window. */
int window_size; /* Number of bytes in the window. */
int window_num; /* Window number, 0 or 1. */
int num_imm; /* Number of immediates in the window. */
int num_imm_32; /* Number of 32 bit immediates in the window. */
int num_imm_64; /* Number of 64 bit immediates in the window. */
int imm_size; /* Total size in bytes of immediates in the window. */
int num_loads; /* Total memory loads in the window. */
int num_stores; /* Total memory stores in the window. */
int violation; /* Violation exists in window. */
sched_insn_info *window; /* Pointer to the window. */
struct dispatch_windows_s *next;
struct dispatch_windows_s *prev;
} dispatch_windows;
/* Immediate values used in an insn. */
typedef struct imm_info_s
{
int imm;
int imm32;
int imm64;
} imm_info;
static dispatch_windows *dispatch_window_list;
static dispatch_windows *dispatch_window_list1;
/* Get dispatch group of insn. */
static enum dispatch_group
get_mem_group (rtx_insn *insn)
{
enum attr_memory memory;
if (INSN_CODE (insn) < 0)
return disp_no_group;
memory = get_attr_memory (insn);
if (memory == MEMORY_STORE)
return disp_store;
if (memory == MEMORY_LOAD)
return disp_load;
if (memory == MEMORY_BOTH)
return disp_load_store;
return disp_no_group;
}
/* Return true if insn is a compare instruction. */
static bool
is_cmp (rtx_insn *insn)
{
enum attr_type type;
type = get_attr_type (insn);
return (type == TYPE_TEST
|| type == TYPE_ICMP
|| type == TYPE_FCMP
|| GET_CODE (PATTERN (insn)) == COMPARE);
}
/* Return true if a dispatch violation was encountered. */
static bool
dispatch_violation (void)
{
if (dispatch_window_list->next)
return dispatch_window_list->next->violation;
return dispatch_window_list->violation;
}
/* Return true if insn is a branch instruction. */
static bool
is_branch (rtx_insn *insn)
{
return (CALL_P (insn) || JUMP_P (insn));
}
/* Return true if insn is a prefetch instruction. */
static bool
is_prefetch (rtx_insn *insn)
{
return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
}
/* This function initializes a dispatch window and the list container holding a
pointer to the window. */
static void
init_window (int window_num)
{
int i;
dispatch_windows *new_list;
if (window_num == 0)
new_list = dispatch_window_list;
else
new_list = dispatch_window_list1;
new_list->num_insn = 0;
new_list->num_uops = 0;
new_list->window_size = 0;
new_list->next = NULL;
new_list->prev = NULL;
new_list->window_num = window_num;
new_list->num_imm = 0;
new_list->num_imm_32 = 0;
new_list->num_imm_64 = 0;
new_list->imm_size = 0;
new_list->num_loads = 0;
new_list->num_stores = 0;
new_list->violation = false;
for (i = 0; i < MAX_INSN; i++)
{
new_list->window[i].insn = NULL;
new_list->window[i].group = disp_no_group;
new_list->window[i].path = no_path;
new_list->window[i].byte_len = 0;
new_list->window[i].imm_bytes = 0;
}
return;
}
/* This function allocates and initializes a dispatch window and the
list container holding a pointer to the window. */
static dispatch_windows *
allocate_window (void)
{
dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
return new_list;
}
/* This routine initializes the dispatch scheduling information. It
initiates building dispatch scheduler tables and constructs the
first dispatch window. */
static void
init_dispatch_sched (void)
{
/* Allocate a dispatch list and a window. */
dispatch_window_list = allocate_window ();
dispatch_window_list1 = allocate_window ();
init_window (0);
init_window (1);
}
/* This function returns true if a branch is detected. End of a basic block
does not have to be a branch, but here we assume only branches end a
window. */
static bool
is_end_basic_block (enum dispatch_group group)
{
return group == disp_branch;
}
/* This function is called when the end of a window processing is reached. */
static void
process_end_window (void)
{
gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
if (dispatch_window_list->next)
{
gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
gcc_assert (dispatch_window_list->window_size
+ dispatch_window_list1->window_size <= 48);
init_window (1);
}
init_window (0);
}
/* Allocates a new dispatch window and adds it to WINDOW_LIST.
WINDOW_NUM is either 0 or 1.  A maximum of two windows are generated
for 48 bytes of instructions.  Note that these windows are not dispatch
windows whose sizes are DISPATCH_WINDOW_SIZE.  */
static dispatch_windows *
allocate_next_window (int window_num)
{
if (window_num == 0)
{
if (dispatch_window_list->next)
init_window (1);
init_window (0);
return dispatch_window_list;
}
dispatch_window_list->next = dispatch_window_list1;
dispatch_window_list1->prev = dispatch_window_list;
return dispatch_window_list1;
}
/* Compute number of immediate operands of an instruction. */
static void
find_constant (rtx in_rtx, imm_info *imm_values)
{
if (INSN_P (in_rtx))
in_rtx = PATTERN (in_rtx);
subrtx_iterator::array_type array;
FOR_EACH_SUBRTX (iter, array, in_rtx, ALL)
if (const_rtx x = *iter)
switch (GET_CODE (x))
{
case CONST:
case SYMBOL_REF:
case CONST_INT:
(imm_values->imm)++;
if (x86_64_immediate_operand (CONST_CAST_RTX (x), SImode))
(imm_values->imm32)++;
else
(imm_values->imm64)++;
break;
case CONST_DOUBLE:
case CONST_WIDE_INT:
(imm_values->imm)++;
(imm_values->imm64)++;
break;
case CODE_LABEL:
if (LABEL_KIND (x) == LABEL_NORMAL)
{
(imm_values->imm)++;
(imm_values->imm32)++;
}
break;
default:
break;
}
}
/* Return the total size of the immediate operands of an instruction
along with the number of corresponding immediate operands.  It
initializes its parameters to zero before calling FIND_CONSTANT.
INSN is the input instruction. IMM is the total of immediates.
IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
bit immediates. */
static int
get_num_immediates (rtx_insn *insn, int *imm, int *imm32, int *imm64)
{
imm_info imm_values = {0, 0, 0};
find_constant (insn, &imm_values);
*imm = imm_values.imm;
*imm32 = imm_values.imm32;
*imm64 = imm_values.imm64;
return imm_values.imm32 * 4 + imm_values.imm64 * 8;
}
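/* Worked example (illustration only): for an insn such as
movabsq $0x1122334455667788, %rax
find_constant counts one immediate that is not a valid SImode
immediate, so *imm = 1, *imm32 = 0 and *imm64 = 1, and the returned
size is 0 * 4 + 1 * 8 = 8 bytes.  */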
/* This function indicates whether an instruction has any immediate
operand.  */
static bool
has_immediate (rtx_insn *insn)
{
int num_imm_operand;
int num_imm32_operand;
int num_imm64_operand;
if (insn)
return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
&num_imm64_operand);
return false;
}
/* Return the uop path of an instruction (single, double or multi).  */
static enum insn_path
get_insn_path (rtx_insn *insn)
{
enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
if ((int)path == 0)
return path_single;
if ((int)path == 1)
return path_double;
return path_multi;
}
/* Return insn dispatch group. */
static enum dispatch_group
get_insn_group (rtx_insn *insn)
{
enum dispatch_group group = get_mem_group (insn);
if (group)
return group;
if (is_branch (insn))
return disp_branch;
if (is_cmp (insn))
return disp_cmp;
if (has_immediate (insn))
return disp_imm;
if (is_prefetch (insn))
return disp_prefetch;
return disp_no_group;
}
/* Count number of GROUP restricted instructions in a dispatch
window WINDOW_LIST. */
static int
count_num_restricted (rtx_insn *insn, dispatch_windows *window_list)
{
enum dispatch_group group = get_insn_group (insn);
int imm_size;
int num_imm_operand;
int num_imm32_operand;
int num_imm64_operand;
if (group == disp_no_group)
return 0;
if (group == disp_imm)
{
imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
&num_imm64_operand);
if (window_list->imm_size + imm_size > MAX_IMM_SIZE
|| num_imm_operand + window_list->num_imm > MAX_IMM
|| (num_imm32_operand > 0
&& (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
|| window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
|| (num_imm64_operand > 0
&& (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
|| window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
|| (window_list->imm_size + imm_size == MAX_IMM_SIZE
&& num_imm64_operand > 0
&& ((window_list->num_imm_64 > 0
&& window_list->num_insn >= 2)
|| window_list->num_insn >= 3)))
return BIG;
return 1;
}
if ((group == disp_load_store
&& (window_list->num_loads >= MAX_LOAD
|| window_list->num_stores >= MAX_STORE))
|| ((group == disp_load
|| group == disp_prefetch)
&& window_list->num_loads >= MAX_LOAD)
|| (group == disp_store
&& window_list->num_stores >= MAX_STORE))
return BIG;
return 1;
}
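/* Worked example (illustration only): suppose the window already holds
one 64-bit immediate (num_imm_64 == 1) and INSN carries three 32-bit
immediates.  Then num_imm_64 * 2 + num_imm32_operand == 2 + 3 == 5,
which exceeds MAX_IMM_32 (4) because a 64-bit immediate is accounted
as two 32-bit slots; the function returns BIG, and
fits_dispatch_window below rejects the insn for this window.  */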
/* This function returns true if insn satisfies dispatch rules on the
last window scheduled. */
static bool
fits_dispatch_window (rtx_insn *insn)
{
dispatch_windows *window_list = dispatch_window_list;
dispatch_windows *window_list_next = dispatch_window_list->next;
unsigned int num_restrict;
enum dispatch_group group = get_insn_group (insn);
enum insn_path path = get_insn_path (insn);
int sum;
/* Make disp_cmp and disp_jcc get scheduled at the latest.  These
instructions should be given the lowest priority in the Haifa
scheduler so that they are scheduled in the same dispatch window
as the instructions that reference them.  */
if (group == disp_jcc || group == disp_cmp)
return false;
/* Check nonrestricted. */
if (group == disp_no_group || group == disp_branch)
return true;
/* Get last dispatch window. */
if (window_list_next)
window_list = window_list_next;
if (window_list->window_num == 1)
{
sum = window_list->prev->window_size + window_list->window_size;
if (sum == 32
|| (ix86_min_insn_size (insn) + sum) >= 48)
/* Window 1 is full. Go for next window. */
return true;
}
num_restrict = count_num_restricted (insn, window_list);
if (num_restrict > num_allowable_groups[group])
return false;
/* See if it fits in the first window. */
if (window_list->window_num == 0)
{
/* The first window should have only single- and double-path
uops.  */
if (path == path_double
&& (window_list->num_uops + 2) > MAX_INSN)
return false;
else if (path != path_single)
return false;
}
return true;
}
/* Add an instruction INSN with NUM_UOPS micro-operations to the
dispatch window WINDOW_LIST. */
static void
add_insn_window (rtx_insn *insn, dispatch_windows *window_list, int num_uops)
{
int byte_len = ix86_min_insn_size (insn);
int num_insn = window_list->num_insn;
int imm_size;
sched_insn_info *window = window_list->window;
enum dispatch_group group = get_insn_group (insn);
enum insn_path path = get_insn_path (insn);
int num_imm_operand;
int num_imm32_operand;
int num_imm64_operand;
if (!window_list->violation && group != disp_cmp
&& !fits_dispatch_window (insn))
window_list->violation = true;
imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
&num_imm64_operand);
/* Initialize window with new instruction. */
window[num_insn].insn = insn;
window[num_insn].byte_len = byte_len;
window[num_insn].group = group;
window[num_insn].path = path;
window[num_insn].imm_bytes = imm_size;
window_list->window_size += byte_len;
window_list->num_insn = num_insn + 1;
window_list->num_uops = window_list->num_uops + num_uops;
window_list->imm_size += imm_size;
window_list->num_imm += num_imm_operand;
window_list->num_imm_32 += num_imm32_operand;
window_list->num_imm_64 += num_imm64_operand;
if (group == disp_store)
window_list->num_stores += 1;
else if (group == disp_load
|| group == disp_prefetch)
window_list->num_loads += 1;
else if (group == disp_load_store)
{
window_list->num_stores += 1;
window_list->num_loads += 1;
}
}
/* Adds a scheduled instruction, INSN, to the current dispatch window.
If the total bytes of instructions or the number of instructions in
the window exceeds what is allowable, it allocates a new window.  */
static void
add_to_dispatch_window (rtx_insn *insn)
{
int byte_len;
dispatch_windows *window_list;
dispatch_windows *next_list;
dispatch_windows *window0_list;
enum insn_path path;
enum dispatch_group insn_group;
bool insn_fits;
int num_insn;
int num_uops;
int window_num;
int insn_num_uops;
int sum;
if (INSN_CODE (insn) < 0)
return;
byte_len = ix86_min_insn_size (insn);
window_list = dispatch_window_list;
next_list = window_list->next;
path = get_insn_path (insn);
insn_group = get_insn_group (insn);
/* Get the last dispatch window. */
if (next_list)
window_list = dispatch_window_list->next;
if (path == path_single)
insn_num_uops = 1;
else if (path == path_double)
insn_num_uops = 2;
else
insn_num_uops = (int) path;
/* If the current window is full, get a new window.
Window number zero is full if MAX_INSN uops are scheduled in it.
Window number one is full if window zero's bytes plus window
one's bytes equal 32, or if the bytes of the new instruction
added to the total make it greater than 48, or if it already has
MAX_INSN instructions in it.  */
num_insn = window_list->num_insn;
num_uops = window_list->num_uops;
window_num = window_list->window_num;
insn_fits = fits_dispatch_window (insn);
if (num_insn >= MAX_INSN
|| num_uops + insn_num_uops > MAX_INSN
|| !(insn_fits))
{
window_num = ~window_num & 1;
window_list = allocate_next_window (window_num);
}
if (window_num == 0)
{
add_insn_window (insn, window_list, insn_num_uops);
if (window_list->num_insn >= MAX_INSN
&& insn_group == disp_branch)
{
process_end_window ();
return;
}
}
else if (window_num == 1)
{
window0_list = window_list->prev;
sum = window0_list->window_size + window_list->window_size;
if (sum == 32
|| (byte_len + sum) >= 48)
{
process_end_window ();
window_list = dispatch_window_list;
}
add_insn_window (insn, window_list, insn_num_uops);
}
else
gcc_unreachable ();
if (is_end_basic_block (insn_group))
{
/* End of basic block is reached; do end-of-basic-block processing.  */
process_end_window ();
return;
}
}
/* Print the dispatch window, WINDOW_NUM, to FILE. */
DEBUG_FUNCTION static void
debug_dispatch_window_file (FILE *file, int window_num)
{
dispatch_windows *list;
int i;
if (window_num == 0)
list = dispatch_window_list;
else
list = dispatch_window_list1;
fprintf (file, "Window #%d:\n", list->window_num);
fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
list->num_insn, list->num_uops, list->window_size);
fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
list->num_stores);
fprintf (file, " insn info:\n");
for (i = 0; i < MAX_INSN; i++)
{
if (!list->window[i].insn)
break;
fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
i, group_name[list->window[i].group],
i, (void *)list->window[i].insn,
i, list->window[i].path,
i, list->window[i].byte_len,
i, list->window[i].imm_bytes);
}
}
/* Print to stdout a dispatch window. */
DEBUG_FUNCTION void
debug_dispatch_window (int window_num)
{
debug_dispatch_window_file (stdout, window_num);
}
/* Print INSN dispatch information to FILE. */
DEBUG_FUNCTION static void
debug_insn_dispatch_info_file (FILE *file, rtx_insn *insn)
{
int byte_len;
enum insn_path path;
enum dispatch_group group;
int imm_size;
int num_imm_operand;
int num_imm32_operand;
int num_imm64_operand;
if (INSN_CODE (insn) < 0)
return;
byte_len = ix86_min_insn_size (insn);
path = get_insn_path (insn);
group = get_insn_group (insn);
imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
&num_imm64_operand);
fprintf (file, " insn info:\n");
fprintf (file, " group = %s, path = %d, byte_len = %d\n",
group_name[group], path, byte_len);
fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
}
/* Print to stdout the status of the ready list with respect to
dispatch windows. */
DEBUG_FUNCTION void
debug_ready_dispatch (void)
{
int i;
int no_ready = number_in_ready ();
fprintf (stdout, "Number of ready: %d\n", no_ready);
for (i = 0; i < no_ready; i++)
debug_insn_dispatch_info_file (stdout, get_ready_element (i));
}
/* This routine is the driver of the dispatch scheduler. */
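/* MODE selects the action: DISPATCH_INIT initializes the dispatch
window lists, while ADD_TO_DISPATCH_WINDOW accounts INSN into the
current window. */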
void
ix86_bd_do_dispatch (rtx_insn *insn, int mode)
{
if (mode == DISPATCH_INIT)
init_dispatch_sched ();
else if (mode == ADD_TO_DISPATCH_WINDOW)
add_to_dispatch_window (insn);
}
/* Return TRUE if Dispatch Scheduling is supported. */
bool
ix86_bd_has_dispatch (rtx_insn *insn, int action)
{
/* The current implementation of the dispatch scheduler models Bulldozer only. */
if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3
|| TARGET_BDVER4) && flag_dispatch_scheduler)
switch (action)
{
default:
return false;
case IS_DISPATCH_ON:
return true;
case IS_CMP:
return is_cmp (insn);
case DISPATCH_VIOLATION:
return dispatch_violation ();
case FITS_DISPATCH_WINDOW:
return fits_dispatch_window (insn);
}
return false;
}
/* Scheduler hooks for IA-32 which implement Core 2/i7 specific logic.
Copyright (C) 1988-2017 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "recog.h"
#include "target.h"
#include "rtl-iter.h"
#include "regset.h"
#include "sched-int.h"
/* Model the decoder of Core 2/i7.
The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
track the instruction fetch block boundaries and make sure that long
(9+ byte) instructions are assigned to D0. */
/* Maximum length of an insn that can be handled by
a secondary decoder unit. '8' for Core 2/i7. */
static int core2i7_secondary_decoder_max_insn_size;
/* Ifetch block size, i.e., number of bytes decoder reads per cycle.
'16' for Core 2/i7. */
static int core2i7_ifetch_block_size;
/* Maximum number of instructions the decoder can handle per cycle.
'6' for Core 2/i7. */
static int core2i7_ifetch_block_max_insns;
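/* With the Core 2/i7 defaults set in ix86_core2i7_init_hooks below
(8, 16 and 6 respectively), a 9-byte insn, for example, can only be
decoded by D0, so the filtering below masks it out of the ready list
unless it would be the first insn issued in the cycle. */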
typedef struct ix86_first_cycle_multipass_data_ *
ix86_first_cycle_multipass_data_t;
typedef const struct ix86_first_cycle_multipass_data_ *
const_ix86_first_cycle_multipass_data_t;
/* A variable to store target state across calls to max_issue within
one cycle. */
static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
*ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
/* Initialize DATA. */
static void
core2i7_first_cycle_multipass_init (void *_data)
{
ix86_first_cycle_multipass_data_t data
= (ix86_first_cycle_multipass_data_t) _data;
data->ifetch_block_len = 0;
data->ifetch_block_n_insns = 0;
data->ready_try_change = NULL;
data->ready_try_change_size = 0;
}
/* Advancing the cycle; reset ifetch block counts. */
static void
core2i7_dfa_post_advance_cycle (void)
{
ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
data->ifetch_block_len = 0;
data->ifetch_block_n_insns = 0;
}
/* Filter out insns from ready_try that the core will not be able to issue
on the current cycle due to decoder limitations. */
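/* E.g. once 14 of the 16 ifetch-block bytes are consumed, only insns
of at most 2 bytes still fit in this cycle; longer ready insns are
masked out. */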
static void
core2i7_first_cycle_multipass_filter_ready_try
(const_ix86_first_cycle_multipass_data_t data,
signed char *ready_try, int n_ready, bool first_cycle_insn_p)
{
while (n_ready--)
{
rtx_insn *insn;
int insn_size;
if (ready_try[n_ready])
continue;
insn = get_ready_element (n_ready);
insn_size = ix86_min_insn_size (insn);
if (/* If the insn is too long for a secondary decoder ... */
(!first_cycle_insn_p
&& insn_size > core2i7_secondary_decoder_max_insn_size)
/* ... or it would not fit into the ifetch block ... */
|| data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
/* ... or the decoder is full already ... */
|| data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
/* ... mask the insn out. */
{
ready_try[n_ready] = 1;
if (data->ready_try_change)
bitmap_set_bit (data->ready_try_change, n_ready);
}
}
}
/* Prepare for a new round of multipass lookahead scheduling. */
static void
core2i7_first_cycle_multipass_begin (void *_data,
signed char *ready_try, int n_ready,
bool first_cycle_insn_p)
{
ix86_first_cycle_multipass_data_t data
= (ix86_first_cycle_multipass_data_t) _data;
const_ix86_first_cycle_multipass_data_t prev_data
= ix86_first_cycle_multipass_data;
/* Restore the state from the end of the previous round. */
data->ifetch_block_len = prev_data->ifetch_block_len;
data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
/* Filter out instructions that cannot be issued on the current cycle due
to decoder restrictions. */
core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
first_cycle_insn_p);
}
/* INSN is being issued in the current solution. Account for its impact on
the decoder model. */
static void
core2i7_first_cycle_multipass_issue (void *_data,
signed char *ready_try, int n_ready,
rtx_insn *insn, const void *_prev_data)
{
ix86_first_cycle_multipass_data_t data
= (ix86_first_cycle_multipass_data_t) _data;
const_ix86_first_cycle_multipass_data_t prev_data
= (const_ix86_first_cycle_multipass_data_t) _prev_data;
int insn_size = ix86_min_insn_size (insn);
data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
&& data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
/* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
if (!data->ready_try_change)
{
data->ready_try_change = sbitmap_alloc (n_ready);
data->ready_try_change_size = n_ready;
}
else if (data->ready_try_change_size < n_ready)
{
data->ready_try_change = sbitmap_resize (data->ready_try_change,
n_ready, 0);
data->ready_try_change_size = n_ready;
}
bitmap_clear (data->ready_try_change);
/* Filter out insns from ready_try that the core will not be able to issue
on the current cycle due to decoder limitations. */
core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
false);
}
/* Revert the effect on ready_try. */
static void
core2i7_first_cycle_multipass_backtrack (const void *_data,
signed char *ready_try,
int n_ready ATTRIBUTE_UNUSED)
{
const_ix86_first_cycle_multipass_data_t data
= (const_ix86_first_cycle_multipass_data_t) _data;
unsigned int i = 0;
sbitmap_iterator sbi;
gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
{
ready_try[i] = 0;
}
}
/* Save the result of multipass lookahead scheduling for the next round. */
static void
core2i7_first_cycle_multipass_end (const void *_data)
{
const_ix86_first_cycle_multipass_data_t data
= (const_ix86_first_cycle_multipass_data_t) _data;
ix86_first_cycle_multipass_data_t next_data
= ix86_first_cycle_multipass_data;
if (data != NULL)
{
next_data->ifetch_block_len = data->ifetch_block_len;
next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
}
}
/* Deallocate target data. */
static void
core2i7_first_cycle_multipass_fini (void *_data)
{
ix86_first_cycle_multipass_data_t data
= (ix86_first_cycle_multipass_data_t) _data;
if (data->ready_try_change)
{
sbitmap_free (data->ready_try_change);
data->ready_try_change = NULL;
data->ready_try_change_size = 0;
}
}
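/* Install the decoder-model hooks above into the scheduler and set the
Core 2/i7 decoder parameters. */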
void
ix86_core2i7_init_hooks (void)
{
targetm.sched.dfa_post_advance_cycle
= core2i7_dfa_post_advance_cycle;
targetm.sched.first_cycle_multipass_init
= core2i7_first_cycle_multipass_init;
targetm.sched.first_cycle_multipass_begin
= core2i7_first_cycle_multipass_begin;
targetm.sched.first_cycle_multipass_issue
= core2i7_first_cycle_multipass_issue;
targetm.sched.first_cycle_multipass_backtrack
= core2i7_first_cycle_multipass_backtrack;
targetm.sched.first_cycle_multipass_end
= core2i7_first_cycle_multipass_end;
targetm.sched.first_cycle_multipass_fini
= core2i7_first_cycle_multipass_fini;
/* Set decoder parameters. */
core2i7_secondary_decoder_max_insn_size = 8;
core2i7_ifetch_block_size = 16;
core2i7_ifetch_block_max_insns = 6;
}
/* Scheduler hooks for IA-32 which implement CPU specific logic.
Copyright (C) 1988-2017 Free Software Foundation, Inc.
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "recog.h"
#include "target.h"
/* Return the maximum number of instructions a CPU can issue. */
int
ix86_issue_rate (void)
{
switch (ix86_tune)
{
case PROCESSOR_PENTIUM:
case PROCESSOR_LAKEMONT:
case PROCESSOR_BONNELL:
case PROCESSOR_SILVERMONT:
case PROCESSOR_KNL:
case PROCESSOR_KNM:
case PROCESSOR_INTEL:
case PROCESSOR_K6:
case PROCESSOR_BTVER2:
case PROCESSOR_PENTIUM4:
case PROCESSOR_NOCONA:
return 2;
case PROCESSOR_PENTIUMPRO:
case PROCESSOR_ATHLON:
case PROCESSOR_K8:
case PROCESSOR_AMDFAM10:
case PROCESSOR_GENERIC:
case PROCESSOR_BTVER1:
return 3;
case PROCESSOR_BDVER1:
case PROCESSOR_BDVER2:
case PROCESSOR_BDVER3:
case PROCESSOR_BDVER4:
case PROCESSOR_ZNVER1:
case PROCESSOR_CORE2:
case PROCESSOR_NEHALEM:
case PROCESSOR_SANDYBRIDGE:
case PROCESSOR_HASWELL:
return 4;
default:
return 1;
}
}
/* Return true iff USE_INSN has a memory address with operands set by
SET_INSN. */
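/* Illustrative example: with SET_INSN "mov %ebx, %eax" and USE_INSN
"mov (%eax), %ecx", the use insn's memory address depends on %eax
written by SET_INSN, so this returns true. */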
bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
int i;
extract_insn_cached (use_insn);
for (i = recog_data.n_operands - 1; i >= 0; --i)
if (MEM_P (recog_data.operand[i]))
{
rtx addr = XEXP (recog_data.operand[i], 0);
if (modified_in_p (addr, set_insn) != 0)
{
/* No AGI stall if SET_INSN is a push or pop and USE_INSN
has SP based memory (unless index reg is modified in a pop). */
rtx set = single_set (set_insn);
if (set
&& (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
|| pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
{
struct ix86_address parts;
if (ix86_decompose_address (addr, &parts)
&& parts.base == stack_pointer_rtx
&& (parts.index == NULL_RTX
|| MEM_P (SET_DEST (set))
|| !modified_in_p (parts.index, set_insn)))
return false;
}
return true;
}
return false;
}
return false;
}
/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
by DEP_INSN and nothing set by DEP_INSN. */
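/* E.g. DEP_INSN "cmp %eax, %ebx" sets only the flags and INSN "je .L1"
reads them, so the pair qualifies; the Pentium case below then models
the compare/jump pairing by setting the cost to 0. */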
static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
{
rtx set, set2;
/* Simplify the test for uninteresting insns. */
if (insn_type != TYPE_SETCC
&& insn_type != TYPE_ICMOV
&& insn_type != TYPE_FCMOV
&& insn_type != TYPE_IBR)
return false;
if ((set = single_set (dep_insn)) != 0)
{
set = SET_DEST (set);
set2 = NULL_RTX;
}
else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
&& XVECLEN (PATTERN (dep_insn), 0) == 2
&& GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
&& GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
{
set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
}
else
return false;
if (!REG_P (set) || REGNO (set) != FLAGS_REG)
return false;
/* This test is true if the dependent insn reads the flags but
not any other potentially set register. */
if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
return false;
if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
return false;
return true;
}
/* Helper function for exact_store_load_dependency.
Return true if ADDR is found in INSN. */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
enum rtx_code code;
const char *format_ptr;
int i, j;
code = GET_CODE (insn);
switch (code)
{
case MEM:
if (rtx_equal_p (addr, insn))
return true;
break;
case REG:
CASE_CONST_ANY:
case SYMBOL_REF:
case CODE_LABEL:
case PC:
case CC0:
case EXPR_LIST:
return false;
default:
break;
}
format_ptr = GET_RTX_FORMAT (code);
for (i = 0; i < GET_RTX_LENGTH (code); i++)
{
switch (*format_ptr++)
{
case 'e':
if (exact_dependency_1 (addr, XEXP (insn, i)))
return true;
break;
case 'E':
for (j = 0; j < XVECLEN (insn, i); j++)
if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
return true;
break;
}
}
return false;
}
/* Return true if there is an exact store/load dependency, i.e. the same
memory address is used in both insns. */
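/* E.g. a store "mov %eax, 8(%esp)" followed by a load "mov 8(%esp), %ebx"
uses the same address expression in both, so this returns true
(illustrative). */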
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
rtx set1, set2;
set1 = single_set (store);
if (!set1)
return false;
if (!MEM_P (SET_DEST (set1)))
return false;
set2 = single_set (load);
if (!set2)
return false;
if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
return true;
return false;
}
/* This function corrects the value of COST (latency) based on the relationship
between INSN and DEP_INSN through a dependence of type DEP_TYPE, and of
strength DW. It should return the new value.
On x86 CPUs this is most commonly used to model the fact that values of
registers used to compute the address of a memory operand need to be ready
earlier than values of registers used in the actual operation. */
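/* Illustrative example: on PROCESSOR_PENTIUM below, if DEP_INSN writes
%eax and INSN loads from (%eax), the AGI check adds one cycle to COST;
on the out-of-order cores, a load whose address does not depend on
DEP_INSN instead gets its COST reduced. */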
int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
unsigned int)
{
enum attr_type insn_type, dep_insn_type;
enum attr_memory memory;
rtx set, set2;
int dep_insn_code_number;
/* Anti and output dependencies have zero cost on all CPUs. */
if (dep_type != 0)
return 0;
dep_insn_code_number = recog_memoized (dep_insn);
/* If we can't recognize the insns, we can't really do anything. */
if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
return cost;
insn_type = get_attr_type (insn);
dep_insn_type = get_attr_type (dep_insn);
switch (ix86_tune)
{
case PROCESSOR_PENTIUM:
case PROCESSOR_LAKEMONT:
/* Address Generation Interlock adds a cycle of latency. */
if (insn_type == TYPE_LEA)
{
rtx addr = PATTERN (insn);
if (GET_CODE (addr) == PARALLEL)
addr = XVECEXP (addr, 0, 0);
gcc_assert (GET_CODE (addr) == SET);
addr = SET_SRC (addr);
if (modified_in_p (addr, dep_insn))
cost += 1;
}
else if (ix86_agi_dependent (dep_insn, insn))
cost += 1;
/* ??? Compares pair with jump/setcc. */
if (ix86_flags_dependent (insn, dep_insn, insn_type))
cost = 0;
/* Floating point stores require the value to be ready one cycle earlier. */
if (insn_type == TYPE_FMOV
&& get_attr_memory (insn) == MEMORY_STORE
&& !ix86_agi_dependent (dep_insn, insn))
cost += 1;
break;
case PROCESSOR_PENTIUMPRO:
/* INT->FP conversion is expensive. */
if (get_attr_fp_int_src (dep_insn))
cost += 5;
/* There is one cycle of extra latency between an FP op and a store. */
if (insn_type == TYPE_FMOV
&& (set = single_set (dep_insn)) != NULL_RTX
&& (set2 = single_set (insn)) != NULL_RTX
&& rtx_equal_p (SET_DEST (set), SET_SRC (set2))
&& MEM_P (SET_DEST (set2)))
cost += 1;
memory = get_attr_memory (insn);
/* Show the ability of the reorder buffer to hide the latency of a load by
executing it in parallel with the previous instruction, in case the
previous instruction is not needed to compute the address. */
if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
&& !ix86_agi_dependent (dep_insn, insn))
{
/* Claim moves take one cycle, as the core can issue one load at a time
and the next load can start a cycle later. */
if (dep_insn_type == TYPE_IMOV
|| dep_insn_type == TYPE_FMOV)
cost = 1;
else if (cost > 1)
cost--;
}
break;
case PROCESSOR_K6:
/* The esp dependency is resolved before
the instruction is really finished. */
if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
&& (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
return 1;
/* INT->FP conversion is expensive. */
if (get_attr_fp_int_src (dep_insn))
cost += 5;
memory = get_attr_memory (insn);
/* Show the ability of the reorder buffer to hide the latency of a load by
executing it in parallel with the previous instruction, in case the
previous instruction is not needed to compute the address. */
if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
&& !ix86_agi_dependent (dep_insn, insn))
{
/* Claim moves take one cycle, as the core can issue one load at a time
and the next load can start a cycle later. */
if (dep_insn_type == TYPE_IMOV
|| dep_insn_type == TYPE_FMOV)
cost = 1;
else if (cost > 2)
cost -= 2;
else
cost = 1;
}
break;
case PROCESSOR_AMDFAM10:
case PROCESSOR_BDVER1:
case PROCESSOR_BDVER2:
case PROCESSOR_BDVER3:
case PROCESSOR_BDVER4:
case PROCESSOR_ZNVER1:
case PROCESSOR_BTVER1:
case PROCESSOR_BTVER2:
case PROCESSOR_GENERIC:
/* The stack engine allows push and pop instructions to execute in parallel. */
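/* (E.g. two back-to-back pushes appear to depend on each other only
through the stack pointer; the stack engine tracks the adjustment,
so the dependence costs nothing.) */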
if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
&& (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
return 0;
/* FALLTHRU */
case PROCESSOR_ATHLON:
case PROCESSOR_K8:
memory = get_attr_memory (insn);
/* Show the ability of the reorder buffer to hide the latency of a load by
executing it in parallel with the previous instruction, in case the
previous instruction is not needed to compute the address. */
if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
&& !ix86_agi_dependent (dep_insn, insn))
{
enum attr_unit unit = get_attr_unit (insn);
int loadcost = 3;
/* Because of the difference between the lengths of the integer and
floating unit pipeline preparation stages, the memory operands for
floating point are cheaper.
??? For Athlon the difference is most probably 2. */
if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
loadcost = 3;
else
loadcost = TARGET_ATHLON ? 2 : 0;
if (cost >= loadcost)
cost -= loadcost;
else
cost = 0;
}
break;
case PROCESSOR_CORE2:
case PROCESSOR_NEHALEM:
case PROCESSOR_SANDYBRIDGE:
case PROCESSOR_HASWELL:
/* The stack engine allows push and pop instructions to execute in parallel. */
if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
&& (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
return 0;
memory = get_attr_memory (insn);
/* Show the ability of the reorder buffer to hide the latency of a load by
executing it in parallel with the previous instruction, in case the
previous instruction is not needed to compute the address. */
if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
&& !ix86_agi_dependent (dep_insn, insn))
{
if (cost >= 4)
cost -= 4;
else
cost = 0;
}
break;
case PROCESSOR_SILVERMONT:
case PROCESSOR_KNL:
case PROCESSOR_KNM:
case PROCESSOR_INTEL:
if (!reload_completed)
return cost;
/* Increase cost of integer loads. */
memory = get_attr_memory (dep_insn);
if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
{
enum attr_unit unit = get_attr_unit (dep_insn);
if (unit == UNIT_INTEGER && cost == 1)
{
if (memory == MEMORY_LOAD)
cost = 3;
else
{
/* Increase the cost of ld/st for short int types only, because of the
store-forwarding issue. */
rtx set = single_set (dep_insn);
if (set && (GET_MODE (SET_DEST (set)) == QImode
|| GET_MODE (SET_DEST (set)) == HImode))
{
/* Increase the cost of the store/load insn if an exact dependence
exists and it is a load insn. */
enum attr_memory insn_memory = get_attr_memory (insn);
if (insn_memory == MEMORY_LOAD
&& exact_store_load_dependency (dep_insn, insn))
cost = 3;
}
}
}
}
break;
default:
break;
}
return cost;
}
/* How many alternative schedules to try. This should be as wide as the
scheduling freedom in the DFA, but no wider. Making this value too
large results in extra work for the scheduler. */
int
ia32_multipass_dfa_lookahead (void)
{
/* Generally, we want haifa-sched:max_issue() to look ahead as far as
the number of instructions that can be executed in one cycle, i.e.,
issue_rate. */
if (reload_completed)
return ix86_issue_rate ();
/* Don't use lookahead for the pre-reload schedule, to save compile time. */
return 0;
}
/* Return true if the target platform supports macro-fusion. */
bool
ix86_macro_fusion_p ()
{
return TARGET_FUSE_CMP_AND_BRANCH;
}
/* Check whether the current microarchitecture supports macro fusion
for the insn pair "CONDGEN + CONDJMP". Refer to the
"Intel Architectures Optimization Reference Manual". */
bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
rtx src, dest;
enum rtx_code ccode;
rtx compare_set = NULL_RTX, test_if, cond;
rtx alu_set = NULL_RTX, addr = NULL_RTX;
if (!any_condjump_p (condjmp))
return false;
unsigned int condreg1, condreg2;
rtx cc_reg_1;
targetm.fixed_condition_code_regs (&condreg1, &condreg2);
cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
|| !condgen
|| !modified_in_p (cc_reg_1, condgen))
return false;
if (get_attr_type (condgen) != TYPE_TEST
&& get_attr_type (condgen) != TYPE_ICMP
&& get_attr_type (condgen) != TYPE_INCDEC
&& get_attr_type (condgen) != TYPE_ALU)
return false;
compare_set = single_set (condgen);
if (compare_set == NULL_RTX
&& !TARGET_FUSE_ALU_AND_BRANCH)
return false;
if (compare_set == NULL_RTX)
{
int i;
rtx pat = PATTERN (condgen);
for (i = 0; i < XVECLEN (pat, 0); i++)
if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
{
rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
if (GET_CODE (set_src) == COMPARE)
compare_set = XVECEXP (pat, 0, i);
else
alu_set = XVECEXP (pat, 0, i);
}
}
if (compare_set == NULL_RTX)
return false;
src = SET_SRC (compare_set);
if (GET_CODE (src) != COMPARE)
return false;
/* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
supported. */
if ((MEM_P (XEXP (src, 0))
&& CONST_INT_P (XEXP (src, 1)))
|| (MEM_P (XEXP (src, 1))
&& CONST_INT_P (XEXP (src, 0))))
return false;
/* No fusion for RIP-relative addresses. */
if (MEM_P (XEXP (src, 0)))
addr = XEXP (XEXP (src, 0), 0);
else if (MEM_P (XEXP (src, 1)))
addr = XEXP (XEXP (src, 1), 0);
if (addr)
{
ix86_address parts;
int ok = ix86_decompose_address (addr, &parts);
gcc_assert (ok);
if (ix86_rip_relative_addr_p (&parts))
return false;
}
test_if = SET_SRC (pc_set (condjmp));
cond = XEXP (test_if, 0);
ccode = GET_CODE (cond);
/* Check whether the conditional jump uses the Sign or Overflow flags. */
if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
&& (ccode == GE
|| ccode == GT
|| ccode == LE
|| ccode == LT))
return false;
/* Return true for TYPE_TEST and TYPE_ICMP. */
if (get_attr_type (condgen) == TYPE_TEST
|| get_attr_type (condgen) == TYPE_ICMP)
return true;
/* The following handles the macro-fusion case of alu + jmp. */
if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
return false;
/* No fusion for an alu op with a memory destination operand. */
dest = SET_DEST (alu_set);
if (MEM_P (dest))
return false;
/* Macro-fusion for inc/dec + unsigned conditional jump is not
supported. */
if (get_attr_type (condgen) == TYPE_INCDEC
&& (ccode == GEU
|| ccode == GTU
|| ccode == LEU
|| ccode == LTU))
return false;
return true;
}