Commit a6fc00da by Benedikt Huber, committed by Philipp Tomsich

aarch64-builtins.c: Builtins for rsqrt and rsqrtf.

2015-11-06  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
	    Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>

	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
	* config/aarch64/aarch64-protos.h: Declare.
	* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
	frsqrts.
	* config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
	* config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
	applicable.
	* config/aarch64/aarch64.md: Added enum entries.
	* config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
	* testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for
	assembly checks.
	* testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure
	frsqrts and frsqrte are not emitted.
	* testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and
	frsqrte are emitted.
	* testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.

Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>

Co-Authored-By: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>

From-SVN: r229866
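As a sketch of the kind of source this change targets (not part of the patch itself): with -ffast-math, and on a core whose tuning sets the new recip_sqrt flag, a reciprocal of a square root such as

    /* 1.0 / sqrt (x): a candidate for the reciprocal square root estimate.  */
    double
    inv_sqrt (double x)
    {
      return 1.0 / __builtin_sqrt (x);
    }

can now be expanded as an frsqrte initial estimate followed by frsqrts/fmul Newton-Raphson steps (see aarch64_emit_swrsqrt below) instead of a full square root and division.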
@@ -324,6 +324,11 @@ enum aarch64_builtins
  AARCH64_BUILTIN_GET_FPSR,
  AARCH64_BUILTIN_SET_FPSR,
  AARCH64_BUILTIN_RSQRT_DF,
  AARCH64_BUILTIN_RSQRT_SF,
  AARCH64_BUILTIN_RSQRT_V2DF,
  AARCH64_BUILTIN_RSQRT_V2SF,
  AARCH64_BUILTIN_RSQRT_V4SF,
  AARCH64_SIMD_BUILTIN_BASE,
  AARCH64_SIMD_BUILTIN_LANE_CHECK,
#include "aarch64-simd-builtins.def"
@@ -822,6 +827,46 @@ aarch64_init_crc32_builtins ()
    }
}

/* Add builtins for reciprocal square root.  */

void
aarch64_init_builtin_rsqrt (void)
{
  tree fndecl = NULL;
  tree ftype = NULL;

  tree V2SF_type_node = build_vector_type (float_type_node, 2);
  tree V2DF_type_node = build_vector_type (double_type_node, 2);
  tree V4SF_type_node = build_vector_type (float_type_node, 4);

  struct builtin_decls_data
  {
    tree type_node;
    const char *builtin_name;
    int function_code;
  };

  builtin_decls_data bdda[] =
  {
    { double_type_node, "__builtin_aarch64_rsqrt_df", AARCH64_BUILTIN_RSQRT_DF },
    { float_type_node, "__builtin_aarch64_rsqrt_sf", AARCH64_BUILTIN_RSQRT_SF },
    { V2DF_type_node, "__builtin_aarch64_rsqrt_v2df", AARCH64_BUILTIN_RSQRT_V2DF },
    { V2SF_type_node, "__builtin_aarch64_rsqrt_v2sf", AARCH64_BUILTIN_RSQRT_V2SF },
    { V4SF_type_node, "__builtin_aarch64_rsqrt_v4sf", AARCH64_BUILTIN_RSQRT_V4SF }
  };

  builtin_decls_data *bdd = bdda;
  builtin_decls_data *bdd_end = bdd + (sizeof (bdda) / sizeof (builtin_decls_data));

  for (; bdd < bdd_end; bdd++)
  {
    ftype = build_function_type_list (bdd->type_node, bdd->type_node, NULL_TREE);
    fndecl = add_builtin_function (bdd->builtin_name,
      ftype, bdd->function_code, BUILT_IN_MD, NULL, NULL_TREE);
    aarch64_builtin_decls[bdd->function_code] = fndecl;
  }
}
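Each registered builtin takes a single operand and returns the same type. A minimal sketch of direct use (the wrapper name is hypothetical; normally the compiler inserts the builtin itself through the TARGET_BUILTIN_RECIPROCAL hook defined further down in this patch):

    /* Hypothetical direct call; the builtin approximates 1.0 / sqrt (x)
       using the sequence emitted by aarch64_emit_swrsqrt.  */
    double
    approx_rsqrt (double x)
    {
      return __builtin_aarch64_rsqrt_df (x);
    }
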
void
aarch64_init_builtins (void)
{
@@ -853,6 +898,7 @@ aarch64_init_builtins (void)
  aarch64_init_simd_builtins ();
  aarch64_init_crc32_builtins ();
  aarch64_init_builtin_rsqrt ();
}
tree
@@ -1116,6 +1162,44 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
  return target;
}

/* Function to expand reciprocal square root builtins.  */

static rtx
aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
{
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  rtx op0 = expand_normal (arg0);

  rtx (*gen) (rtx, rtx);

  switch (fcode)
    {
    case AARCH64_BUILTIN_RSQRT_DF:
      gen = gen_aarch64_rsqrt_df2;
      break;
    case AARCH64_BUILTIN_RSQRT_SF:
      gen = gen_aarch64_rsqrt_sf2;
      break;
    case AARCH64_BUILTIN_RSQRT_V2DF:
      gen = gen_aarch64_rsqrt_v2df2;
      break;
    case AARCH64_BUILTIN_RSQRT_V2SF:
      gen = gen_aarch64_rsqrt_v2sf2;
      break;
    case AARCH64_BUILTIN_RSQRT_V4SF:
      gen = gen_aarch64_rsqrt_v4sf2;
      break;
    default: gcc_unreachable ();
    }

  if (!target)
    target = gen_reg_rtx (GET_MODE (op0));

  emit_insn (gen (target, op0));

  return target;
}
/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient.  */
rtx
@@ -1163,6 +1247,13 @@ aarch64_expand_builtin (tree exp,
  else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
    return aarch64_crc32_expand_builtin (fcode, exp, target);

  if (fcode == AARCH64_BUILTIN_RSQRT_DF
      || fcode == AARCH64_BUILTIN_RSQRT_SF
      || fcode == AARCH64_BUILTIN_RSQRT_V2DF
      || fcode == AARCH64_BUILTIN_RSQRT_V2SF
      || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
    return aarch64_expand_builtin_rsqrt (fcode, exp, target);

  gcc_unreachable ();
}
@@ -1320,6 +1411,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
  return NULL_TREE;
}

/* Return builtin for reciprocal square root.  */

tree
aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
{
  if (md_fn)
    {
      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
    }
  else
    {
      if (fn == BUILT_IN_SQRT)
        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
      if (fn == BUILT_IN_SQRTF)
        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
    }
  return NULL_TREE;
}
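The md_fn branch covers the AdvSIMD vector sqrt builtins, so code that divides by a vector square root can also use the estimate. A sketch, not from this patch, of scalar source that may end up on the vector (v4sf) path once auto-vectorized; the function name and flags are illustrative:

    /* With e.g. -O3 -ffast-math on a core whose tuning enables recip_sqrt,
       the division by sqrtf can be replaced by the rsqrt builtin.  */
    void
    scale_by_inv_sqrt (float *restrict out, const float *restrict in, int n)
    {
      for (int i = 0; i < n; i++)
        out[i] = 1.0f / __builtin_sqrtf (in[i]);
    }
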
#undef VAR1
#define VAR1(T, N, MAP, A) \
  case AARCH64_SIMD_BUILTIN_##T##_##N##A:
...
@@ -352,6 +352,8 @@ void aarch64_register_pragmas (void);
void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
void aarch64_emit_swrsqrt (rtx, rtx);

/* Initialize builtins for SIMD intrinsics.  */
void init_aarch64_simd_builtins (void);
@@ -403,6 +405,8 @@ rtx aarch64_expand_builtin (tree exp,
                            int ignore ATTRIBUTE_UNUSED);
tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED);
tree aarch64_builtin_rsqrt (unsigned int, bool);

tree
aarch64_builtin_vectorized_function (tree fndecl,
                                     tree type_out,
...
@@ -382,6 +382,33 @@
  [(set_attr "type" "neon_fp_mul_d_scalar_q")]
)

(define_insn "aarch64_rsqrte_<mode>2"
[(set (match_operand:VALLF 0 "register_operand" "=w")
(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
UNSPEC_RSQRTE))]
"TARGET_SIMD"
"frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
[(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
(define_insn "aarch64_rsqrts_<mode>3"
[(set (match_operand:VALLF 0 "register_operand" "=w")
(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
(match_operand:VALLF 2 "register_operand" "w")]
UNSPEC_RSQRTS))]
"TARGET_SIMD"
"frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
[(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
(define_expand "aarch64_rsqrt_<mode>2"
[(set (match_operand:VALLF 0 "register_operand" "=w")
(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
UNSPEC_RSQRT))]
"TARGET_SIMD"
{
aarch64_emit_swrsqrt (operands[0], operands[1]);
DONE;
})
(define_insn "*aarch64_mul3_elt_to_64v2df" (define_insn "*aarch64_mul3_elt_to_64v2df"
[(set (match_operand:DF 0 "register_operand" "=w") [(set (match_operand:DF 0 "register_operand" "=w")
(mult:DF (mult:DF
......
@@ -29,4 +29,5 @@
   AARCH64_TUNE_ to give an enum name.  */
AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
@@ -403,7 +403,8 @@ static const struct tune_params cortexa57_tunings =
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
   | AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags.  */
};

static const struct tune_params cortexa72_tunings =
@@ -470,7 +471,7 @@ static const struct tune_params xgene1_tunings =
  2, /* min_div_recip_mul_sf.  */
  2, /* min_div_recip_mul_df.  */
  tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model.  */
  (AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags.  */
};

/* Support for fine-grained override of the tuning structures.  */
@@ -7031,6 +7032,105 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
  return aarch64_tune_params.memmov_cost;
}

/* Function to decide when to use
   reciprocal square root builtins.  */

static tree
aarch64_builtin_reciprocal (unsigned int fn,
                            bool md_fn,
                            bool)
{
  if (flag_trapping_math
      || !flag_unsafe_math_optimizations
      || optimize_size
      || ! (aarch64_tune_params.extra_tuning_flags
            & AARCH64_EXTRA_TUNE_RECIP_SQRT))
    {
      return NULL_TREE;
    }

  return aarch64_builtin_rsqrt (fn, md_fn);
}

typedef rtx (*rsqrte_type) (rtx, rtx);

/* Select reciprocal square root initial estimate
   insn depending on machine mode.  */

rsqrte_type
get_rsqrte_type (machine_mode mode)
{
  switch (mode)
  {
    case DFmode:   return gen_aarch64_rsqrte_df2;
    case SFmode:   return gen_aarch64_rsqrte_sf2;
    case V2DFmode: return gen_aarch64_rsqrte_v2df2;
    case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
    case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
    default: gcc_unreachable ();
  }
}

typedef rtx (*rsqrts_type) (rtx, rtx, rtx);

/* Select reciprocal square root Newton-Raphson step
   insn depending on machine mode.  */

rsqrts_type
get_rsqrts_type (machine_mode mode)
{
  switch (mode)
  {
    case DFmode:   return gen_aarch64_rsqrts_df3;
    case SFmode:   return gen_aarch64_rsqrts_sf3;
    case V2DFmode: return gen_aarch64_rsqrts_v2df3;
    case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
    case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
    default: gcc_unreachable ();
  }
}

/* Emit instruction sequence to compute
   reciprocal square root.  Use two Newton-Raphson steps
   for single precision and three for double precision.  */

void
aarch64_emit_swrsqrt (rtx dst, rtx src)
{
  machine_mode mode = GET_MODE (src);
  gcc_assert (
    mode == SFmode || mode == V2SFmode || mode == V4SFmode
    || mode == DFmode || mode == V2DFmode);

  rtx xsrc = gen_reg_rtx (mode);
  emit_move_insn (xsrc, src);
  rtx x0 = gen_reg_rtx (mode);

  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));

  bool double_mode = (mode == DFmode || mode == V2DFmode);

  int iterations = double_mode ? 3 : 2;

  if (flag_mrecip_low_precision_sqrt)
    iterations--;

  for (int i = 0; i < iterations; ++i)
    {
      rtx x1 = gen_reg_rtx (mode);
      rtx x2 = gen_reg_rtx (mode);
      rtx x3 = gen_reg_rtx (mode);
      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));

      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));

      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
      x0 = x1;
    }

  emit_move_insn (dst, x0);
}
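A short sketch of the arithmetic behind the loop, using the architectural definition of FRSQRTS (it computes (3 - a*b) / 2): starting from the FRSQRTE estimate for an input d,

    x_0 \approx \frac{1}{\sqrt{d}}, \qquad
    x_{n+1} = x_n \cdot \frac{3 - d\,x_n^2}{2}

which is the Newton-Raphson step for f(x) = 1/x^2 - d, realised above as x2 = x0*x0 (FMUL), x3 = frsqrts (d, x2), x1 = x0*x3 (FMUL). Each step roughly doubles the number of correct bits, hence two iterations for single precision and three for double, and one fewer with -mlow-precision-recip-sqrt.
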
/* Return the number of instructions that can be issued per cycle.  */
static int
aarch64_sched_issue_rate (void)
@@ -13455,6 +13555,9 @@ aarch64_promoted_type (const_tree t)
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
...
@@ -126,6 +126,9 @@
    UNSPEC_VSTRUCTDUMMY
    UNSPEC_SP_SET
    UNSPEC_SP_TEST
    UNSPEC_RSQRT
    UNSPEC_RSQRTE
    UNSPEC_RSQRTS
])

(define_c_enum "unspecv" [
...
@@ -148,3 +148,8 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
mpc-relative-literal-loads
Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
PC relative literal loads.

mlow-precision-recip-sqrt
Common Var(flag_mrecip_low_precision_sqrt) Optimization
When calculating a sqrt approximation, run fewer steps.
This reduces precision, but can result in faster computation.
@@ -521,6 +521,7 @@ Objective-C and Objective-C++ Dialects}.
-mtls-size=@var{size} @gol
-mfix-cortex-a53-835769 -mno-fix-cortex-a53-835769 @gol
-mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt @gol
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}

@emph{Adapteva Epiphany Options}
@@ -12519,6 +12520,17 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
This erratum workaround is made at link time and this will only pass the
corresponding flag to the linker.

@item -mlow-precision-recip-sqrt
@itemx -mno-low-precision-recip-sqrt
@opindex mlow-precision-recip-sqrt
@opindex mno-low-precision-recip-sqrt
The reciprocal square root estimate uses two steps instead of three for
double precision, and one step instead of two for single precision,
reducing both latency and precision.  This is only relevant if
@option{-ffast-math} enables the reciprocal square root estimate
instructions, which in turn depends on the target processor.

@item -march=@var{name}
@opindex march
Specify the name of the target architecture, optionally suffixed by one or
...