Commit 828573a5 by Uros Bizjak

Fix TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL handling.

The reason for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL on AMD target is
only insn size, as advised in e.g. Software Optimization Guide for the
AMD Family 15h Processors [1], section 7.1.2, where it is said:

--quote--
7.1.2 Reduce Instruction Size Optimization

Reduce the size of instructions when possible.

Rationale

Using smaller instruction sizes improves instruction fetch throughput.
Specific examples include the following:

* In SIMD code, use the single-precision (PS) form of instructions
instead of the double-precision (PD) form. For example, for register
to register moves, MOVAPS achieves the same result as MOVAPD, but uses
one less byte to encode the instruction and has no prefix byte. Other
examples in which single-precision forms can be substituted for
double-precision forms include MOVUPS, MOVNTPS, XORPS, ORPS, ANDPS,
and SHUFPS.
...
--/quote--

Please note that this optimization applies only to non-AVX forms, as
demonstrated by:

   0:   0f 28 c8                movaps %xmm0,%xmm1
   3:   66 0f 28 c8             movapd %xmm0,%xmm1
   7:   c5 f8 28 d1             vmovaps %xmm1,%xmm2
   b:   c5 f9 28 d1             vmovapd %xmm1,%xmm2

Also note that MOVDQA is missing in the above optimization. It is
harmful to substitute MOVDQA with MOVAPS, as it can (and does)
introduce +1 cycle forwarding penalty between FLT (FPA/FPM) and INT
(VALU) FP clusters.

[1] https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf
parent 5a28e272
2020-01-31 Uroš Bizjak <ubizjak@gmail.com>
* config/i386/i386.md (*movoi_internal_avx): Do not check for
TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL. Remove MODE_V8SF handling.
(*movti_internal): Do not check for
TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL.
(*movtf_internal): Move check for TARGET_SSE2 and size optimization
just after check for TARGET_AVX.
(*movdf_internal): Ditto.
* config/i386/mmx.md (*mov<mode>_internal): Do not check for
TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL.
* config/i386/sse.md (mov<mode>_internal): Only check
TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL with V2DFmode. Move check
for TARGET_SSE2 and size optimization just after check for TARGET_AVX.
(<sse>_andnot<mode>3<mask_name>): Move check for
TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL after check for TARGET_AVX.
(<code><mode>3<mask_name>): Ditto.
(*andnot<mode>3): Ditto.
(*andnottf3): Ditto.
(*<code><mode>3): Ditto.
(*<code>tf3): Ditto.
(*andnot<VI:mode>3): Remove
TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL handling.
(<mask_codefor><code><VI48_AVX_AVX512F:mode>3<mask_name>): Ditto.
(*<code><VI12_AVX_AVX512F:mode>3): Ditto.
(sse4_1_blendv<ssemodesuffix>): Ditto.
* config/i386/x86-tune.def (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL):
Explain that tune applies to 128bit instructions only.
2020-01-31 Kwok Cheung Yeung <kcy@codesourcery.com>
* config/gcn/mkoffload.c (process_asm): Add sgpr_count and vgpr_count
......
......@@ -1949,18 +1949,14 @@
if (misaligned_operand (operands[0], OImode)
|| misaligned_operand (operands[1], OImode))
{
if (get_attr_mode (insn) == MODE_V8SF)
return "vmovups\t{%1, %0|%0, %1}";
else if (get_attr_mode (insn) == MODE_XI)
if (get_attr_mode (insn) == MODE_XI)
return "vmovdqu32\t{%1, %0|%0, %1}";
else
return "vmovdqu\t{%1, %0|%0, %1}";
}
else
{
if (get_attr_mode (insn) == MODE_V8SF)
return "vmovaps\t{%1, %0|%0, %1}";
else if (get_attr_mode (insn) == MODE_XI)
if (get_attr_mode (insn) == MODE_XI)
return "vmovdqa32\t{%1, %0|%0, %1}";
else
return "vmovdqa\t{%1, %0|%0, %1}";
......@@ -1980,8 +1976,6 @@
(and (eq_attr "alternative" "1")
(match_test "TARGET_AVX512VL"))
(const_string "XI")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V8SF")
]
(const_string "OI")))])
......@@ -2060,11 +2054,10 @@
(match_test "TARGET_AVX")
(const_string "TI")
(ior (not (match_test "TARGET_SSE2"))
(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(and (eq_attr "alternative" "5")
(match_test "TARGET_SSE_TYPELESS_STORES"))))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
(match_test "optimize_function_for_size_p (cfun)")
(and (eq_attr "alternative" "5")
(match_test "TARGET_SSE_TYPELESS_STORES"))
(const_string "V4SF")
]
(const_string "TI")))
......@@ -2243,12 +2236,10 @@
(cond [(ior (match_operand 0 "ext_sse_reg_operand")
(match_operand 1 "ext_sse_reg_operand"))
(const_string "TI")
(ior (not (match_test "TARGET_SSE2"))
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(const_string "V4SF")
(match_test "TARGET_AVX")
(const_string "TI")
(match_test "optimize_function_for_size_p (cfun)")
(ior (not (match_test "TARGET_SSE2"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
]
(const_string "TI"))
......@@ -2453,12 +2444,10 @@
(cond [(ior (match_operand 0 "ext_sse_reg_operand")
(match_operand 1 "ext_sse_reg_operand"))
(const_string "XI")
(ior (not (match_test "TARGET_SSE2"))
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(const_string "V4SF")
(match_test "TARGET_AVX")
(const_string "TI")
(match_test "optimize_function_for_size_p (cfun)")
(ior (not (match_test "TARGET_SSE2"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
]
(const_string "TI"))
......@@ -3324,14 +3313,14 @@
(const_string "DI")
(match_test "TARGET_AVX")
(const_string "TI")
(ior (not (match_test "TARGET_SSE2"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V4SF")
(and (eq_attr "alternative" "2")
(match_test "TARGET_SSE_TYPELESS_STORES"))
(const_string "V4SF")
(ior (not (match_test "TARGET_SSE2"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
]
(const_string "TI")))])
......@@ -3541,14 +3530,13 @@
/* xorps is one byte shorter for non-AVX targets. */
(eq_attr "alternative" "12,16")
(cond [(not (match_test "TARGET_SSE2"))
(const_string "V4SF")
(and (match_test "TARGET_AVX512F")
(not (match_test "TARGET_PREFER_AVX256")))
(cond [(and (match_test "TARGET_AVX512F")
(not (match_test "TARGET_PREFER_AVX256")))
(const_string "XI")
(match_test "TARGET_AVX")
(const_string "V2DF")
(match_test "optimize_function_for_size_p (cfun)")
(ior (not (match_test "TARGET_SSE2"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
(match_test "TARGET_SSE_LOAD0_BY_PXOR")
(const_string "TI")
......@@ -3566,15 +3554,15 @@
(ior (match_operand 0 "ext_sse_reg_operand")
(match_operand 1 "ext_sse_reg_operand")))
(const_string "V8DF")
(match_test "TARGET_AVX")
(const_string "DF")
(ior (not (match_test "TARGET_SSE2"))
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V4SF")
(match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
(const_string "V2DF")
(match_test "TARGET_AVX")
(const_string "DF")
(match_test "optimize_function_for_size_p (cfun)")
(const_string "V4SF")
]
(const_string "DF"))
......@@ -3723,16 +3711,15 @@
(eq_attr "alternative" "11")
(const_string "DI")
(eq_attr "alternative" "5")
(cond [(not (match_test "TARGET_SSE2"))
(const_string "V4SF")
(and (match_test "TARGET_AVX512F")
(not (match_test "TARGET_PREFER_AVX256")))
(cond [(and (match_test "TARGET_AVX512F")
(not (match_test "TARGET_PREFER_AVX256")))
(const_string "V16SF")
(match_test "TARGET_AVX")
(const_string "V4SF")
(match_test "optimize_function_for_size_p (cfun)")
(ior (not (match_test "TARGET_SSE2"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
(match_test "TARGET_SSE_LOAD0_BY_PXOR")
(match_test "TARGET_SSE_LOAD0_BY_PXOR")
(const_string "TI")
]
(const_string "V4SF"))
......
......@@ -195,11 +195,7 @@
(match_test "<MODE>mode == V2SFmode")
(const_string "V4SF")
(ior (not (match_test "TARGET_SSE2"))
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(const_string "V4SF")
(match_test "TARGET_AVX")
(const_string "TI")
(match_test "optimize_function_for_size_p (cfun)")
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
]
(const_string "TI"))
......
......@@ -1118,13 +1118,15 @@
(const_string "<sseinsnmode>")
(match_test "TARGET_AVX")
(const_string "<sseinsnmode>")
(ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(and (eq_attr "alternative" "3")
(match_test "TARGET_SSE_TYPELESS_STORES")))
(const_string "<ssePSmode>")
(ior (not (match_test "TARGET_SSE2"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
(and (match_test "<MODE>mode == V2DFmode")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(const_string "V4SF")
(and (eq_attr "alternative" "3")
(match_test "TARGET_SSE_TYPELESS_STORES"))
(const_string "V4SF")
(and (eq_attr "alternative" "0")
(match_test "TARGET_SSE_LOAD0_BY_PXOR"))
(const_string "TI")
......@@ -3555,16 +3557,14 @@
(const_string "<sseintvecmode2>")
(eq_attr "alternative" "3")
(const_string "<sseintvecmode2>")
(and (match_test "<MODE_SIZE> == 16")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(const_string "<ssePSmode>")
(match_test "TARGET_AVX")
(const_string "<MODE>")
(match_test "optimize_function_for_size_p (cfun)")
(const_string "V4SF")
]
(const_string "<MODE>")))])
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V4SF")
]
(const_string "<MODE>")))])
(define_insn "<sse>_andnot<mode>3<mask_name>"
[(set (match_operand:VF_512 0 "register_operand" "=v")
......@@ -3673,15 +3673,14 @@
(const_string "<sseintvecmode2>")
(eq_attr "alternative" "3")
(const_string "<sseintvecmode2>")
(and (match_test "<MODE_SIZE> == 16")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(const_string "<ssePSmode>")
(match_test "TARGET_AVX")
(const_string "<MODE>")
(match_test "optimize_function_for_size_p (cfun)")
(const_string "V4SF")
]
(const_string "<MODE>")))])
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V4SF")
]
(const_string "<MODE>")))])
(define_insn "*<code><mode>3<mask_name>"
[(set (match_operand:VF_512 0 "register_operand" "=v")
......@@ -3822,15 +3821,14 @@
(if_then_else (match_test "TARGET_AVX512DQ")
(const_string "<avx512fvecmode>")
(const_string "XI"))
(and (match_test "<MODE_SIZE> == 16")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(const_string "V4SF")
(match_test "TARGET_AVX")
(const_string "<ssevecmode>")
(match_test "optimize_function_for_size_p (cfun)")
(const_string "V4SF")
]
(const_string "<ssevecmode>")))])
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V4SF")
]
(const_string "<ssevecmode>")))])
(define_insn "*andnottf3"
[(set (match_operand:TF 0 "register_operand" "=x,x,v,v")
......@@ -3879,15 +3877,15 @@
(const_string "TI")
(eq_attr "alternative" "3")
(const_string "XI")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V4SF")
(match_test "TARGET_AVX")
(const_string "TI")
(ior (not (match_test "TARGET_SSE2"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
]
(const_string "TI")))])
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V4SF")
]
(const_string "TI")))])
(define_insn "*<code><mode>3"
[(set (match_operand:MODEF 0 "register_operand" "=x,x,v,v")
......@@ -3946,15 +3944,14 @@
(if_then_else (match_test "TARGET_AVX512DQ")
(const_string "<avx512fvecmode>")
(const_string "XI"))
(and (match_test "<MODE_SIZE> == 16")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(const_string "V4SF")
(match_test "TARGET_AVX")
(const_string "<ssevecmode>")
(match_test "optimize_function_for_size_p (cfun)")
(const_string "V4SF")
]
(const_string "<ssevecmode>")))])
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V4SF")
]
(const_string "<ssevecmode>")))])
(define_expand "<code>tf3"
[(set (match_operand:TF 0 "register_operand")
......@@ -4011,15 +4008,15 @@
(const_string "TI")
(eq_attr "alternative" "3")
(const_string "QI")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V4SF")
(match_test "TARGET_AVX")
(const_string "TI")
(ior (not (match_test "TARGET_SSE2"))
(match_test "optimize_function_for_size_p (cfun)"))
(const_string "V4SF")
]
(const_string "TI")))])
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V4SF")
]
(const_string "TI")))])
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
......@@ -13007,10 +13004,7 @@
(const_string "*")))
(set_attr "prefix" "orig,vex,evex")
(set (attr "mode")
(cond [(and (match_test "<MODE_SIZE> == 16")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(const_string "<ssePSmode>")
(match_test "TARGET_AVX2")
(cond [(match_test "TARGET_AVX2")
(const_string "<sseinsnmode>")
(match_test "TARGET_AVX")
(if_then_else
......@@ -13148,10 +13142,7 @@
(const_string "*")))
(set_attr "prefix" "<mask_prefix3>,evex")
(set (attr "mode")
(cond [(and (match_test "<MODE_SIZE> == 16")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(const_string "<ssePSmode>")
(match_test "TARGET_AVX2")
(cond [(match_test "TARGET_AVX2")
(const_string "<sseinsnmode>")
(match_test "TARGET_AVX")
(if_then_else
......@@ -13244,10 +13235,7 @@
(const_string "*")))
(set_attr "prefix" "orig,vex,evex")
(set (attr "mode")
(cond [(and (match_test "<MODE_SIZE> == 16")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(const_string "<ssePSmode>")
(match_test "TARGET_AVX2")
(cond [(match_test "TARGET_AVX2")
(const_string "<sseinsnmode>")
(match_test "TARGET_AVX")
(if_then_else
......@@ -17190,14 +17178,14 @@
(set_attr "prefix" "orig,orig,vex")
(set_attr "btver2_decode" "vector,vector,vector")
(set (attr "mode")
(cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V4SF")
(match_test "TARGET_AVX")
(cond [(match_test "TARGET_AVX")
(const_string "<ssevecmode>")
(match_test "optimize_function_for_size_p (cfun)")
(const_string "V4SF")
]
(const_string "<ssevecmode>")))])
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
(const_string "V4SF")
]
(const_string "<ssevecmode>")))])
(define_insn_and_split "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt"
[(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
......
......@@ -366,15 +366,15 @@ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
| m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
| m_TREMONT | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_GENERIC)
/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
of a sequence loading registers by parts. */
/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
instead of a sequence loading registers by parts. */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
| m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
| m_TREMONT | m_BDVER | m_ZNVER | m_GENERIC)
/* Use packed single precision instructions where posisble. I.e. movups instead
of movupd. */
/* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single
precision 128bit instructions instead of double where possible. */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
m_BDVER | m_ZNVER)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment