Commit e38341a8 by Sylvia Taylor Committed by Kyrylo Tkachov

[patch1/2][arm][PR90317]: fix sha1 patterns

This patch fixes:

1) An ICE (internal compiler error) triggered when using the
crypto_sha1h intrinsic, due to an incompatible mode used for
zero_extend. The zero_extend was removed, as it is not a good choice
for vector modes; using an equivalent single-register mode such as TI
(128 bits) instead of V4SI produces extra instructions, making it
inefficient.

This affects gcc version 8 and above.

2) Incorrect combine optimizations made due to vec_select usage
in the sha1 patterns on arm. The patterns should only combine
a vec select within a sha1h<op> instruction when the lane is 0.

This affects gcc version 5 and above.

- Fixed by explicitly declaring the valid const int for such
optimizations. For cases when the lane is not 0, the vector
lane selection now occurs in a separate instruction (e.g. vmov)
prior to the sha1h<op>.

- Updated the sha1h testcases on arm to check for additional
cases with custom vector lane selection.

The intrinsic functions for the sha1 patterns have also been
simplified which seems to eliminate extra vmovs like:
- vmov.i32 q8, #0.


2019-07-18  Sylvia Taylor  <sylvia.taylor@arm.com>

        PR target/90317
        * config/arm/arm_neon.h
        (vsha1h_u32): Refactor.
        (vsha1cq_u32): Likewise.
        (vsha1pq_u32): Likewise.
        (vsha1mq_u32): Likewise.
        * config/arm/crypto.md:
        (crypto_sha1h): Remove zero extend, correct vec select.
        (crypto_sha1c): Correct vec select.
        (crypto_sha1m): Likewise.
        (crypto_sha1p): Likewise.

        * gcc.target/arm/crypto-vsha1cq_u32.c (foo): Change return type to
        uint32_t.
        (GET_LANE, TEST_SHA1C_VEC_SELECT): New.
        * gcc.target/arm/crypto-vsha1h_u32.c (foo): Change return type to
        uint32_t.
        (GET_LANE, TEST_SHA1H_VEC_SELECT): New.
        * gcc.target/arm/crypto-vsha1mq_u32.c (foo): Change return type to
        uint32_t.
        (GET_LANE, TEST_SHA1M_VEC_SELECT): New.
        * gcc.target/arm/crypto-vsha1pq_u32.c (foo): Change return type to
        uint32_t.
        (GET_LANE, TEST_SHA1P_VEC_SELECT): New.

From-SVN: r273574
parent 979526c9
2019-07-18 Sylvia Taylor <sylvia.taylor@arm.com>
PR target/90317
* config/arm/arm_neon.h
(vsha1h_u32): Refactor.
(vsha1cq_u32): Likewise.
(vsha1pq_u32): Likewise.
(vsha1mq_u32): Likewise.
* config/arm/crypto.md:
(crypto_sha1h): Remove zero extend, correct vec select.
(crypto_sha1c): Correct vec select.
(crypto_sha1m): Likewise.
(crypto_sha1p): Likewise.
2019-07-18 Richard Earnshaw <rearnsha@arm.com> 2019-07-18 Richard Earnshaw <rearnsha@arm.com>
* config/arm/predicates.md (arm_borrow_operation): New predicate. * config/arm/predicates.md (arm_borrow_operation): New predicate.
......
...@@ -16938,37 +16938,32 @@ __extension__ extern __inline uint32_t ...@@ -16938,37 +16938,32 @@ __extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vsha1h_u32 (uint32_t __hash_e) vsha1h_u32 (uint32_t __hash_e)
{ {
uint32x4_t __t = vdupq_n_u32 (0); return vgetq_lane_u32 (__builtin_arm_crypto_sha1h (vdupq_n_u32 (__hash_e)),
__t = vsetq_lane_u32 (__hash_e, __t, 0); 0);
__t = __builtin_arm_crypto_sha1h (__t);
return vgetq_lane_u32 (__t, 0);
} }
__extension__ extern __inline uint32x4_t __extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
{ {
uint32x4_t __t = vdupq_n_u32 (0); return __builtin_arm_crypto_sha1c (__hash_abcd, vdupq_n_u32 (__hash_e),
__t = vsetq_lane_u32 (__hash_e, __t, 0); __wk);
return __builtin_arm_crypto_sha1c (__hash_abcd, __t, __wk);
} }
__extension__ extern __inline uint32x4_t __extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
{ {
uint32x4_t __t = vdupq_n_u32 (0); return __builtin_arm_crypto_sha1p (__hash_abcd, vdupq_n_u32 (__hash_e),
__t = vsetq_lane_u32 (__hash_e, __t, 0); __wk);
return __builtin_arm_crypto_sha1p (__hash_abcd, __t, __wk);
} }
__extension__ extern __inline uint32x4_t __extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
{ {
uint32x4_t __t = vdupq_n_u32 (0); return __builtin_arm_crypto_sha1m (__hash_abcd, vdupq_n_u32 (__hash_e),
__t = vsetq_lane_u32 (__hash_e, __t, 0); __wk);
return __builtin_arm_crypto_sha1m (__hash_abcd, __t, __wk);
} }
__extension__ extern __inline uint32x4_t __extension__ extern __inline uint32x4_t
......
...@@ -105,14 +105,18 @@ ...@@ -105,14 +105,18 @@
[(set_attr "type" "<crypto_type>")] [(set_attr "type" "<crypto_type>")]
) )
/* The vec_select operation always selects index 0 from the lower V2SI subreg
of the V4SI, adjusted for endianness. Required due to neon_vget_lane and
neon_set_lane that change the element ordering in memory for big-endian. */
(define_insn "crypto_sha1h" (define_insn "crypto_sha1h"
[(set (match_operand:V4SI 0 "register_operand" "=w") [(set (match_operand:V4SI 0 "register_operand" "=w")
(zero_extend:V4SI (unspec:V4SI
(unspec:SI [(vec_select:SI [(vec_select:SI
(match_operand:V4SI 1 "register_operand" "w") (match_operand:V4SI 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]))] (parallel [(match_operand:SI 2 "immediate_operand" "i")]))]
UNSPEC_SHA1H)))] UNSPEC_SHA1H))]
"TARGET_CRYPTO" "TARGET_CRYPTO && INTVAL (operands[2]) == NEON_ENDIAN_LANE_N (V2SImode, 0)"
"sha1h.32\\t%q0, %q1" "sha1h.32\\t%q0, %q1"
[(set_attr "type" "crypto_sha1_fast")] [(set_attr "type" "crypto_sha1_fast")]
) )
...@@ -127,6 +131,10 @@ ...@@ -127,6 +131,10 @@
[(set_attr "type" "crypto_pmull")] [(set_attr "type" "crypto_pmull")]
) )
/* The vec_select operation always selects index 0 from the lower V2SI subreg
of the V4SI, adjusted for endianness. Required due to neon_vget_lane and
neon_set_lane that change the element ordering in memory for big-endian. */
(define_insn "crypto_<crypto_pattern>" (define_insn "crypto_<crypto_pattern>"
[(set (match_operand:V4SI 0 "register_operand" "=w") [(set (match_operand:V4SI 0 "register_operand" "=w")
(unspec:<crypto_mode> (unspec:<crypto_mode>
...@@ -136,7 +144,7 @@ ...@@ -136,7 +144,7 @@
(parallel [(match_operand:SI 4 "immediate_operand" "i")])) (parallel [(match_operand:SI 4 "immediate_operand" "i")]))
(match_operand:<crypto_mode> 3 "register_operand" "w")] (match_operand:<crypto_mode> 3 "register_operand" "w")]
CRYPTO_SELECTING))] CRYPTO_SELECTING))]
"TARGET_CRYPTO" "TARGET_CRYPTO && INTVAL (operands[4]) == NEON_ENDIAN_LANE_N (V2SImode, 0)"
"<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2, %q3" "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2, %q3"
[(set_attr "type" "<crypto_type>")] [(set_attr "type" "<crypto_type>")]
) )
2019-07-18 Sylvia Taylor <sylvia.taylor@arm.com>
PR target/90317
* gcc.target/arm/crypto-vsha1cq_u32.c (foo): Change return type to
uint32_t.
(GET_LANE, TEST_SHA1C_VEC_SELECT): New.
* gcc.target/arm/crypto-vsha1h_u32.c (foo): Change return type to
uint32_t.
(GET_LANE, TEST_SHA1H_VEC_SELECT): New.
* gcc.target/arm/crypto-vsha1mq_u32.c (foo): Change return type to
uint32_t.
(GET_LANE, TEST_SHA1M_VEC_SELECT): New.
* gcc.target/arm/crypto-vsha1pq_u32.c (foo): Change return type to
uint32_t.
(GET_LANE, TEST_SHA1P_VEC_SELECT): New.
2019-07-18 Jan Hubicka <hubicka@ucw.cz> 2019-07-18 Jan Hubicka <hubicka@ucw.cz>
* g++.dg/lto/alias-5_0.C: New testcase. * g++.dg/lto/alias-5_0.C: New testcase.
......
/* { dg-do compile } */ /* { dg-do compile } */
/* { dg-require-effective-target arm_crypto_ok } */ /* { dg-require-effective-target arm_crypto_ok } */
/* { dg-add-options arm_crypto } */ /* { dg-add-options arm_crypto } */
/* { dg-additional-options "-O3" } */
#include "arm_neon.h" #include "arm_neon.h"
int uint32_t foo (void)
foo (void)
{ {
uint32_t hash = 0xdeadbeef; uint32_t hash = 0xdeadbeef;
uint32x4_t a = {0, 1, 2, 3}; uint32x4_t a = {0, 1, 2, 3};
...@@ -15,4 +16,20 @@ foo (void) ...@@ -15,4 +16,20 @@ foo (void)
return res[0]; return res[0];
} }
/* { dg-final { scan-assembler "sha1c.32\tq\[0-9\]+, q\[0-9\]+" } } */ #define GET_LANE(lane) \
uint32x4_t foo_lane##lane (uint32x4_t val,uint32x4_t a, uint32x4_t b)\
{ \
return vsha1cq_u32 (a, vgetq_lane_u32 (val, lane), b); \
}
#define TEST_SHA1C_VEC_SELECT(FUNC) \
FUNC (0) \
FUNC (1) \
FUNC (2) \
FUNC (3) \
TEST_SHA1C_VEC_SELECT (GET_LANE)
/* { dg-final { scan-assembler-times {sha1c.32\tq[0-9]+, q[0-9]+} 5 } } */
/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */
/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 4 } } */
/* { dg-do compile } */ /* { dg-do compile } */
/* { dg-require-effective-target arm_crypto_ok } */ /* { dg-require-effective-target arm_crypto_ok } */
/* { dg-add-options arm_crypto } */ /* { dg-add-options arm_crypto } */
/* { dg-additional-options "-O3" } */
#include "arm_neon.h" #include "arm_neon.h"
int uint32_t foo (void)
foo (void)
{ {
uint32_t val = 0xdeadbeef; uint32_t val = 0xdeadbeef;
return vsha1h_u32 (val); return vsha1h_u32 (val);
} }
/* { dg-final { scan-assembler "sha1h.32\tq\[0-9\]+, q\[0-9\]+" } } */ #define GET_LANE(lane) \
uint32_t foo_lane##lane (uint32x4_t val) \
{ \
return vsha1h_u32 (vgetq_lane_u32 (val, lane)); \
}
#define TEST_SHA1H_VEC_SELECT(FUNC) \
FUNC (0) \
FUNC (1) \
FUNC (2) \
FUNC (3) \
TEST_SHA1H_VEC_SELECT (GET_LANE)
/* { dg-final { scan-assembler-times {sha1h.32\tq[0-9]+, q[0-9]+} 5 } } */
/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */
/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 8 } } */
/* { dg-do compile } */ /* { dg-do compile } */
/* { dg-require-effective-target arm_crypto_ok } */ /* { dg-require-effective-target arm_crypto_ok } */
/* { dg-add-options arm_crypto } */ /* { dg-add-options arm_crypto } */
/* { dg-additional-options "-O3" } */
#include "arm_neon.h" #include "arm_neon.h"
int uint32_t foo (void)
foo (void)
{ {
uint32_t hash = 0xdeadbeef; uint32_t hash = 0xdeadbeef;
uint32x4_t a = {0, 1, 2, 3}; uint32x4_t a = {0, 1, 2, 3};
...@@ -15,4 +16,20 @@ foo (void) ...@@ -15,4 +16,20 @@ foo (void)
return res[0]; return res[0];
} }
/* { dg-final { scan-assembler "sha1m.32\tq\[0-9\]+, q\[0-9\]+" } } */ #define GET_LANE(lane) \
uint32x4_t foo_lane##lane (uint32x4_t val,uint32x4_t a, uint32x4_t b)\
{ \
return vsha1mq_u32 (a, vgetq_lane_u32 (val, lane), b); \
}
#define TEST_SHA1M_VEC_SELECT(FUNC) \
FUNC (0) \
FUNC (1) \
FUNC (2) \
FUNC (3) \
TEST_SHA1M_VEC_SELECT (GET_LANE)
/* { dg-final { scan-assembler-times {sha1m.32\tq[0-9]+, q[0-9]+} 5 } } */
/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */
/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 4 } } */
/* { dg-do compile } */ /* { dg-do compile } */
/* { dg-require-effective-target arm_crypto_ok } */ /* { dg-require-effective-target arm_crypto_ok } */
/* { dg-add-options arm_crypto } */ /* { dg-add-options arm_crypto } */
/* { dg-additional-options "-O3" } */
#include "arm_neon.h" #include "arm_neon.h"
int uint32_t foo (void)
foo (void)
{ {
uint32_t hash = 0xdeadbeef; uint32_t hash = 0xdeadbeef;
uint32x4_t a = {0, 1, 2, 3}; uint32x4_t a = {0, 1, 2, 3};
...@@ -15,4 +16,20 @@ foo (void) ...@@ -15,4 +16,20 @@ foo (void)
return res[0]; return res[0];
} }
/* { dg-final { scan-assembler "sha1p.32\tq\[0-9\]+, q\[0-9\]+" } } */ #define GET_LANE(lane) \
uint32x4_t foo_lane##lane (uint32x4_t val,uint32x4_t a, uint32x4_t b)\
{ \
return vsha1pq_u32 (a, vgetq_lane_u32 (val, lane), b); \
}
#define TEST_SHA1P_VEC_SELECT(FUNC) \
FUNC (0) \
FUNC (1) \
FUNC (2) \
FUNC (3) \
TEST_SHA1P_VEC_SELECT (GET_LANE)
/* { dg-final { scan-assembler-times {sha1p.32\tq[0-9]+, q[0-9]+} 5 } } */
/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */
/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 4 } } */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment