Commit f24f4c15 by Richard Sandiford, committed by Richard Sandiford

Rework constant subreg folds and handle more variable-length cases

This patch rewrites the way simplify_subreg handles constants.
It uses native_encode/native_decode routines similar to those used for
the tree-level handling of VIEW_CONVERT_EXPR, meaning that we can
move between rtx constants and their target memory image.

The main point of this patch is to support taking constant-length
vector subregs of VLA (variable-length) vectors, beyond the very simple
cases that were already handled.  Many of the new tests failed for
variable-length vectors before the patch.

The boolean side is tested more by the upcoming SVE ACLE work.
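
As a rough illustration of the encode/decode idea, here is a standalone
sketch (not the GCC routines themselves; the real code works on rtx
constants, vec<target_unit> buffers and arbitrary modes):

#include <cassert>
#include <cstdint>
#include <vector>

/* Standalone sketch: encode a 32-bit constant into "target bytes" in a
   chosen byte order, then decode it back.  The real native_encode_rtx and
   native_decode_rtx do the same for integer, floating-point, fixed-point
   and vector constants of any mode.  */

static std::vector<uint8_t>
encode32 (uint32_t value, bool big_endian)
{
  std::vector<uint8_t> bytes (4);
  for (unsigned int i = 0; i < 4; ++i)
    {
      unsigned int lsb = i * 8;			  /* bit position of byte I */
      unsigned int pos = big_endian ? 3 - i : i;  /* its place in memory */
      bytes[pos] = (value >> lsb) & 0xff;
    }
  return bytes;
}

static uint32_t
decode32 (const std::vector<uint8_t> &bytes, bool big_endian)
{
  uint32_t value = 0;
  for (unsigned int i = 0; i < 4; ++i)
    {
      unsigned int pos = big_endian ? 3 - i : i;
      value |= uint32_t (bytes[pos]) << (i * 8);
    }
  return value;
}

int
main ()
{
  /* The memory image of a constant fully determines its value, so the
     round trip is lossless in either byte order.  */
  assert (decode32 (encode32 (0x12345678, true), true) == 0x12345678);
  assert (decode32 (encode32 (0x12345678, false), false) == 0x12345678);
  return 0;
}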

2019-09-19  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* defaults.h (TARGET_UNIT): New macro.
	(target_unit): New type.
	* rtl.h (native_encode_rtx, native_decode_rtx)
	(native_decode_vector_rtx, subreg_size_lsb): Declare.
	(subreg_lsb_1): Turn into an inline wrapper around subreg_size_lsb.
	* rtlanal.c (subreg_lsb_1): Delete.
	(subreg_size_lsb): New function.
	* simplify-rtx.c: Include rtx-vector-builder.h.
	(simplify_immed_subreg): Delete.
	(native_encode_rtx, native_decode_vector_rtx, native_decode_rtx)
	(simplify_const_vector_byte_offset, simplify_const_vector_subreg): New
	functions.
	(simplify_subreg): Use them.
	(test_vector_subregs_modes, test_vector_subregs_repeating)
	(test_vector_subregs_fore_back, test_vector_subregs_stepped)
	(test_vector_subregs): New functions.
	(test_vector_ops): Call test_vector_subregs for integer vector
	modes with at least 2 elements.

From-SVN: r275959
@@ -1459,4 +1459,18 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#define DWARF_GNAT_ENCODINGS_DEFAULT DWARF_GNAT_ENCODINGS_GDB
#endif
#ifndef USED_FOR_TARGET
/* Done this way to keep gengtype happy. */
#if BITS_PER_UNIT == 8
#define TARGET_UNIT uint8_t
#elif BITS_PER_UNIT == 16
#define TARGET_UNIT uint16_t
#elif BITS_PER_UNIT == 32
#define TARGET_UNIT uint32_t
#else
#error Unknown BITS_PER_UNIT
#endif
typedef TARGET_UNIT target_unit;
#endif
#endif /* ! GCC_DEFAULTS_H */
@@ -2406,12 +2406,30 @@ extern int rtx_cost (rtx, machine_mode, enum rtx_code, int, bool);
extern int address_cost (rtx, machine_mode, addr_space_t, bool);
extern void get_full_rtx_cost (rtx, machine_mode, enum rtx_code, int,
struct full_rtx_costs *);
extern bool native_encode_rtx (machine_mode, rtx, vec<target_unit> &,
unsigned int, unsigned int);
extern rtx native_decode_rtx (machine_mode, vec<target_unit>,
unsigned int);
extern rtx native_decode_vector_rtx (machine_mode, vec<target_unit>,
unsigned int, unsigned int, unsigned int);
extern poly_uint64 subreg_lsb (const_rtx);
extern poly_uint64 subreg_lsb_1 (machine_mode, machine_mode, poly_uint64);
extern poly_uint64 subreg_size_lsb (poly_uint64, poly_uint64, poly_uint64);
extern poly_uint64 subreg_size_offset_from_lsb (poly_uint64, poly_uint64,
poly_uint64);
extern bool read_modify_subreg_p (const_rtx);
/* Given a subreg's OUTER_MODE, INNER_MODE, and SUBREG_BYTE, return the
bit offset at which the subreg begins (counting from the least significant
bit of the operand). */
inline poly_uint64
subreg_lsb_1 (machine_mode outer_mode, machine_mode inner_mode,
poly_uint64 subreg_byte)
{
return subreg_size_lsb (GET_MODE_SIZE (outer_mode),
GET_MODE_SIZE (inner_mode), subreg_byte);
}
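
As a worked example of the subreg_size_lsb semantics, here is a standalone
sketch that assumes BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN and 8-bit bytes;
the real function also handles the mixed-endian word layout and
poly_uint64 sizes:

#include <cassert>

/* Standalone sketch: bit position of the lsb of an OUTER_BYTES-byte subreg
   taken at byte SUBREG_BYTE of an INNER_BYTES-byte value, for a uniformly
   little-endian or uniformly big-endian target.  */

static unsigned int
sketch_subreg_lsb (unsigned int outer_bytes, unsigned int inner_bytes,
		   unsigned int subreg_byte, bool big_endian)
{
  if (outer_bytes > inner_bytes)
    /* Paradoxical subregs always start at bit 0.  */
    return 0;
  unsigned int byte_pos = (big_endian
			   ? inner_bytes - (subreg_byte + outer_bytes)
			   : subreg_byte);
  return byte_pos * 8;
}

int
main ()
{
  /* An SImode subreg of a DImode value at byte offset 4 reads bits
     32..63 on little-endian but bits 0..31 on big-endian.  */
  assert (sketch_subreg_lsb (4, 8, 4, false) == 32);
  assert (sketch_subreg_lsb (4, 8, 4, true) == 0);
  return 0;
}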
/* Return the subreg byte offset for a subreg whose outer mode is
OUTER_MODE, whose inner mode is INNER_MODE, and where there are
LSB_SHIFT *bits* between the lsb of the outer value and the lsb of
......
@@ -3637,23 +3637,31 @@ loc_mentioned_in_p (rtx *loc, const_rtx in)
return 0;
}
/* Helper function for subreg_lsb. Given a subreg's OUTER_MODE, INNER_MODE,
and SUBREG_BYTE, return the bit offset where the subreg begins
(counting from the least significant bit of the operand). */
/* Reinterpret a subreg as a bit extraction from an integer and return
the position of the least significant bit of the extracted value.
In other words, if the extraction were performed as a shift right
and mask, return the number of bits to shift right.
The outer value of the subreg has OUTER_BYTES bytes and starts at
byte offset SUBREG_BYTE within an inner value of INNER_BYTES bytes. */
poly_uint64
subreg_lsb_1 (machine_mode outer_mode,
machine_mode inner_mode,
poly_uint64 subreg_byte)
subreg_size_lsb (poly_uint64 outer_bytes,
poly_uint64 inner_bytes,
poly_uint64 subreg_byte)
{
poly_uint64 subreg_end, trailing_bytes, byte_pos;
/* A paradoxical subreg begins at bit position 0. */
if (paradoxical_subreg_p (outer_mode, inner_mode))
return 0;
gcc_checking_assert (ordered_p (outer_bytes, inner_bytes));
if (maybe_gt (outer_bytes, inner_bytes))
{
gcc_checking_assert (known_eq (subreg_byte, 0U));
return 0;
}
subreg_end = subreg_byte + GET_MODE_SIZE (outer_mode);
trailing_bytes = GET_MODE_SIZE (inner_mode) - subreg_end;
subreg_end = subreg_byte + outer_bytes;
trailing_bytes = inner_bytes - subreg_end;
if (WORDS_BIG_ENDIAN && BYTES_BIG_ENDIAN)
byte_pos = trailing_bytes;
else if (!WORDS_BIG_ENDIAN && !BYTES_BIG_ENDIAN)
......
@@ -6130,342 +6130,466 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode,
return 0;
}
/* Evaluate a SUBREG of a CONST_INT or CONST_WIDE_INT or CONST_DOUBLE
or CONST_FIXED or CONST_VECTOR, returning another CONST_INT or
CONST_WIDE_INT or CONST_DOUBLE or CONST_FIXED or CONST_VECTOR.
/* Try to calculate NUM_BYTES bytes of the target memory image of X,
starting at byte FIRST_BYTE. Return true on success and add the
bytes to BYTES, such that each byte has BITS_PER_UNIT bits and such
that the bytes follow target memory order. Leave BYTES unmodified
on failure.
Works by unpacking INNER_BYTES bytes of OP into a collection of 8-bit values
represented as a little-endian array of 'unsigned char', selecting by BYTE,
and then repacking them again for OUTERMODE. If OP is a CONST_VECTOR,
FIRST_ELEM is the number of the first element to extract, otherwise
FIRST_ELEM is ignored. */
MODE is the mode of X. The caller must reserve NUM_BYTES bytes in
BYTES before calling this function. */
static rtx
simplify_immed_subreg (fixed_size_mode outermode, rtx op,
machine_mode innermode, unsigned int byte,
unsigned int first_elem, unsigned int inner_bytes)
bool
native_encode_rtx (machine_mode mode, rtx x, vec<target_unit> &bytes,
unsigned int first_byte, unsigned int num_bytes)
{
enum {
value_bit = 8,
value_mask = (1 << value_bit) - 1
};
unsigned char value[MAX_BITSIZE_MODE_ANY_MODE / value_bit];
int value_start;
int i;
int elem;
int num_elem;
rtx * elems;
int elem_bitsize;
rtx result_s = NULL;
rtvec result_v = NULL;
enum mode_class outer_class;
scalar_mode outer_submode;
int max_bitsize;
/* Check the mode is sensible. */
gcc_assert (GET_MODE (x) == VOIDmode
? is_a <scalar_int_mode> (mode)
: mode == GET_MODE (x));
/* Some ports misuse CCmode. */
if (GET_MODE_CLASS (outermode) == MODE_CC && CONST_INT_P (op))
return op;
if (GET_CODE (x) == CONST_VECTOR)
{
/* CONST_VECTOR_ELT follows target memory order, so no shuffling
is necessary. The only complication is that MODE_VECTOR_BOOL
vectors can have several elements per byte. */
unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode),
GET_MODE_NUNITS (mode));
unsigned int elt = first_byte * BITS_PER_UNIT / elt_bits;
if (elt_bits < BITS_PER_UNIT)
{
/* This is the only case in which elements can be smaller than
a byte. */
gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
for (unsigned int i = 0; i < num_bytes; ++i)
{
target_unit value = 0;
for (unsigned int j = 0; j < BITS_PER_UNIT; j += elt_bits)
{
value |= (INTVAL (CONST_VECTOR_ELT (x, elt)) & 1) << j;
elt += 1;
}
bytes.quick_push (value);
}
return true;
}
/* We have no way to represent a complex constant at the rtl level. */
if (COMPLEX_MODE_P (outermode))
return NULL_RTX;
unsigned int start = bytes.length ();
unsigned int elt_bytes = GET_MODE_UNIT_SIZE (mode);
/* Make FIRST_BYTE relative to ELT. */
first_byte %= elt_bytes;
while (num_bytes > 0)
{
/* Work out how many bytes we want from element ELT. */
unsigned int chunk_bytes = MIN (num_bytes, elt_bytes - first_byte);
if (!native_encode_rtx (GET_MODE_INNER (mode),
CONST_VECTOR_ELT (x, elt), bytes,
first_byte, chunk_bytes))
{
bytes.truncate (start);
return false;
}
elt += 1;
first_byte = 0;
num_bytes -= chunk_bytes;
}
return true;
}
/* We support any size mode. */
max_bitsize = MAX (GET_MODE_BITSIZE (outermode),
inner_bytes * BITS_PER_UNIT);
/* All subsequent cases are limited to scalars. */
scalar_mode smode;
if (!is_a <scalar_mode> (mode, &smode))
return false;
/* Unpack the value. */
/* Make sure that the region is in range. */
unsigned int end_byte = first_byte + num_bytes;
unsigned int mode_bytes = GET_MODE_SIZE (smode);
gcc_assert (end_byte <= mode_bytes);
if (GET_CODE (op) == CONST_VECTOR)
if (CONST_SCALAR_INT_P (x))
{
num_elem = CEIL (inner_bytes, GET_MODE_UNIT_SIZE (innermode));
elem_bitsize = GET_MODE_UNIT_BITSIZE (innermode);
/* The target memory layout is affected by both BYTES_BIG_ENDIAN
and WORDS_BIG_ENDIAN. Use the subreg machinery to get the lsb
position of each byte. */
rtx_mode_t value (x, smode);
wide_int_ref value_wi (value);
for (unsigned int byte = first_byte; byte < end_byte; ++byte)
{
/* Always constant because the inputs are. */
unsigned int lsb
= subreg_size_lsb (1, mode_bytes, byte).to_constant ();
/* Operate directly on the encoding rather than using
wi::extract_uhwi, so that we preserve the sign or zero
extension for modes that are not a whole number of bits in
size. (Zero extension is only used for the combination of
innermode == BImode && STORE_FLAG_VALUE == 1). */
unsigned int elt = lsb / HOST_BITS_PER_WIDE_INT;
unsigned int shift = lsb % HOST_BITS_PER_WIDE_INT;
unsigned HOST_WIDE_INT uhwi = value_wi.elt (elt);
bytes.quick_push (uhwi >> shift);
}
return true;
}
else
if (CONST_DOUBLE_P (x))
{
num_elem = 1;
elem_bitsize = max_bitsize;
/* real_to_target produces an array of integers in target memory order.
All integers before the last one have 32 bits; the last one may
have 32 bits or fewer, depending on whether the mode bitsize
is divisible by 32. Each of these integers is then laid out
in target memory as any other integer would be. */
long el32[MAX_BITSIZE_MODE_ANY_MODE / 32];
real_to_target (el32, CONST_DOUBLE_REAL_VALUE (x), smode);
/* The (maximum) number of target bytes per element of el32. */
unsigned int bytes_per_el32 = 32 / BITS_PER_UNIT;
gcc_assert (bytes_per_el32 != 0);
/* Build up the integers in a similar way to the CONST_SCALAR_INT_P
handling above. */
for (unsigned int byte = first_byte; byte < end_byte; ++byte)
{
unsigned int index = byte / bytes_per_el32;
unsigned int subbyte = byte % bytes_per_el32;
unsigned int int_bytes = MIN (bytes_per_el32,
mode_bytes - index * bytes_per_el32);
/* Always constant because the inputs are. */
unsigned int lsb
= subreg_size_lsb (1, int_bytes, subbyte).to_constant ();
bytes.quick_push ((unsigned long) el32[index] >> lsb);
}
return true;
}
/* If this asserts, it is too complicated; reducing value_bit may help. */
gcc_assert (BITS_PER_UNIT % value_bit == 0);
/* I don't know how to handle endianness of sub-units. */
gcc_assert (elem_bitsize % BITS_PER_UNIT == 0);
for (elem = 0; elem < num_elem; elem++)
if (GET_CODE (x) == CONST_FIXED)
{
unsigned char * vp;
rtx el = (GET_CODE (op) == CONST_VECTOR
? CONST_VECTOR_ELT (op, first_elem + elem)
: op);
for (unsigned int byte = first_byte; byte < end_byte; ++byte)
{
/* Always constant because the inputs are. */
unsigned int lsb
= subreg_size_lsb (1, mode_bytes, byte).to_constant ();
unsigned HOST_WIDE_INT piece = CONST_FIXED_VALUE_LOW (x);
if (lsb >= HOST_BITS_PER_WIDE_INT)
{
lsb -= HOST_BITS_PER_WIDE_INT;
piece = CONST_FIXED_VALUE_HIGH (x);
}
bytes.quick_push (piece >> lsb);
}
return true;
}
/* Vectors are kept in target memory order. (This is probably
a mistake.) */
{
unsigned byte = (elem * elem_bitsize) / BITS_PER_UNIT;
unsigned ibyte = (((num_elem - 1 - elem) * elem_bitsize)
/ BITS_PER_UNIT);
unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte;
unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte;
unsigned bytele = (subword_byte % UNITS_PER_WORD
+ (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD);
vp = value + (bytele * BITS_PER_UNIT) / value_bit;
}
return false;
}
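
The MODE_VECTOR_BOOL case of native_encode_rtx above packs several sub-byte
elements into each target byte, with element 0 in the least significant bit
of byte 0.  A standalone sketch of that packing for 1-bit elements (the real
code also copes with 2- and 4-bit elements and pushes into a
vec<target_unit>):

#include <cassert>
#include <cstdint>
#include <vector>

/* Standalone sketch: pack 1-bit boolean vector elements into bytes,
   lsb first within each byte.  */

static std::vector<uint8_t>
pack_bool_vector (const std::vector<int> &elts)
{
  std::vector<uint8_t> bytes;
  for (size_t i = 0; i < elts.size (); i += 8)
    {
      uint8_t value = 0;
      for (unsigned int j = 0; j < 8 && i + j < elts.size (); ++j)
	value |= (elts[i + j] & 1) << j;
      bytes.push_back (value);
    }
  return bytes;
}

int
main ()
{
  /* { 1, 0, 1, 1, 0, 0, 0, 0 } packs to 0b00001101 == 0x0d.  */
  std::vector<int> elts = { 1, 0, 1, 1, 0, 0, 0, 0 };
  assert (pack_bool_vector (elts).size () == 1);
  assert (pack_bool_vector (elts)[0] == 0x0d);
  return 0;
}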
switch (GET_CODE (el))
{
case CONST_INT:
for (i = 0;
i < HOST_BITS_PER_WIDE_INT && i < elem_bitsize;
i += value_bit)
*vp++ = INTVAL (el) >> i;
/* CONST_INTs are always logically sign-extended. */
for (; i < elem_bitsize; i += value_bit)
*vp++ = INTVAL (el) < 0 ? -1 : 0;
break;
/* Read a vector of mode MODE from the target memory image given by BYTES,
starting at byte FIRST_BYTE. The vector is known to be encodable using
NPATTERNS interleaved patterns with NELTS_PER_PATTERN elements each,
and BYTES is known to have enough bytes to supply NPATTERNS *
NELTS_PER_PATTERN vector elements. Each element of BYTES contains
BITS_PER_UNIT bits and the bytes are in target memory order.
case CONST_WIDE_INT:
{
rtx_mode_t val = rtx_mode_t (el, GET_MODE_INNER (innermode));
unsigned char extend = wi::sign_mask (val);
int prec = wi::get_precision (val);
for (i = 0; i < prec && i < elem_bitsize; i += value_bit)
*vp++ = wi::extract_uhwi (val, i, value_bit);
for (; i < elem_bitsize; i += value_bit)
*vp++ = extend;
}
break;
Return the vector on success, otherwise return NULL_RTX. */
case CONST_DOUBLE:
if (TARGET_SUPPORTS_WIDE_INT == 0 && GET_MODE (el) == VOIDmode)
{
unsigned char extend = 0;
/* If this triggers, someone should have generated a
CONST_INT instead. */
gcc_assert (elem_bitsize > HOST_BITS_PER_WIDE_INT);
for (i = 0; i < HOST_BITS_PER_WIDE_INT; i += value_bit)
*vp++ = CONST_DOUBLE_LOW (el) >> i;
while (i < HOST_BITS_PER_DOUBLE_INT && i < elem_bitsize)
{
*vp++
= CONST_DOUBLE_HIGH (el) >> (i - HOST_BITS_PER_WIDE_INT);
i += value_bit;
}
rtx
native_decode_vector_rtx (machine_mode mode, vec<target_unit> bytes,
unsigned int first_byte, unsigned int npatterns,
unsigned int nelts_per_pattern)
{
rtx_vector_builder builder (mode, npatterns, nelts_per_pattern);
if (CONST_DOUBLE_HIGH (el) >> (HOST_BITS_PER_WIDE_INT - 1))
extend = -1;
for (; i < elem_bitsize; i += value_bit)
*vp++ = extend;
}
else
{
/* This is big enough for anything on the platform. */
long tmp[MAX_BITSIZE_MODE_ANY_MODE / 32];
scalar_float_mode el_mode;
unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode),
GET_MODE_NUNITS (mode));
if (elt_bits < BITS_PER_UNIT)
{
/* This is the only case in which elements can be smaller than a byte.
Element 0 is always in the lsb of the containing byte. */
gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
{
unsigned int bit_index = first_byte * BITS_PER_UNIT + i * elt_bits;
unsigned int byte_index = bit_index / BITS_PER_UNIT;
unsigned int lsb = bit_index % BITS_PER_UNIT;
builder.quick_push (bytes[byte_index] & (1 << lsb)
? CONST1_RTX (BImode)
: CONST0_RTX (BImode));
}
}
else
{
for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
{
rtx x = native_decode_rtx (GET_MODE_INNER (mode), bytes, first_byte);
if (!x)
return NULL_RTX;
builder.quick_push (x);
first_byte += elt_bits / BITS_PER_UNIT;
}
}
return builder.build ();
}
el_mode = as_a <scalar_float_mode> (GET_MODE (el));
int bitsize = GET_MODE_BITSIZE (el_mode);
/* Read an rtx of mode MODE from the target memory image given by BYTES,
starting at byte FIRST_BYTE. Each element of BYTES contains BITS_PER_UNIT
bits and the bytes are in target memory order. The image has enough
values to specify all bytes of MODE.
gcc_assert (bitsize <= elem_bitsize);
gcc_assert (bitsize % value_bit == 0);
Return the rtx on success, otherwise return NULL_RTX. */
real_to_target (tmp, CONST_DOUBLE_REAL_VALUE (el),
GET_MODE (el));
rtx
native_decode_rtx (machine_mode mode, vec<target_unit> bytes,
unsigned int first_byte)
{
if (VECTOR_MODE_P (mode))
{
/* If we know at compile time how many elements there are,
pull each element directly from BYTES. */
unsigned int nelts;
if (GET_MODE_NUNITS (mode).is_constant (&nelts))
return native_decode_vector_rtx (mode, bytes, first_byte, nelts, 1);
return NULL_RTX;
}
/* real_to_target produces its result in words affected by
FLOAT_WORDS_BIG_ENDIAN. However, we ignore this,
and use WORDS_BIG_ENDIAN instead; see the documentation
of SUBREG in rtl.texi. */
for (i = 0; i < bitsize; i += value_bit)
{
int ibase;
if (WORDS_BIG_ENDIAN)
ibase = bitsize - 1 - i;
else
ibase = i;
*vp++ = tmp[ibase / 32] >> i % 32;
}
scalar_int_mode imode;
if (is_a <scalar_int_mode> (mode, &imode)
&& GET_MODE_PRECISION (imode) <= MAX_BITSIZE_MODE_ANY_INT)
{
/* Pull the bytes msb first, so that we can use simple
shift-and-insert wide_int operations. */
unsigned int size = GET_MODE_SIZE (imode);
wide_int result (wi::zero (GET_MODE_PRECISION (imode)));
for (unsigned int i = 0; i < size; ++i)
{
unsigned int lsb = (size - i - 1) * BITS_PER_UNIT;
/* Always constant because the inputs are. */
unsigned int subbyte
= subreg_size_offset_from_lsb (1, size, lsb).to_constant ();
result <<= BITS_PER_UNIT;
result |= bytes[first_byte + subbyte];
}
return immed_wide_int_const (result, imode);
}
/* It shouldn't matter what's done here, so fill it with
zero. */
for (; i < elem_bitsize; i += value_bit)
*vp++ = 0;
}
break;
scalar_float_mode fmode;
if (is_a <scalar_float_mode> (mode, &fmode))
{
/* We need to build an array of integers in target memory order.
All integers before the last one have 32 bits; the last one may
have 32 bits or fewer, depending on whether the mode bitsize
is divisible by 32. */
long el32[MAX_BITSIZE_MODE_ANY_MODE / 32];
unsigned int num_el32 = CEIL (GET_MODE_BITSIZE (fmode), 32);
memset (el32, 0, num_el32 * sizeof (long));
/* The (maximum) number of target bytes per element of el32. */
unsigned int bytes_per_el32 = 32 / BITS_PER_UNIT;
gcc_assert (bytes_per_el32 != 0);
unsigned int mode_bytes = GET_MODE_SIZE (fmode);
for (unsigned int byte = 0; byte < mode_bytes; ++byte)
{
unsigned int index = byte / bytes_per_el32;
unsigned int subbyte = byte % bytes_per_el32;
unsigned int int_bytes = MIN (bytes_per_el32,
mode_bytes - index * bytes_per_el32);
/* Always constant because the inputs are. */
unsigned int lsb
= subreg_size_lsb (1, int_bytes, subbyte).to_constant ();
el32[index] |= (unsigned long) bytes[first_byte + byte] << lsb;
}
REAL_VALUE_TYPE r;
real_from_target (&r, el32, fmode);
return const_double_from_real_value (r, fmode);
}
case CONST_FIXED:
if (elem_bitsize <= HOST_BITS_PER_WIDE_INT)
{
for (i = 0; i < elem_bitsize; i += value_bit)
*vp++ = CONST_FIXED_VALUE_LOW (el) >> i;
}
if (ALL_SCALAR_FIXED_POINT_MODE_P (mode))
{
scalar_mode smode = as_a <scalar_mode> (mode);
FIXED_VALUE_TYPE f;
f.data.low = 0;
f.data.high = 0;
f.mode = smode;
unsigned int mode_bytes = GET_MODE_SIZE (smode);
for (unsigned int byte = 0; byte < mode_bytes; ++byte)
{
/* Always constant because the inputs are. */
unsigned int lsb
= subreg_size_lsb (1, mode_bytes, byte).to_constant ();
unsigned HOST_WIDE_INT unit = bytes[first_byte + byte];
if (lsb >= HOST_BITS_PER_WIDE_INT)
f.data.high |= unit << (lsb - HOST_BITS_PER_WIDE_INT);
else
{
for (i = 0; i < HOST_BITS_PER_WIDE_INT; i += value_bit)
*vp++ = CONST_FIXED_VALUE_LOW (el) >> i;
for (; i < HOST_BITS_PER_DOUBLE_INT && i < elem_bitsize;
i += value_bit)
*vp++ = CONST_FIXED_VALUE_HIGH (el)
>> (i - HOST_BITS_PER_WIDE_INT);
for (; i < elem_bitsize; i += value_bit)
*vp++ = 0;
}
break;
default:
gcc_unreachable ();
f.data.low |= unit << lsb;
}
return CONST_FIXED_FROM_FIXED_VALUE (f, mode);
}
/* Now, pick the right byte to start with. */
/* Renumber BYTE so that the least-significant byte is byte 0. A special
case is paradoxical SUBREGs, which shouldn't be adjusted since they
will already have offset 0. */
if (inner_bytes >= GET_MODE_SIZE (outermode))
return NULL_RTX;
}
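
For scalar integers, native_decode_rtx visits the bytes most-significant
first so that a simple shift-and-insert loop rebuilds the value.  A
standalone sketch of the same idea for a 32-bit little-endian image (the
real code uses wide_int and subreg_size_offset_from_lsb so that any byte
order and precision work):

#include <cassert>
#include <cstdint>
#include <vector>

/* Standalone sketch: rebuild a 32-bit value from a little-endian byte
   image by visiting the bytes msb-first and shifting the accumulator
   left each time.  */

static uint32_t
decode_le32 (const std::vector<uint8_t> &bytes, unsigned int first_byte)
{
  uint32_t result = 0;
  for (unsigned int i = 0; i < 4; ++i)
    {
      /* On little-endian the byte holding the most significant remaining
	 bits is the one with the highest index.  */
      unsigned int subbyte = 3 - i;
      result = (result << 8) | bytes[first_byte + subbyte];
    }
  return result;
}

int
main ()
{
  std::vector<uint8_t> bytes = { 0x78, 0x56, 0x34, 0x12 };
  assert (decode_le32 (bytes, 0) == 0x12345678);
  return 0;
}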
/* Simplify a byte offset BYTE into CONST_VECTOR X. The main purpose
is to convert a runtime BYTE value into a constant one. */
static poly_uint64
simplify_const_vector_byte_offset (rtx x, poly_uint64 byte)
{
/* Cope with MODE_VECTOR_BOOL by operating on bits rather than bytes. */
machine_mode mode = GET_MODE (x);
unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode),
GET_MODE_NUNITS (mode));
/* The number of bits needed to encode one element from each pattern. */
unsigned int sequence_bits = CONST_VECTOR_NPATTERNS (x) * elt_bits;
/* Identify the start point in terms of a sequence number and a byte offset
within that sequence. */
poly_uint64 first_sequence;
unsigned HOST_WIDE_INT subbit;
if (can_div_trunc_p (byte * BITS_PER_UNIT, sequence_bits,
&first_sequence, &subbit))
{
unsigned ibyte = inner_bytes - GET_MODE_SIZE (outermode) - byte;
unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte;
unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte;
byte = (subword_byte % UNITS_PER_WORD
+ (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD);
unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
if (nelts_per_pattern == 1)
/* This is a duplicated vector, so the value of FIRST_SEQUENCE
doesn't matter. */
byte = subbit / BITS_PER_UNIT;
else if (nelts_per_pattern == 2 && known_gt (first_sequence, 0U))
{
/* The subreg drops the first element from each pattern and
only uses the second element. Find the first sequence
that starts on a byte boundary. */
subbit += least_common_multiple (sequence_bits, BITS_PER_UNIT);
byte = subbit / BITS_PER_UNIT;
}
}
return byte;
}
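
For a duplicated vector (one element per pattern) the constant value
repeats every CONST_VECTOR_NPATTERNS (x) elements, so a runtime byte
offset can simply be reduced modulo the byte size of one repeating
sequence.  A standalone sketch with hypothetical numbers:

#include <cassert>

int
main ()
{
  /* Standalone sketch: a duplicated vector with 2 patterns of 4-byte
     elements repeats every 8 bytes, so a runtime offset of N * 8 + 4
     selects the same bytes as the constant offset 4.  */
  unsigned int sequence_bytes = 2 * 4;
  unsigned int runtime_byte = 3 * sequence_bytes + 4;
  unsigned int reduced_byte = runtime_byte % sequence_bytes;
  assert (reduced_byte == 4);
  return 0;
}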
/* Subroutine of simplify_subreg in which:
- X is known to be a CONST_VECTOR
- OUTERMODE is known to be a vector mode
/* BYTE should still be inside OP. (Note that BYTE is unsigned,
so if it's become negative it will instead be very large.) */
gcc_assert (byte < inner_bytes);
Try to handle the subreg by operating on the CONST_VECTOR encoding
rather than on each individual element of the CONST_VECTOR.
/* Convert from bytes to chunks of size value_bit. */
value_start = byte * (BITS_PER_UNIT / value_bit);
Return the simplified subreg on success, otherwise return NULL_RTX. */
static rtx
simplify_const_vector_subreg (machine_mode outermode, rtx x,
machine_mode innermode, unsigned int first_byte)
{
/* Paradoxical subregs of vectors have dubious semantics. */
if (paradoxical_subreg_p (outermode, innermode))
return NULL_RTX;
/* Re-pack the value. */
num_elem = GET_MODE_NUNITS (outermode);
/* We can only preserve the semantics of a stepped pattern if the new
vector element is the same as the original one. */
if (CONST_VECTOR_STEPPED_P (x)
&& GET_MODE_INNER (outermode) != GET_MODE_INNER (innermode))
return NULL_RTX;
if (VECTOR_MODE_P (outermode))
/* Cope with MODE_VECTOR_BOOL by operating on bits rather than bytes. */
unsigned int x_elt_bits
= vector_element_size (GET_MODE_BITSIZE (innermode),
GET_MODE_NUNITS (innermode));
unsigned int out_elt_bits
= vector_element_size (GET_MODE_BITSIZE (outermode),
GET_MODE_NUNITS (outermode));
/* The number of bits needed to encode one element from every pattern
of the original vector. */
unsigned int x_sequence_bits = CONST_VECTOR_NPATTERNS (x) * x_elt_bits;
/* The number of bits needed to encode one element from every pattern
of the result. */
unsigned int out_sequence_bits
= least_common_multiple (x_sequence_bits, out_elt_bits);
/* Work out the number of interleaved patterns in the output vector
and the number of encoded elements per pattern. */
unsigned int out_npatterns = out_sequence_bits / out_elt_bits;
unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
/* The encoding scheme requires the number of elements to be a multiple
of the number of patterns, so that each pattern appears at least once
and so that the same number of elements appear from each pattern. */
bool ok_p = multiple_p (GET_MODE_NUNITS (outermode), out_npatterns);
unsigned int const_nunits;
if (GET_MODE_NUNITS (outermode).is_constant (&const_nunits)
&& (!ok_p || out_npatterns * nelts_per_pattern > const_nunits))
{
result_v = rtvec_alloc (num_elem);
elems = &RTVEC_ELT (result_v, 0);
/* Either the encoding is invalid, or applying it would give us
more elements than we need. Just encode each element directly. */
out_npatterns = const_nunits;
nelts_per_pattern = 1;
}
else
elems = &result_s;
else if (!ok_p)
return NULL_RTX;
outer_submode = GET_MODE_INNER (outermode);
outer_class = GET_MODE_CLASS (outer_submode);
elem_bitsize = GET_MODE_BITSIZE (outer_submode);
/* Get enough bytes of X to form the new encoding. */
unsigned int buffer_bits = out_npatterns * nelts_per_pattern * out_elt_bits;
unsigned int buffer_bytes = CEIL (buffer_bits, BITS_PER_UNIT);
auto_vec<target_unit, 128> buffer (buffer_bytes);
if (!native_encode_rtx (innermode, x, buffer, first_byte, buffer_bytes))
return NULL_RTX;
gcc_assert (elem_bitsize % value_bit == 0);
gcc_assert (elem_bitsize + value_start * value_bit <= max_bitsize);
/* Reencode the bytes as OUTERMODE. */
return native_decode_vector_rtx (outermode, buffer, 0, out_npatterns,
nelts_per_pattern);
}
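
The number of patterns in the re-encoded vector comes from the least
common multiple of the two sequence sizes.  A standalone sketch of that
arithmetic with hypothetical element sizes (the real code works in bits so
that sub-byte MODE_VECTOR_BOOL elements are handled too):

#include <cassert>
#include <numeric>

int
main ()
{
  /* Standalone sketch: the original vector has 2 patterns of 16-bit
     elements, so one element from every pattern occupies 32 bits.  */
  unsigned int x_npatterns = 2, x_elt_bits = 16;
  unsigned int x_sequence_bits = x_npatterns * x_elt_bits;

  /* Re-encoding as a vector of 8-bit elements: one element from every
     result pattern must cover a whole number of input sequences, so the
     result sequence is lcm (32, 8) = 32 bits, i.e. 4 patterns.  */
  unsigned int out_elt_bits = 8;
  unsigned int out_sequence_bits = std::lcm (x_sequence_bits, out_elt_bits);
  unsigned int out_npatterns = out_sequence_bits / out_elt_bits;
  assert (out_npatterns == 4);
  return 0;
}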
for (elem = 0; elem < num_elem; elem++)
{
unsigned char *vp;
/* Try to simplify a subreg of a constant by encoding the subreg region
as a sequence of target bytes and reading them back in the new mode.
Return the new value on success, otherwise return null.
/* Vectors are stored in target memory order. (This is probably
a mistake.) */
{
unsigned byte = (elem * elem_bitsize) / BITS_PER_UNIT;
unsigned ibyte = (((num_elem - 1 - elem) * elem_bitsize)
/ BITS_PER_UNIT);
unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte;
unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte;
unsigned bytele = (subword_byte % UNITS_PER_WORD
+ (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD);
vp = value + value_start + (bytele * BITS_PER_UNIT) / value_bit;
}
The subreg has outer mode OUTERMODE, inner mode INNERMODE, inner value X
and byte offset FIRST_BYTE. */
switch (outer_class)
{
case MODE_INT:
case MODE_PARTIAL_INT:
{
int u;
int base = 0;
int units
= (GET_MODE_BITSIZE (outer_submode) + HOST_BITS_PER_WIDE_INT - 1)
/ HOST_BITS_PER_WIDE_INT;
HOST_WIDE_INT tmp[MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_WIDE_INT];
wide_int r;
if (GET_MODE_PRECISION (outer_submode) > MAX_BITSIZE_MODE_ANY_INT)
return NULL_RTX;
for (u = 0; u < units; u++)
{
unsigned HOST_WIDE_INT buf = 0;
for (i = 0;
i < HOST_BITS_PER_WIDE_INT && base + i < elem_bitsize;
i += value_bit)
buf |= (unsigned HOST_WIDE_INT)(*vp++ & value_mask) << i;
tmp[u] = buf;
base += HOST_BITS_PER_WIDE_INT;
}
r = wide_int::from_array (tmp, units,
GET_MODE_PRECISION (outer_submode));
#if TARGET_SUPPORTS_WIDE_INT == 0
/* Make sure r will fit into CONST_INT or CONST_DOUBLE. */
if (wi::min_precision (r, SIGNED) > HOST_BITS_PER_DOUBLE_INT)
return NULL_RTX;
#endif
elems[elem] = immed_wide_int_const (r, outer_submode);
}
break;
static rtx
simplify_immed_subreg (fixed_size_mode outermode, rtx x,
machine_mode innermode, unsigned int first_byte)
{
unsigned int buffer_bytes = GET_MODE_SIZE (outermode);
auto_vec<target_unit, 128> buffer (buffer_bytes);
case MODE_FLOAT:
case MODE_DECIMAL_FLOAT:
{
REAL_VALUE_TYPE r;
long tmp[MAX_BITSIZE_MODE_ANY_MODE / 32] = { 0 };
/* real_from_target wants its input in words affected by
FLOAT_WORDS_BIG_ENDIAN. However, we ignore this,
and use WORDS_BIG_ENDIAN instead; see the documentation
of SUBREG in rtl.texi. */
for (i = 0; i < elem_bitsize; i += value_bit)
{
int ibase;
if (WORDS_BIG_ENDIAN)
ibase = elem_bitsize - 1 - i;
else
ibase = i;
tmp[ibase / 32] |= (*vp++ & value_mask) << i % 32;
}
/* Some ports misuse CCmode. */
if (GET_MODE_CLASS (outermode) == MODE_CC && CONST_INT_P (x))
return x;
real_from_target (&r, tmp, outer_submode);
elems[elem] = const_double_from_real_value (r, outer_submode);
}
break;
/* Paradoxical subregs read undefined values for bytes outside of the
inner value. However, we have traditionally always sign-extended
integer constants and zero-extended others. */
unsigned int inner_bytes = buffer_bytes;
if (paradoxical_subreg_p (outermode, innermode))
{
if (!GET_MODE_SIZE (innermode).is_constant (&inner_bytes))
return NULL_RTX;
case MODE_FRACT:
case MODE_UFRACT:
case MODE_ACCUM:
case MODE_UACCUM:
{
FIXED_VALUE_TYPE f;
f.data.low = 0;
f.data.high = 0;
f.mode = outer_submode;
for (i = 0;
i < HOST_BITS_PER_WIDE_INT && i < elem_bitsize;
i += value_bit)
f.data.low |= (unsigned HOST_WIDE_INT)(*vp++ & value_mask) << i;
for (; i < elem_bitsize; i += value_bit)
f.data.high |= ((unsigned HOST_WIDE_INT)(*vp++ & value_mask)
<< (i - HOST_BITS_PER_WIDE_INT));
elems[elem] = CONST_FIXED_FROM_FIXED_VALUE (f, outer_submode);
}
break;
target_unit filler = 0;
if (CONST_SCALAR_INT_P (x) && wi::neg_p (rtx_mode_t (x, innermode)))
filler = -1;
default:
gcc_unreachable ();
}
/* Add any leading bytes due to big-endian layout. The number of
bytes must be constant because both modes have constant size. */
unsigned int leading_bytes
= -byte_lowpart_offset (outermode, innermode).to_constant ();
for (unsigned int i = 0; i < leading_bytes; ++i)
buffer.quick_push (filler);
if (!native_encode_rtx (innermode, x, buffer, first_byte, inner_bytes))
return NULL_RTX;
/* Add any trailing bytes due to little-endian layout. */
while (buffer.length () < buffer_bytes)
buffer.quick_push (filler);
}
if (VECTOR_MODE_P (outermode))
return gen_rtx_CONST_VECTOR (outermode, result_v);
else
return result_s;
{
if (!native_encode_rtx (innermode, x, buffer, first_byte, inner_bytes))
return NULL_RTX;
}
return native_decode_rtx (outermode, buffer, 0);
}
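
A standalone sketch of the filler behaviour for a paradoxical subreg on a
little-endian target (hypothetical values; the real code uses
byte_lowpart_offset to place the filler bytes correctly for big-endian
layouts as well):

#include <cassert>
#include <cstdint>
#include <vector>

int
main ()
{
  /* Standalone sketch: widen the 2-byte little-endian image of a negative
     16-bit constant to 4 bytes, sign-extending with 0xff filler bytes as
     the traditional constant-subreg folding did.  */
  int16_t inner = -2;
  uint16_t bits = (uint16_t) inner;		/* 0xfffe */
  uint8_t filler = inner < 0 ? 0xff : 0x00;

  std::vector<uint8_t> buffer;
  buffer.push_back (bits & 0xff);		/* 0xfe */
  buffer.push_back (bits >> 8);			/* 0xff */
  while (buffer.size () < 4)
    buffer.push_back (filler);			/* trailing filler bytes */

  uint32_t outer = 0;
  for (unsigned int i = 0; i < 4; ++i)
    outer |= uint32_t (buffer[i]) << (i * 8);
  assert (outer == 0xfffffffe);			/* sign-extended value */
  return 0;
}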
/* Simplify SUBREG:OUTERMODE(OP:INNERMODE, BYTE)
@@ -6494,6 +6618,9 @@ simplify_subreg (machine_mode outermode, rtx op,
if (outermode == innermode && known_eq (byte, 0U))
return op;
if (GET_CODE (op) == CONST_VECTOR)
byte = simplify_const_vector_byte_offset (op, byte);
if (multiple_p (byte, GET_MODE_UNIT_SIZE (innermode)))
{
rtx elt;
@@ -6513,30 +6640,21 @@ simplify_subreg (machine_mode outermode, rtx op,
|| CONST_FIXED_P (op)
|| GET_CODE (op) == CONST_VECTOR)
{
/* simplify_immed_subreg deconstructs OP into bytes and constructs
the result from bytes, so it only works if the sizes of the modes
and the value of the offset are known at compile time. Cases that
apply to general modes and offsets should be handled here
before calling simplify_immed_subreg. */
fixed_size_mode fs_outermode, fs_innermode;
unsigned HOST_WIDE_INT cbyte;
if (is_a <fixed_size_mode> (outermode, &fs_outermode)
&& is_a <fixed_size_mode> (innermode, &fs_innermode)
&& byte.is_constant (&cbyte))
return simplify_immed_subreg (fs_outermode, op, fs_innermode, cbyte,
0, GET_MODE_SIZE (fs_innermode));
/* Handle constant-sized outer modes and variable-sized inner modes. */
unsigned HOST_WIDE_INT first_elem;
if (GET_CODE (op) == CONST_VECTOR
&& is_a <fixed_size_mode> (outermode, &fs_outermode)
&& constant_multiple_p (byte, GET_MODE_UNIT_SIZE (innermode),
&first_elem))
return simplify_immed_subreg (fs_outermode, op, innermode, 0,
first_elem,
GET_MODE_SIZE (fs_outermode));
if (byte.is_constant (&cbyte))
{
if (GET_CODE (op) == CONST_VECTOR && VECTOR_MODE_P (outermode))
{
rtx tmp = simplify_const_vector_subreg (outermode, op,
innermode, cbyte);
if (tmp)
return tmp;
}
return NULL_RTX;
fixed_size_mode fs_outermode;
if (is_a <fixed_size_mode> (outermode, &fs_outermode))
return simplify_immed_subreg (fs_outermode, op, innermode, cbyte);
}
}
/* Changing mode twice with SUBREG => just change it once,
@@ -7179,6 +7297,165 @@ test_vec_merge (machine_mode mode)
simplify_rtx (nvm));
}
/* Test subregs of integer vector constant X, trying elements in
the range [ELT_BIAS, ELT_BIAS + constant_lower_bound (NELTS)),
where NELTS is the number of elements in X. Subregs involving
elements [ELT_BIAS, ELT_BIAS + FIRST_VALID) are expected to fail. */
static void
test_vector_subregs_modes (rtx x, poly_uint64 elt_bias = 0,
unsigned int first_valid = 0)
{
machine_mode inner_mode = GET_MODE (x);
scalar_mode int_mode = GET_MODE_INNER (inner_mode);
for (unsigned int modei = 0; modei < NUM_MACHINE_MODES; ++modei)
{
machine_mode outer_mode = (machine_mode) modei;
if (!VECTOR_MODE_P (outer_mode))
continue;
unsigned int outer_nunits;
if (GET_MODE_INNER (outer_mode) == int_mode
&& GET_MODE_NUNITS (outer_mode).is_constant (&outer_nunits)
&& multiple_p (GET_MODE_NUNITS (inner_mode), outer_nunits))
{
/* Test subregs in which the outer mode is a smaller,
constant-sized vector of the same element type. */
unsigned int limit
= constant_lower_bound (GET_MODE_NUNITS (inner_mode));
for (unsigned int elt = 0; elt < limit; elt += outer_nunits)
{
rtx expected = NULL_RTX;
if (elt >= first_valid)
{
rtx_vector_builder builder (outer_mode, outer_nunits, 1);
for (unsigned int i = 0; i < outer_nunits; ++i)
builder.quick_push (CONST_VECTOR_ELT (x, elt + i));
expected = builder.build ();
}
poly_uint64 byte = (elt_bias + elt) * GET_MODE_SIZE (int_mode);
ASSERT_RTX_EQ (expected,
simplify_subreg (outer_mode, x,
inner_mode, byte));
}
}
else if (known_eq (GET_MODE_SIZE (outer_mode),
GET_MODE_SIZE (inner_mode))
&& known_eq (elt_bias, 0U)
&& (GET_MODE_CLASS (outer_mode) != MODE_VECTOR_BOOL
|| known_eq (GET_MODE_BITSIZE (outer_mode),
GET_MODE_NUNITS (outer_mode)))
&& (!FLOAT_MODE_P (outer_mode)
|| (FLOAT_MODE_FORMAT (outer_mode)->ieee_bits
== GET_MODE_UNIT_PRECISION (outer_mode)))
&& (GET_MODE_SIZE (inner_mode).is_constant ()
|| !CONST_VECTOR_STEPPED_P (x)))
{
/* Try converting to OUTER_MODE and back. */
rtx outer_x = simplify_subreg (outer_mode, x, inner_mode, 0);
ASSERT_TRUE (outer_x != NULL_RTX);
ASSERT_RTX_EQ (x, simplify_subreg (inner_mode, outer_x,
outer_mode, 0));
}
}
if (BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN)
{
/* Test each byte in the element range. */
unsigned int limit
= constant_lower_bound (GET_MODE_SIZE (inner_mode));
for (unsigned int i = 0; i < limit; ++i)
{
unsigned int elt = i / GET_MODE_SIZE (int_mode);
rtx expected = NULL_RTX;
if (elt >= first_valid)
{
unsigned int byte_shift = i % GET_MODE_SIZE (int_mode);
if (BYTES_BIG_ENDIAN)
byte_shift = GET_MODE_SIZE (int_mode) - byte_shift - 1;
rtx_mode_t vec_elt (CONST_VECTOR_ELT (x, elt), int_mode);
wide_int shifted_elt
= wi::lrshift (vec_elt, byte_shift * BITS_PER_UNIT);
expected = immed_wide_int_const (shifted_elt, QImode);
}
poly_uint64 byte = elt_bias * GET_MODE_SIZE (int_mode) + i;
ASSERT_RTX_EQ (expected,
simplify_subreg (QImode, x, inner_mode, byte));
}
}
}
/* Test constant subregs of integer vector mode INNER_MODE, using 1
element per pattern. */
static void
test_vector_subregs_repeating (machine_mode inner_mode)
{
poly_uint64 nunits = GET_MODE_NUNITS (inner_mode);
unsigned int min_nunits = constant_lower_bound (nunits);
scalar_mode int_mode = GET_MODE_INNER (inner_mode);
unsigned int count = gcd (min_nunits, 8);
rtx_vector_builder builder (inner_mode, count, 1);
for (unsigned int i = 0; i < count; ++i)
builder.quick_push (gen_int_mode (8 - i, int_mode));
rtx x = builder.build ();
test_vector_subregs_modes (x);
if (!nunits.is_constant ())
test_vector_subregs_modes (x, nunits - min_nunits);
}
/* Test constant subregs of integer vector mode INNER_MODE, using 2
elements per pattern. */
static void
test_vector_subregs_fore_back (machine_mode inner_mode)
{
poly_uint64 nunits = GET_MODE_NUNITS (inner_mode);
unsigned int min_nunits = constant_lower_bound (nunits);
scalar_mode int_mode = GET_MODE_INNER (inner_mode);
unsigned int count = gcd (min_nunits, 4);
rtx_vector_builder builder (inner_mode, count, 2);
for (unsigned int i = 0; i < count; ++i)
builder.quick_push (gen_int_mode (i, int_mode));
for (unsigned int i = 0; i < count; ++i)
builder.quick_push (gen_int_mode (-(int) i, int_mode));
rtx x = builder.build ();
test_vector_subregs_modes (x);
if (!nunits.is_constant ())
test_vector_subregs_modes (x, nunits - min_nunits, count);
}
/* Test constant subregs of integer vector mode INNER_MODE, using 3
elements per pattern. */
static void
test_vector_subregs_stepped (machine_mode inner_mode)
{
/* Build { 0, 1, 2, 3, ... }. */
scalar_mode int_mode = GET_MODE_INNER (inner_mode);
rtx_vector_builder builder (inner_mode, 1, 3);
for (unsigned int i = 0; i < 3; ++i)
builder.quick_push (gen_int_mode (i, int_mode));
rtx x = builder.build ();
test_vector_subregs_modes (x);
}
/* Test constant subregs of integer vector mode INNER_MODE. */
static void
test_vector_subregs (machine_mode inner_mode)
{
test_vector_subregs_repeating (inner_mode);
test_vector_subregs_fore_back (inner_mode);
test_vector_subregs_stepped (inner_mode);
}
/* Verify some simplifications involving vectors. */
static void
@@ -7193,7 +7470,10 @@ test_vector_ops ()
test_vector_ops_duplicate (mode, scalar_reg);
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
&& maybe_gt (GET_MODE_NUNITS (mode), 2))
test_vector_ops_series (mode, scalar_reg);
{
test_vector_ops_series (mode, scalar_reg);
test_vector_subregs (mode);
}
test_vec_merge (mode);
}
}
......