diff --git a/0162-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch b/0162-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch
new file mode 100644
index 0000000..f92df2d
--- /dev/null
+++ b/0162-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch
@@ -0,0 +1,792 @@
+From 46310765c05cde8732e07bfb0df9f0ec25a34018 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 5 Dec 2023 10:11:18 +0000
+Subject: [PATCH 063/157] [Backport][SME] aarch64: Use SVE's RDVL instruction
+
+Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=80f47d7bbe38234e1530d27fe5c2f130223ca7a0
+
+We didn't previously use SVE's RDVL instruction, since the CNT*
+forms are preferred and provide most of the range. However,
+there are some cases that RDVL can handle and CNT* can't,
+and using RDVL-like instructions becomes important for SME.
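+
+For example (mirroring the updated cntb.c expectations below): with VL
+denoting the vector length in bytes, a single CNTB computes #imm * VL
+only for #imm in [1, 16], whereas RDVL accepts any #imm in [-32, 31],
+including negative factors:
+
+	cntb	x0, all, mul #16	// x0 = 16 * VL (CNTB's limit)
+	rdvl	x0, #17			// x0 = 17 * VL
+	rdvl	x0, #-1			// x0 = -VL
+	rdvl	x1, #-17
+	lsl	x0, x1, #1		// x0 = -34 * VL via a shifted RDVL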
+
+gcc/
+ * config/aarch64/aarch64-protos.h (aarch64_sve_rdvl_immediate_p)
+ (aarch64_output_sve_rdvl): Declare.
+ * config/aarch64/aarch64.cc (aarch64_sve_cnt_factor_p): New
+ function, split out from...
+ (aarch64_sve_cnt_immediate_p): ...here.
+ (aarch64_sve_rdvl_factor_p): New function.
+ (aarch64_sve_rdvl_immediate_p): Likewise.
+ (aarch64_output_sve_rdvl): Likewise.
+ (aarch64_offset_temporaries): Rewrite the SVE handling to use RDVL
+ for some cases.
+ (aarch64_expand_mov_immediate): Handle RDVL immediates.
+ (aarch64_mov_operand_p): Likewise.
+ * config/aarch64/constraints.md (Usr): New constraint.
+ * config/aarch64/aarch64.md (*mov<SHORT:mode>_aarch64): Add an RDVL
+ alternative.
+ (*movsi_aarch64, *movdi_aarch64): Likewise.
+
+gcc/testsuite/
+ * gcc.target/aarch64/sve/acle/asm/cntb.c: Tweak expected output.
+ * gcc.target/aarch64/sve/acle/asm/cnth.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/cntw.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/cntd.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/prfb.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/prfh.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/prfw.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/prfd.c: Likewise.
+ * gcc.target/aarch64/sve/loop_add_4.c: Expect RDVL to be used
+ to calculate the -17 and 17 factors.
+ * gcc.target/aarch64/sve/pcs/stack_clash_1.c: Likewise the 18 factor.
+---
+ gcc/config/aarch64/aarch64-protos.h | 2 +
+ gcc/config/aarch64/aarch64.cc | 191 ++++++++++++------
+ gcc/config/aarch64/aarch64.md | 3 +
+ gcc/config/aarch64/constraints.md | 6 +
+ .../gcc.target/aarch64/sve/acle/asm/cntb.c | 71 +++++--
+ .../gcc.target/aarch64/sve/acle/asm/cntd.c | 12 +-
+ .../gcc.target/aarch64/sve/acle/asm/cnth.c | 20 +-
+ .../gcc.target/aarch64/sve/acle/asm/cntw.c | 16 +-
+ .../gcc.target/aarch64/sve/acle/asm/prfb.c | 6 +-
+ .../gcc.target/aarch64/sve/acle/asm/prfd.c | 4 +-
+ .../gcc.target/aarch64/sve/acle/asm/prfh.c | 4 +-
+ .../gcc.target/aarch64/sve/acle/asm/prfw.c | 4 +-
+ .../gcc.target/aarch64/sve/loop_add_4.c | 6 +-
+ .../aarch64/sve/pcs/stack_clash_1.c | 3 +-
+ 14 files changed, 225 insertions(+), 123 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
+index 3ff1a0163..14a568140 100644
+--- a/gcc/config/aarch64/aarch64-protos.h
++++ b/gcc/config/aarch64/aarch64-protos.h
+@@ -802,6 +802,7 @@ bool aarch64_sve_mode_p (machine_mode);
+ HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int);
+ bool aarch64_sve_cnt_immediate_p (rtx);
+ bool aarch64_sve_scalar_inc_dec_immediate_p (rtx);
++bool aarch64_sve_rdvl_immediate_p (rtx);
+ bool aarch64_sve_addvl_addpl_immediate_p (rtx);
+ bool aarch64_sve_vector_inc_dec_immediate_p (rtx);
+ int aarch64_add_offset_temporaries (rtx);
+@@ -814,6 +815,7 @@ char *aarch64_output_sve_prefetch (const char *, rtx, const char *);
+ char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
+ char *aarch64_output_sve_cnt_pat_immediate (const char *, const char *, rtx *);
+ char *aarch64_output_sve_scalar_inc_dec (rtx);
++char *aarch64_output_sve_rdvl (rtx);
+ char *aarch64_output_sve_addvl_addpl (rtx);
+ char *aarch64_output_sve_vector_inc_dec (const char *, rtx);
+ char *aarch64_output_scalar_simd_mov_immediate (rtx, scalar_int_mode);
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index acb659f53..4194dfc70 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -5520,6 +5520,18 @@ aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
+ return -1;
+ }
+
++/* Return true if a single CNT[BHWD] instruction can multiply FACTOR
++ by the number of 128-bit quadwords in an SVE vector. */
++
++static bool
++aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
++{
++ /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
++ return (IN_RANGE (factor, 2, 16 * 16)
++ && (factor & 1) == 0
++ && factor <= 16 * (factor & -factor));
++}
++
+ /* Return true if we can move VALUE into a register using a single
+ CNT[BHWD] instruction. */
+
+@@ -5527,11 +5539,7 @@ static bool
+ aarch64_sve_cnt_immediate_p (poly_int64 value)
+ {
+ HOST_WIDE_INT factor = value.coeffs[0];
+- /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
+- return (value.coeffs[1] == factor
+- && IN_RANGE (factor, 2, 16 * 16)
+- && (factor & 1) == 0
+- && factor <= 16 * (factor & -factor));
++ return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
+ }
+
+ /* Likewise for rtx X. */
+@@ -5647,6 +5655,50 @@ aarch64_output_sve_scalar_inc_dec (rtx offset)
+ -offset_value.coeffs[1], 0);
+ }
+
++/* Return true if a single RDVL instruction can multiply FACTOR by the
++ number of 128-bit quadwords in an SVE vector. */
++
++static bool
++aarch64_sve_rdvl_factor_p (HOST_WIDE_INT factor)
++{
++ return (multiple_p (factor, 16)
++ && IN_RANGE (factor, -32 * 16, 31 * 16));
++}
++
++/* Return true if we can move VALUE into a register using a single
++ RDVL instruction. */
++
++static bool
++aarch64_sve_rdvl_immediate_p (poly_int64 value)
++{
++ HOST_WIDE_INT factor = value.coeffs[0];
++ return value.coeffs[1] == factor && aarch64_sve_rdvl_factor_p (factor);
++}
++
++/* Likewise for rtx X. */
++
++bool
++aarch64_sve_rdvl_immediate_p (rtx x)
++{
++ poly_int64 value;
++ return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
++}
++
++/* Return the asm string for moving RDVL immediate OFFSET into register
++ operand 0. */
++
++char *
++aarch64_output_sve_rdvl (rtx offset)
++{
++ static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
++ poly_int64 offset_value = rtx_to_poly_int64 (offset);
++ gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
++
++ int factor = offset_value.coeffs[1];
++ snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
++ return buffer;
++}
++
+ /* Return true if we can add VALUE to a register using a single ADDVL
+ or ADDPL instruction. */
+
+@@ -6227,13 +6279,13 @@ aarch64_offset_temporaries (bool add_p, poly_int64 offset)
+ count += 1;
+ else if (factor != 0)
+ {
+- factor = abs (factor);
+- if (factor > 16 * (factor & -factor))
+- /* Need one register for the CNT result and one for the multiplication
+- factor. If necessary, the second temporary can be reused for the
+- constant part of the offset. */
++ factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
++ if (!IN_RANGE (factor, -32, 31))
++ /* Need one register for the CNT or RDVL result and one for the
++ multiplication factor. If necessary, the second temporary
++ can be reused for the constant part of the offset. */
+ return 2;
+- /* Need one register for the CNT result (which might then
++ /* Need one register for the CNT or RDVL result (which might then
+ be shifted). */
+ count += 1;
+ }
+@@ -6322,85 +6374,100 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+ /* Otherwise use a CNT-based sequence. */
+ else if (factor != 0)
+ {
+- /* Use a subtraction if we have a negative factor. */
+- rtx_code code = PLUS;
+- if (factor < 0)
+- {
+- factor = -factor;
+- code = MINUS;
+- }
++ /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
++ with negative shifts indicating a shift right. */
++ HOST_WIDE_INT low_bit = least_bit_hwi (factor);
++ HOST_WIDE_INT rel_factor = factor / low_bit;
++ int shift = exact_log2 (low_bit) - 4;
++ gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
++
++ /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
++ equal to CNTB * FACTOR / 16, with CODE being the [+-].
+
+- /* Calculate CNTD * FACTOR / 2. First try to fold the division
+- into the multiplication. */
++ We can avoid a multiplication if REL_FACTOR is in the range
++ of RDVL, although there are then various optimizations that
++ we can try on top. */
++ rtx_code code = PLUS;
+ rtx val;
+- int shift = 0;
+- if (factor & 1)
+- /* Use a right shift by 1. */
+- shift = -1;
+- else
+- factor /= 2;
+- HOST_WIDE_INT low_bit = factor & -factor;
+- if (factor <= 16 * low_bit)
++ if (IN_RANGE (rel_factor, -32, 31))
+ {
+- if (factor > 16 * 8)
++ /* Try to use an unshifted CNT[BHWD] or RDVL. */
++ if (aarch64_sve_cnt_factor_p (factor)
++ || aarch64_sve_rdvl_factor_p (factor))
++ {
++ val = gen_int_mode (poly_int64 (factor, factor), mode);
++ shift = 0;
++ }
++ /* Try to subtract an unshifted CNT[BHWD]. */
++ else if (aarch64_sve_cnt_factor_p (-factor))
+ {
+- /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
+- the value with the minimum multiplier and shift it into
+- position. */
+- int extra_shift = exact_log2 (low_bit);
+- shift += extra_shift;
+- factor >>= extra_shift;
++ code = MINUS;
++ val = gen_int_mode (poly_int64 (-factor, -factor), mode);
++ shift = 0;
+ }
+- val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
++ /* If subtraction is free, prefer to load a positive constant.
++ In the best case this will fit a shifted CNTB. */
++ else if (src != const0_rtx && rel_factor < 0)
++ {
++ code = MINUS;
++ val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
++ }
++ /* Otherwise use a shifted RDVL or CNT[BHWD]. */
++ else
++ val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
+ }
+ else
+ {
+- /* Base the factor on LOW_BIT if we can calculate LOW_BIT
+- directly, since that should increase the chances of being
+- able to use a shift and add sequence. If LOW_BIT itself
+- is out of range, just use CNTD. */
+- if (low_bit <= 16 * 8)
+- factor /= low_bit;
++ /* If we can calculate CNTB << SHIFT directly, prefer to do that,
++ since it should increase the chances of being able to use
++ a shift and add sequence for the multiplication.
++ If CNTB << SHIFT is out of range, stick with the current
++ shift factor. */
++ if (IN_RANGE (low_bit, 2, 16 * 16))
++ {
++ val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
++ shift = 0;
++ }
+ else
+- low_bit = 1;
++ val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
+
+- val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
+ val = aarch64_force_temporary (mode, temp1, val);
+
++ /* Prefer to multiply by a positive factor and subtract rather
++ than multiply by a negative factor and add, since positive
++ values are usually easier to move. */
++ if (rel_factor < 0 && src != const0_rtx)
++ {
++ rel_factor = -rel_factor;
++ code = MINUS;
++ }
++
+ if (can_create_pseudo_p ())
+ {
+- rtx coeff1 = gen_int_mode (factor, mode);
++ rtx coeff1 = gen_int_mode (rel_factor, mode);
+ val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
+ }
+ else
+ {
+- /* Go back to using a negative multiplication factor if we have
+- no register from which to subtract. */
+- if (code == MINUS && src == const0_rtx)
+- {
+- factor = -factor;
+- code = PLUS;
+- }
+- rtx coeff1 = gen_int_mode (factor, mode);
++ rtx coeff1 = gen_int_mode (rel_factor, mode);
+ coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
+ val = gen_rtx_MULT (mode, val, coeff1);
+ }
+ }
+
++ /* Multiply by 2 ** SHIFT. */
+ if (shift > 0)
+ {
+- /* Multiply by 1 << SHIFT. */
+ val = aarch64_force_temporary (mode, temp1, val);
+ val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
+ }
+- else if (shift == -1)
++ else if (shift < 0)
+ {
+- /* Divide by 2. */
+ val = aarch64_force_temporary (mode, temp1, val);
+- val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
++ val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
+ }
+
+- /* Calculate SRC +/- CNTD * FACTOR / 2. */
++ /* Add the result to SRC or subtract the result from SRC. */
+ if (src != const0_rtx)
+ {
+ val = aarch64_force_temporary (mode, temp1, val);
+@@ -7045,7 +7112,9 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+ aarch64_report_sve_required ();
+ return;
+ }
+- if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
++ if (base == const0_rtx
++ && (aarch64_sve_cnt_immediate_p (offset)
++ || aarch64_sve_rdvl_immediate_p (offset)))
+ emit_insn (gen_rtx_SET (dest, imm));
+ else
+ {
+@@ -21751,7 +21820,9 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
+ if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
+ return true;
+
+- if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
++ if (TARGET_SVE
++ && (aarch64_sve_cnt_immediate_p (x)
++ || aarch64_sve_rdvl_immediate_p (x)))
+ return true;
+
+ return aarch64_classify_symbolic_expression (x)
+diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
+index 5d02da42f..c0977a3da 100644
+--- a/gcc/config/aarch64/aarch64.md
++++ b/gcc/config/aarch64/aarch64.md
+@@ -1207,6 +1207,7 @@
+ [w, D<hq>; neon_move , simd ] << aarch64_output_scalar_simd_mov_immediate (operands[1], <MODE>mode);
+ /* The "mov_imm" type for CNT is just a placeholder. */
+ [r, Usv ; mov_imm , sve ] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
++ [r, Usr ; mov_imm , sve ] << aarch64_output_sve_rdvl (operands[1]);
+ [r, m ; load_4 , * ] ldr<size>\t%w0, %1
+ [w, m ; load_4 , * ] ldr\t%<size>0, %1
+ [m, r Z ; store_4 , * ] str<size>\\t%w1, %0
+@@ -1265,6 +1266,7 @@
+ [r , n ; mov_imm , * ,16] #
+ /* The "mov_imm" type for CNT is just a placeholder. */
+ [r , Usv; mov_imm , sve , 4] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
++ [r , Usr; mov_imm , sve , 4] << aarch64_output_sve_rdvl (operands[1]);
+ [r , m ; load_4 , * , 4] ldr\t%w0, %1
+ [w , m ; load_4 , fp , 4] ldr\t%s0, %1
+ [m , r Z; store_4 , * , 4] str\t%w1, %0
+@@ -1299,6 +1301,7 @@
+ [r, n ; mov_imm , * ,16] #
+ /* The "mov_imm" type for CNT is just a placeholder. */
+ [r, Usv; mov_imm , sve , 4] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
++ [r, Usr; mov_imm , sve , 4] << aarch64_output_sve_rdvl (operands[1]);
+ [r, m ; load_8 , * , 4] ldr\t%x0, %1
+ [w, m ; load_8 , fp , 4] ldr\t%d0, %1
+ [m, r Z; store_8 , * , 4] str\t%x1, %0
+diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
+index 750a42fb1..212a73416 100644
+--- a/gcc/config/aarch64/constraints.md
++++ b/gcc/config/aarch64/constraints.md
+@@ -214,6 +214,12 @@
+ (and (match_code "const_int")
+ (match_test "aarch64_high_bits_all_ones_p (ival)")))
+
++(define_constraint "Usr"
++ "@internal
++ A constraint that matches a value produced by RDVL."
++ (and (match_code "const_poly_int")
++ (match_test "aarch64_sve_rdvl_immediate_p (op)")))
++
+ (define_constraint "Usv"
+ "@internal
+ A constraint that matches a VG-based constant that can be loaded by
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
+index 8b8fe8e4f..a22d8a28d 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
+@@ -51,19 +51,24 @@ PROTO (cntb_15, uint64_t, ()) { return svcntb () * 15; }
+ */
+ PROTO (cntb_16, uint64_t, ()) { return svcntb () * 16; }
+
+-/* Other sequences would be OK. */
+ /*
+ ** cntb_17:
+-** cntb x0, all, mul #16
+-** incb x0
++** rdvl x0, #17
+ ** ret
+ */
+ PROTO (cntb_17, uint64_t, ()) { return svcntb () * 17; }
+
++/*
++** cntb_31:
++** rdvl x0, #31
++** ret
++*/
++PROTO (cntb_31, uint64_t, ()) { return svcntb () * 31; }
++
+ /*
+ ** cntb_32:
+-** cntd (x[0-9]+)
+-** lsl x0, \1, 8
++** cntb (x[0-9]+)
++** lsl x0, \1, 5
+ ** ret
+ */
+ PROTO (cntb_32, uint64_t, ()) { return svcntb () * 32; }
+@@ -80,16 +85,16 @@ PROTO (cntb_33, uint64_t, ()) { return svcntb () * 33; }
+
+ /*
+ ** cntb_64:
+-** cntd (x[0-9]+)
+-** lsl x0, \1, 9
++** cntb (x[0-9]+)
++** lsl x0, \1, 6
+ ** ret
+ */
+ PROTO (cntb_64, uint64_t, ()) { return svcntb () * 64; }
+
+ /*
+ ** cntb_128:
+-** cntd (x[0-9]+)
+-** lsl x0, \1, 10
++** cntb (x[0-9]+)
++** lsl x0, \1, 7
+ ** ret
+ */
+ PROTO (cntb_128, uint64_t, ()) { return svcntb () * 128; }
+@@ -106,46 +111,70 @@ PROTO (cntb_129, uint64_t, ()) { return svcntb () * 129; }
+
+ /*
+ ** cntb_m1:
+-** cntb (x[0-9]+)
+-** neg x0, \1
++** rdvl x0, #-1
+ ** ret
+ */
+ PROTO (cntb_m1, uint64_t, ()) { return -svcntb (); }
+
+ /*
+ ** cntb_m13:
+-** cntb (x[0-9]+), all, mul #13
+-** neg x0, \1
++** rdvl x0, #-13
+ ** ret
+ */
+ PROTO (cntb_m13, uint64_t, ()) { return -svcntb () * 13; }
+
+ /*
+ ** cntb_m15:
+-** cntb (x[0-9]+), all, mul #15
+-** neg x0, \1
++** rdvl x0, #-15
+ ** ret
+ */
+ PROTO (cntb_m15, uint64_t, ()) { return -svcntb () * 15; }
+
+ /*
+ ** cntb_m16:
+-** cntb (x[0-9]+), all, mul #16
+-** neg x0, \1
++** rdvl x0, #-16
+ ** ret
+ */
+ PROTO (cntb_m16, uint64_t, ()) { return -svcntb () * 16; }
+
+-/* Other sequences would be OK. */
+ /*
+ ** cntb_m17:
+-** cntb x0, all, mul #16
+-** incb x0
+-** neg x0, x0
++** rdvl x0, #-17
+ ** ret
+ */
+ PROTO (cntb_m17, uint64_t, ()) { return -svcntb () * 17; }
+
++/*
++** cntb_m32:
++** rdvl x0, #-32
++** ret
++*/
++PROTO (cntb_m32, uint64_t, ()) { return -svcntb () * 32; }
++
++/*
++** cntb_m33:
++** rdvl x0, #-32
++** decb x0
++** ret
++*/
++PROTO (cntb_m33, uint64_t, ()) { return -svcntb () * 33; }
++
++/*
++** cntb_m34:
++** rdvl (x[0-9]+), #-17
++** lsl x0, \1, #?1
++** ret
++*/
++PROTO (cntb_m34, uint64_t, ()) { return -svcntb () * 34; }
++
++/*
++** cntb_m64:
++** rdvl (x[0-9]+), #-1
++** lsl x0, \1, #?6
++** ret
++*/
++PROTO (cntb_m64, uint64_t, ()) { return -svcntb () * 64; }
++
+ /*
+ ** incb_1:
+ ** incb x0
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
+index 0d0ed4849..090a643b4 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
+@@ -54,8 +54,8 @@ PROTO (cntd_16, uint64_t, ()) { return svcntd () * 16; }
+ /* Other sequences would be OK. */
+ /*
+ ** cntd_17:
+-** cntb x0, all, mul #2
+-** incd x0
++** rdvl (x[0-9]+), #17
++** asr x0, \1, 3
+ ** ret
+ */
+ PROTO (cntd_17, uint64_t, ()) { return svcntd () * 17; }
+@@ -107,8 +107,7 @@ PROTO (cntd_m15, uint64_t, ()) { return -svcntd () * 15; }
+
+ /*
+ ** cntd_m16:
+-** cntb (x[0-9]+), all, mul #2
+-** neg x0, \1
++** rdvl x0, #-2
+ ** ret
+ */
+ PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; }
+@@ -116,9 +115,8 @@ PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; }
+ /* Other sequences would be OK. */
+ /*
+ ** cntd_m17:
+-** cntb x0, all, mul #2
+-** incd x0
+-** neg x0, x0
++** rdvl (x[0-9]+), #-17
++** asr x0, \1, 3
+ ** ret
+ */
+ PROTO (cntd_m17, uint64_t, ()) { return -svcntd () * 17; }
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
+index c29930f15..1a4e7dc0e 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
+@@ -54,8 +54,8 @@ PROTO (cnth_16, uint64_t, ()) { return svcnth () * 16; }
+ /* Other sequences would be OK. */
+ /*
+ ** cnth_17:
+-** cntb x0, all, mul #8
+-** inch x0
++** rdvl (x[0-9]+), #17
++** asr x0, \1, 1
+ ** ret
+ */
+ PROTO (cnth_17, uint64_t, ()) { return svcnth () * 17; }
+@@ -69,16 +69,16 @@ PROTO (cnth_32, uint64_t, ()) { return svcnth () * 32; }
+
+ /*
+ ** cnth_64:
+-** cntd (x[0-9]+)
+-** lsl x0, \1, 8
++** cntb (x[0-9]+)
++** lsl x0, \1, 5
+ ** ret
+ */
+ PROTO (cnth_64, uint64_t, ()) { return svcnth () * 64; }
+
+ /*
+ ** cnth_128:
+-** cntd (x[0-9]+)
+-** lsl x0, \1, 9
++** cntb (x[0-9]+)
++** lsl x0, \1, 6
+ ** ret
+ */
+ PROTO (cnth_128, uint64_t, ()) { return svcnth () * 128; }
+@@ -109,8 +109,7 @@ PROTO (cnth_m15, uint64_t, ()) { return -svcnth () * 15; }
+
+ /*
+ ** cnth_m16:
+-** cntb (x[0-9]+), all, mul #8
+-** neg x0, \1
++** rdvl x0, #-8
+ ** ret
+ */
+ PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; }
+@@ -118,9 +117,8 @@ PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; }
+ /* Other sequences would be OK. */
+ /*
+ ** cnth_m17:
+-** cntb x0, all, mul #8
+-** inch x0
+-** neg x0, x0
++** rdvl (x[0-9]+), #-17
++** asr x0, \1, 1
+ ** ret
+ */
+ PROTO (cnth_m17, uint64_t, ()) { return -svcnth () * 17; }
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
+index e26cc67a4..9d1697690 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
+@@ -54,8 +54,8 @@ PROTO (cntw_16, uint64_t, ()) { return svcntw () * 16; }
+ /* Other sequences would be OK. */
+ /*
+ ** cntw_17:
+-** cntb x0, all, mul #4
+-** incw x0
++** rdvl (x[0-9]+), #17
++** asr x0, \1, 2
+ ** ret
+ */
+ PROTO (cntw_17, uint64_t, ()) { return svcntw () * 17; }
+@@ -76,8 +76,8 @@ PROTO (cntw_64, uint64_t, ()) { return svcntw () * 64; }
+
+ /*
+ ** cntw_128:
+-** cntd (x[0-9]+)
+-** lsl x0, \1, 8
++** cntb (x[0-9]+)
++** lsl x0, \1, 5
+ ** ret
+ */
+ PROTO (cntw_128, uint64_t, ()) { return svcntw () * 128; }
+@@ -108,8 +108,7 @@ PROTO (cntw_m15, uint64_t, ()) { return -svcntw () * 15; }
+
+ /*
+ ** cntw_m16:
+-** cntb (x[0-9]+), all, mul #4
+-** neg x0, \1
++** rdvl x0, #-4
+ ** ret
+ */
+ PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; }
+@@ -117,9 +116,8 @@ PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; }
+ /* Other sequences would be OK. */
+ /*
+ ** cntw_m17:
+-** cntb x0, all, mul #4
+-** incw x0
+-** neg x0, x0
++** rdvl (x[0-9]+), #-17
++** asr x0, \1, 2
+ ** ret
+ */
+ PROTO (cntw_m17, uint64_t, ()) { return -svcntw () * 17; }
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
+index c90730a03..94cd3a066 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
+@@ -218,8 +218,8 @@ TEST_PREFETCH (prfb_vnum_31, uint16_t,
+
+ /*
+ ** prfb_vnum_32:
+-** cntd (x[0-9]+)
+-** lsl (x[0-9]+), \1, #?8
++** cntb (x[0-9]+)
++** lsl (x[0-9]+), \1, #?5
+ ** add (x[0-9]+), (\2, x0|x0, \2)
+ ** prfb pldl1keep, p0, \[\3\]
+ ** ret
+@@ -240,7 +240,7 @@ TEST_PREFETCH (prfb_vnum_m32, uint16_t,
+ /*
+ ** prfb_vnum_m33:
+ ** ...
+-** prfb pldl1keep, p0, \[x[0-9]+\]
++** prfb pldl1keep, p0, \[x[0-9]+(, x[0-9]+)?\]
+ ** ret
+ */
+ TEST_PREFETCH (prfb_vnum_m33, uint16_t,
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
+index 869ef3d3e..b7a116cf0 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
+@@ -218,8 +218,8 @@ TEST_PREFETCH (prfd_vnum_31, uint16_t,
+
+ /*
+ ** prfd_vnum_32:
+-** cntd (x[0-9]+)
+-** lsl (x[0-9]+), \1, #?8
++** cntb (x[0-9]+)
++** lsl (x[0-9]+), \1, #?5
+ ** add (x[0-9]+), (\2, x0|x0, \2)
+ ** prfd pldl1keep, p0, \[\3\]
+ ** ret
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
+index 45a735eae..9d3df6bd3 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
+@@ -218,8 +218,8 @@ TEST_PREFETCH (prfh_vnum_31, uint16_t,
+
+ /*
+ ** prfh_vnum_32:
+-** cntd (x[0-9]+)
+-** lsl (x[0-9]+), \1, #?8
++** cntb (x[0-9]+)
++** lsl (x[0-9]+), \1, #?5
+ ** add (x[0-9]+), (\2, x0|x0, \2)
+ ** prfh pldl1keep, p0, \[\3\]
+ ** ret
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
+index 444187f45..6962abab6 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
+@@ -218,8 +218,8 @@ TEST_PREFETCH (prfw_vnum_31, uint16_t,
+
+ /*
+ ** prfw_vnum_32:
+-** cntd (x[0-9]+)
+-** lsl (x[0-9]+), \1, #?8
++** cntb (x[0-9]+)
++** lsl (x[0-9]+), \1, #?5
+ ** add (x[0-9]+), (\2, x0|x0, \2)
+ ** prfw pldl1keep, p0, \[\3\]
+ ** ret
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
+index 9ead9c21b..7f02497e8 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
+@@ -68,8 +68,7 @@ TEST_ALL (LOOP)
+ /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, w[0-9]+, w[0-9]+\n} 3 } } */
+ /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
+ /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
+-/* 2 for the calculations of -17 and 17. */
+-/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 10 } } */
++/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 8 } } */
+
+ /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #16\n} 1 } } */
+ /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #15\n} 1 } } */
+@@ -86,8 +85,7 @@ TEST_ALL (LOOP)
+ /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, x[0-9]+, x[0-9]+\n} 3 } } */
+ /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
+ /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
+-/* 2 for the calculations of -17 and 17. */
+-/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 10 } } */
++/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 8 } } */
+
+ /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #16\n} 1 } } */
+ /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #15\n} 1 } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
+index 110947a6c..5de34fc61 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
+@@ -6,8 +6,7 @@
+
+ /*
+ ** test_1:
+-** cntd x12, all, mul #9
+-** lsl x12, x12, #?4
++** rdvl x12, #18
+ ** mov x11, sp
+ ** ...
+ ** sub sp, sp, x12
+--
+2.33.0
+