From 46310765c05cde8732e07bfb0df9f0ec25a34018 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Tue, 5 Dec 2023 10:11:18 +0000
Subject: [PATCH 063/157] [Backport][SME] aarch64: Use SVE's RDVL instruction

Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=80f47d7bbe38234e1530d27fe5c2f130223ca7a0

We didn't previously use SVE's RDVL instruction, since the CNT*
forms are preferred and provide most of the range.  However,
there are some cases that RDVL can handle and CNT* can't,
and using RDVL-like instructions becomes important for SME.
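
For example (a comparison drawn from the cntb.c test changes below),
loading CNTB * 17 previously needed a two-instruction sequence, since
the CNT* multiplier is limited to [1, 16], whereas RDVL takes a signed
multiplier in [-32, 31]:

	cntb	x0, all, mul #16	// old: multiplier capped at 16
	incb	x0

	rdvl	x0, #17			// new: a single instruction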

gcc/
	* config/aarch64/aarch64-protos.h (aarch64_sve_rdvl_immediate_p)
	(aarch64_output_sve_rdvl): Declare.
	* config/aarch64/aarch64.cc (aarch64_sve_cnt_factor_p): New
	function, split out from...
	(aarch64_sve_cnt_immediate_p): ...here.
	(aarch64_sve_rdvl_factor_p): New function.
	(aarch64_sve_rdvl_immediate_p): Likewise.
	(aarch64_output_sve_rdvl): Likewise.
	(aarch64_offset_temporaries): Rewrite the SVE handling to use RDVL
	for some cases.
	(aarch64_expand_mov_immediate): Handle RDVL immediates.
	(aarch64_mov_operand_p): Likewise.
	* config/aarch64/constraints.md (Usr): New constraint.
	* config/aarch64/aarch64.md (*mov<SHORT:mode>_aarch64): Add an RDVL
	alternative.
	(*movsi_aarch64, *movdi_aarch64): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/sve/acle/asm/cntb.c: Tweak expected output.
	* gcc.target/aarch64/sve/acle/asm/cnth.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/cntw.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/cntd.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/prfb.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/prfh.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/prfw.c: Likewise.
	* gcc.target/aarch64/sve/acle/asm/prfd.c: Likewise.
	* gcc.target/aarch64/sve/loop_add_4.c: Expect RDVL to be used
	to calculate the -17 and 17 factors.
	* gcc.target/aarch64/sve/pcs/stack_clash_1.c: Likewise the 18 factor.
---
 gcc/config/aarch64/aarch64-protos.h           |   2 +
 gcc/config/aarch64/aarch64.cc                 | 191 ++++++++++++------
 gcc/config/aarch64/aarch64.md                 |   3 +
 gcc/config/aarch64/constraints.md             |   6 +
 .../gcc.target/aarch64/sve/acle/asm/cntb.c    |  71 +++++--
 .../gcc.target/aarch64/sve/acle/asm/cntd.c    |  12 +-
 .../gcc.target/aarch64/sve/acle/asm/cnth.c    |  20 +-
 .../gcc.target/aarch64/sve/acle/asm/cntw.c    |  16 +-
 .../gcc.target/aarch64/sve/acle/asm/prfb.c    |   6 +-
 .../gcc.target/aarch64/sve/acle/asm/prfd.c    |   4 +-
 .../gcc.target/aarch64/sve/acle/asm/prfh.c    |   4 +-
 .../gcc.target/aarch64/sve/acle/asm/prfw.c    |   4 +-
 .../gcc.target/aarch64/sve/loop_add_4.c       |   6 +-
 .../aarch64/sve/pcs/stack_clash_1.c           |   3 +-
 14 files changed, 225 insertions(+), 123 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 3ff1a0163..14a568140 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -802,6 +802,7 @@ bool aarch64_sve_mode_p (machine_mode);
 HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int);
 bool aarch64_sve_cnt_immediate_p (rtx);
 bool aarch64_sve_scalar_inc_dec_immediate_p (rtx);
+bool aarch64_sve_rdvl_immediate_p (rtx);
 bool aarch64_sve_addvl_addpl_immediate_p (rtx);
 bool aarch64_sve_vector_inc_dec_immediate_p (rtx);
 int aarch64_add_offset_temporaries (rtx);
@@ -814,6 +815,7 @@ char *aarch64_output_sve_prefetch (const char *, rtx, const char *);
 char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
 char *aarch64_output_sve_cnt_pat_immediate (const char *, const char *, rtx *);
 char *aarch64_output_sve_scalar_inc_dec (rtx);
+char *aarch64_output_sve_rdvl (rtx);
 char *aarch64_output_sve_addvl_addpl (rtx);
 char *aarch64_output_sve_vector_inc_dec (const char *, rtx);
 char *aarch64_output_scalar_simd_mov_immediate (rtx, scalar_int_mode);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index acb659f53..4194dfc70 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -5520,6 +5520,18 @@ aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
   return -1;
 }
 
+/* Return true if a single CNT[BHWD] instruction can multiply FACTOR
+   by the number of 128-bit quadwords in an SVE vector.  */
+
+static bool
+aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
+{
+  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
+  return (IN_RANGE (factor, 2, 16 * 16)
+	  && (factor & 1) == 0
+	  && factor <= 16 * (factor & -factor));
+}
+
 /* Return true if we can move VALUE into a register using a single
    CNT[BHWD] instruction.  */
 
@@ -5527,11 +5539,7 @@ static bool
 aarch64_sve_cnt_immediate_p (poly_int64 value)
 {
   HOST_WIDE_INT factor = value.coeffs[0];
-  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
-  return (value.coeffs[1] == factor
-	  && IN_RANGE (factor, 2, 16 * 16)
-	  && (factor & 1) == 0
-	  && factor <= 16 * (factor & -factor));
+  return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
 }
 
 /* Likewise for rtx X.  */
@@ -5647,6 +5655,50 @@ aarch64_output_sve_scalar_inc_dec (rtx offset)
 					     -offset_value.coeffs[1], 0);
 }
 
+/* Return true if a single RDVL instruction can multiply FACTOR by the
+   number of 128-bit quadwords in an SVE vector.  */
+
+static bool
+aarch64_sve_rdvl_factor_p (HOST_WIDE_INT factor)
+{
+  return (multiple_p (factor, 16)
+	  && IN_RANGE (factor, -32 * 16, 31 * 16));
+}
+
+/* Return true if we can move VALUE into a register using a single
+   RDVL instruction.  */
+
+static bool
+aarch64_sve_rdvl_immediate_p (poly_int64 value)
+{
+  HOST_WIDE_INT factor = value.coeffs[0];
+  return value.coeffs[1] == factor && aarch64_sve_rdvl_factor_p (factor);
+}
+
+/* Likewise for rtx X.  */
+
+bool
+aarch64_sve_rdvl_immediate_p (rtx x)
+{
+  poly_int64 value;
+  return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
+}
+
+/* Return the asm string for moving RDVL immediate OFFSET into register
+   operand 0.  */
+
+char *
+aarch64_output_sve_rdvl (rtx offset)
+{
+  static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
+  poly_int64 offset_value = rtx_to_poly_int64 (offset);
+  gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
+
+  int factor = offset_value.coeffs[1];
+  snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
+  return buffer;
+}
+
 /* Return true if we can add VALUE to a register using a single ADDVL
    or ADDPL instruction.  */
 
@@ -6227,13 +6279,13 @@ aarch64_offset_temporaries (bool add_p, poly_int64 offset)
     count += 1;
   else if (factor != 0)
     {
-      factor = abs (factor);
-      if (factor > 16 * (factor & -factor))
-	/* Need one register for the CNT result and one for the multiplication
-	   factor.  If necessary, the second temporary can be reused for the
-	   constant part of the offset.  */
+      factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
+      if (!IN_RANGE (factor, -32, 31))
+	/* Need one register for the CNT or RDVL result and one for the
+	   multiplication factor.  If necessary, the second temporary
+	   can be reused for the constant part of the offset.  */
 	return 2;
-      /* Need one register for the CNT result (which might then
+      /* Need one register for the CNT or RDVL result (which might then
 	 be shifted).  */
       count += 1;
     }
@@ -6322,85 +6374,100 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
   /* Otherwise use a CNT-based sequence.  */
   else if (factor != 0)
     {
-      /* Use a subtraction if we have a negative factor.  */
-      rtx_code code = PLUS;
-      if (factor < 0)
-	{
-	  factor = -factor;
-	  code = MINUS;
-	}
+      /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
+	 with negative shifts indicating a shift right.  */
+      HOST_WIDE_INT low_bit = least_bit_hwi (factor);
+      HOST_WIDE_INT rel_factor = factor / low_bit;
+      int shift = exact_log2 (low_bit) - 4;
+      gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
+
+      /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
+	 equal to CNTB * FACTOR / 16, with CODE being the [+-].
 
-      /* Calculate CNTD * FACTOR / 2.  First try to fold the division
-	 into the multiplication.  */
+	 We can avoid a multiplication if REL_FACTOR is in the range
+	 of RDVL, although there are then various optimizations that
+	 we can try on top.  */
+      rtx_code code = PLUS;
       rtx val;
-      int shift = 0;
-      if (factor & 1)
-	/* Use a right shift by 1.  */
-	shift = -1;
-      else
-	factor /= 2;
-      HOST_WIDE_INT low_bit = factor & -factor;
-      if (factor <= 16 * low_bit)
+      if (IN_RANGE (rel_factor, -32, 31))
 	{
-	  if (factor > 16 * 8)
+	  /* Try to use an unshifted CNT[BHWD] or RDVL.  */
+	  if (aarch64_sve_cnt_factor_p (factor)
+	      || aarch64_sve_rdvl_factor_p (factor))
+	    {
+	      val = gen_int_mode (poly_int64 (factor, factor), mode);
+	      shift = 0;
+	    }
+	  /* Try to subtract an unshifted CNT[BHWD].  */
+	  else if (aarch64_sve_cnt_factor_p (-factor))
 	    {
-	      /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
-		 the value with the minimum multiplier and shift it into
-		 position.  */
-	      int extra_shift = exact_log2 (low_bit);
-	      shift += extra_shift;
-	      factor >>= extra_shift;
+	      code = MINUS;
+	      val = gen_int_mode (poly_int64 (-factor, -factor), mode);
+	      shift = 0;
 	    }
-	  val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
+	  /* If subtraction is free, prefer to load a positive constant.
+	     In the best case this will fit a shifted CNTB.  */
+	  else if (src != const0_rtx && rel_factor < 0)
+	    {
+	      code = MINUS;
+	      val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
+	    }
+	  /* Otherwise use a shifted RDVL or CNT[BHWD].  */
+	  else
+	    val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
 	}
       else
 	{
-	  /* Base the factor on LOW_BIT if we can calculate LOW_BIT
-	     directly, since that should increase the chances of being
-	     able to use a shift and add sequence.  If LOW_BIT itself
-	     is out of range, just use CNTD.  */
-	  if (low_bit <= 16 * 8)
-	    factor /= low_bit;
+	  /* If we can calculate CNTB << SHIFT directly, prefer to do that,
+	     since it should increase the chances of being able to use
+	     a shift and add sequence for the multiplication.
+	     If CNTB << SHIFT is out of range, stick with the current
+	     shift factor.  */
+	  if (IN_RANGE (low_bit, 2, 16 * 16))
+	    {
+	      val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
+	      shift = 0;
+	    }
 	  else
-	    low_bit = 1;
+	    val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
 
-	  val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
 	  val = aarch64_force_temporary (mode, temp1, val);
 
+	  /* Prefer to multiply by a positive factor and subtract rather
+	     than multiply by a negative factor and add, since positive
+	     values are usually easier to move.  */
+	  if (rel_factor < 0 && src != const0_rtx)
+	    {
+	      rel_factor = -rel_factor;
+	      code = MINUS;
+	    }
+
 	  if (can_create_pseudo_p ())
 	    {
-	      rtx coeff1 = gen_int_mode (factor, mode);
+	      rtx coeff1 = gen_int_mode (rel_factor, mode);
 	      val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
 	    }
 	  else
 	    {
-	      /* Go back to using a negative multiplication factor if we have
-		 no register from which to subtract.  */
-	      if (code == MINUS && src == const0_rtx)
-		{
-		  factor = -factor;
-		  code = PLUS;
-		}
-	      rtx coeff1 = gen_int_mode (factor, mode);
+	      rtx coeff1 = gen_int_mode (rel_factor, mode);
 	      coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
 	      val = gen_rtx_MULT (mode, val, coeff1);
 	    }
 	}
 
+      /* Multiply by 2 ** SHIFT.  */
       if (shift > 0)
 	{
-	  /* Multiply by 1 << SHIFT.  */
 	  val = aarch64_force_temporary (mode, temp1, val);
 	  val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
 	}
-      else if (shift == -1)
+      else if (shift < 0)
 	{
-	  /* Divide by 2.  */
 	  val = aarch64_force_temporary (mode, temp1, val);
-	  val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
+	  val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
 	}
 
-      /* Calculate SRC +/- CNTD * FACTOR / 2.  */
+      /* Add the result to SRC or subtract the result from SRC.  */
       if (src != const0_rtx)
 	{
 	  val = aarch64_force_temporary (mode, temp1, val);
@@ -7045,7 +7112,9 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
 	      aarch64_report_sve_required ();
 	      return;
 	    }
-	  if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
+	  if (base == const0_rtx
+	      && (aarch64_sve_cnt_immediate_p (offset)
+		  || aarch64_sve_rdvl_immediate_p (offset)))
 	    emit_insn (gen_rtx_SET (dest, imm));
 	  else
 	    {
@@ -21751,7 +21820,9 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
   if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
     return true;
 
-  if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
+  if (TARGET_SVE
+      && (aarch64_sve_cnt_immediate_p (x)
+	  || aarch64_sve_rdvl_immediate_p (x)))
     return true;
 
   return aarch64_classify_symbolic_expression (x)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 5d02da42f..c0977a3da 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1207,6 +1207,7 @@
      [w, D<hq>; neon_move      , simd  ] << aarch64_output_scalar_simd_mov_immediate (operands[1], <MODE>mode);
      /* The "mov_imm" type for CNT is just a placeholder.  */
      [r, Usv  ; mov_imm        , sve   ] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
+     [r, Usr  ; mov_imm        , sve   ] << aarch64_output_sve_rdvl (operands[1]);
      [r, m    ; load_4         , *     ] ldr<size>\t%w0, %1
      [w, m    ; load_4         , *     ] ldr\t%<size>0, %1
      [m, r Z  ; store_4        , *     ] str<size>\\t%w1, %0
@@ -1265,6 +1266,7 @@
      [r  , n  ; mov_imm  , *   ,16] #
      /* The "mov_imm" type for CNT is just a placeholder.  */
      [r  , Usv; mov_imm  , sve , 4] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
+     [r  , Usr; mov_imm  , sve,  4] << aarch64_output_sve_rdvl (operands[1]);
      [r  , m  ; load_4   , *   , 4] ldr\t%w0, %1
      [w  , m  ; load_4   , fp  , 4] ldr\t%s0, %1
      [m  , r Z; store_4  , *   , 4] str\t%w1, %0
@@ -1299,6 +1301,7 @@
      [r, n  ; mov_imm  , *   ,16] #
      /* The "mov_imm" type for CNT is just a placeholder.  */
      [r, Usv; mov_imm  , sve , 4] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
+     [r, Usr; mov_imm  , sve,  4] << aarch64_output_sve_rdvl (operands[1]);
      [r, m  ; load_8   , *   , 4] ldr\t%x0, %1
      [w, m  ; load_8   , fp  , 4] ldr\t%d0, %1
      [m, r Z; store_8  , *   , 4] str\t%x1, %0
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 750a42fb1..212a73416 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -214,6 +214,12 @@
  (and (match_code "const_int")
       (match_test "aarch64_high_bits_all_ones_p (ival)")))
 
+(define_constraint "Usr"
+  "@internal
+   A constraint that matches a value produced by RDVL."
+ (and (match_code "const_poly_int")
+      (match_test "aarch64_sve_rdvl_immediate_p (op)")))
+
 (define_constraint "Usv"
   "@internal
    A constraint that matches a VG-based constant that can be loaded by
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
index 8b8fe8e4f..a22d8a28d 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
@@ -51,19 +51,24 @@ PROTO (cntb_15, uint64_t, ()) { return svcntb () * 15; }
 */
 PROTO (cntb_16, uint64_t, ()) { return svcntb () * 16; }
 
-/* Other sequences would be OK.  */
 /*
 ** cntb_17:
-**	cntb	x0, all, mul #16
-**	incb	x0
+**	rdvl	x0, #17
 **	ret
 */
 PROTO (cntb_17, uint64_t, ()) { return svcntb () * 17; }
 
+/*
+** cntb_31:
+**	rdvl	x0, #31
+**	ret
+*/
+PROTO (cntb_31, uint64_t, ()) { return svcntb () * 31; }
+
 /*
 ** cntb_32:
-**	cntd	(x[0-9]+)
-**	lsl	x0, \1, 8
+**	cntb	(x[0-9]+)
+**	lsl	x0, \1, 5
 **	ret
 */
 PROTO (cntb_32, uint64_t, ()) { return svcntb () * 32; }
@@ -80,16 +85,16 @@ PROTO (cntb_33, uint64_t, ()) { return svcntb () * 33; }
 
 /*
 ** cntb_64:
-**	cntd	(x[0-9]+)
-**	lsl	x0, \1, 9
+**	cntb	(x[0-9]+)
+**	lsl	x0, \1, 6
 **	ret
 */
 PROTO (cntb_64, uint64_t, ()) { return svcntb () * 64; }
 
 /*
 ** cntb_128:
-**	cntd	(x[0-9]+)
-**	lsl	x0, \1, 10
+**	cntb	(x[0-9]+)
+**	lsl	x0, \1, 7
 **	ret
 */
 PROTO (cntb_128, uint64_t, ()) { return svcntb () * 128; }
@@ -106,46 +111,70 @@ PROTO (cntb_129, uint64_t, ()) { return svcntb () * 129; }
 
 /*
 ** cntb_m1:
-**	cntb	(x[0-9]+)
-**	neg	x0, \1
+**	rdvl	x0, #-1
 **	ret
 */
 PROTO (cntb_m1, uint64_t, ()) { return -svcntb (); }
 
 /*
 ** cntb_m13:
-**	cntb	(x[0-9]+), all, mul #13
-**	neg	x0, \1
+**	rdvl	x0, #-13
 **	ret
 */
 PROTO (cntb_m13, uint64_t, ()) { return -svcntb () * 13; }
 
 /*
 ** cntb_m15:
-**	cntb	(x[0-9]+), all, mul #15
-**	neg	x0, \1
+**	rdvl	x0, #-15
 **	ret
 */
 PROTO (cntb_m15, uint64_t, ()) { return -svcntb () * 15; }
 
 /*
 ** cntb_m16:
-**	cntb	(x[0-9]+), all, mul #16
-**	neg	x0, \1
+**	rdvl	x0, #-16
 **	ret
 */
 PROTO (cntb_m16, uint64_t, ()) { return -svcntb () * 16; }
 
-/* Other sequences would be OK.  */
 /*
 ** cntb_m17:
-**	cntb	x0, all, mul #16
-**	incb	x0
-**	neg	x0, x0
+**	rdvl	x0, #-17
 **	ret
 */
 PROTO (cntb_m17, uint64_t, ()) { return -svcntb () * 17; }
 
+/*
+** cntb_m32:
+**	rdvl	x0, #-32
+**	ret
+*/
+PROTO (cntb_m32, uint64_t, ()) { return -svcntb () * 32; }
+
+/*
+** cntb_m33:
+**	rdvl	x0, #-32
+**	decb	x0
+**	ret
+*/
+PROTO (cntb_m33, uint64_t, ()) { return -svcntb () * 33; }
+
+/*
+** cntb_m34:
+**	rdvl	(x[0-9]+), #-17
+**	lsl	x0, \1, #?1
+**	ret
+*/
+PROTO (cntb_m34, uint64_t, ()) { return -svcntb () * 34; }
+
+/*
+** cntb_m64:
+**	rdvl	(x[0-9]+), #-1
+**	lsl	x0, \1, #?6
+**	ret
+*/
+PROTO (cntb_m64, uint64_t, ()) { return -svcntb () * 64; }
+
 /*
 ** incb_1:
 **	incb	x0
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
index 0d0ed4849..090a643b4 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
@@ -54,8 +54,8 @@ PROTO (cntd_16, uint64_t, ()) { return svcntd () * 16; }
 /* Other sequences would be OK.  */
 /*
 ** cntd_17:
-**	cntb	x0, all, mul #2
-**	incd	x0
+**	rdvl	(x[0-9]+), #17
+**	asr	x0, \1, 3
 **	ret
 */
 PROTO (cntd_17, uint64_t, ()) { return svcntd () * 17; }
@@ -107,8 +107,7 @@ PROTO (cntd_m15, uint64_t, ()) { return -svcntd () * 15; }
 
 /*
 ** cntd_m16:
-**	cntb	(x[0-9]+), all, mul #2
-**	neg	x0, \1
+**	rdvl	x0, #-2
 **	ret
 */
 PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; }
@@ -116,9 +115,8 @@ PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; }
 /* Other sequences would be OK.  */
 /*
 ** cntd_m17:
-**	cntb	x0, all, mul #2
-**	incd	x0
-**	neg	x0, x0
+**	rdvl	(x[0-9]+), #-17
+**	asr	x0, \1, 3
 **	ret
 */
 PROTO (cntd_m17, uint64_t, ()) { return -svcntd () * 17; }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
index c29930f15..1a4e7dc0e 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
@@ -54,8 +54,8 @@ PROTO (cnth_16, uint64_t, ()) { return svcnth () * 16; }
 /* Other sequences would be OK.  */
 /*
 ** cnth_17:
-**	cntb	x0, all, mul #8
-**	inch	x0
+**	rdvl	(x[0-9]+), #17
+**	asr	x0, \1, 1
 **	ret
 */
 PROTO (cnth_17, uint64_t, ()) { return svcnth () * 17; }
@@ -69,16 +69,16 @@ PROTO (cnth_32, uint64_t, ()) { return svcnth () * 32; }
 
 /*
 ** cnth_64:
-**	cntd	(x[0-9]+)
-**	lsl	x0, \1, 8
+**	cntb	(x[0-9]+)
+**	lsl	x0, \1, 5
 **	ret
 */
 PROTO (cnth_64, uint64_t, ()) { return svcnth () * 64; }
 
 /*
 ** cnth_128:
-**	cntd	(x[0-9]+)
-**	lsl	x0, \1, 9
+**	cntb	(x[0-9]+)
+**	lsl	x0, \1, 6
 **	ret
 */
 PROTO (cnth_128, uint64_t, ()) { return svcnth () * 128; }
@@ -109,8 +109,7 @@ PROTO (cnth_m15, uint64_t, ()) { return -svcnth () * 15; }
 
 /*
 ** cnth_m16:
-**	cntb	(x[0-9]+), all, mul #8
-**	neg	x0, \1
+**	rdvl	x0, #-8
 **	ret
 */
 PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; }
@@ -118,9 +117,8 @@ PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; }
 /* Other sequences would be OK.  */
 /*
 ** cnth_m17:
-**	cntb	x0, all, mul #8
-**	inch	x0
-**	neg	x0, x0
+**	rdvl	(x[0-9]+), #-17
+**	asr	x0, \1, 1
 **	ret
 */
 PROTO (cnth_m17, uint64_t, ()) { return -svcnth () * 17; }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
index e26cc67a4..9d1697690 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
@@ -54,8 +54,8 @@ PROTO (cntw_16, uint64_t, ()) { return svcntw () * 16; }
 /* Other sequences would be OK.  */
 /*
 ** cntw_17:
-**	cntb	x0, all, mul #4
-**	incw	x0
+**	rdvl	(x[0-9]+), #17
+**	asr	x0, \1, 2
 **	ret
 */
 PROTO (cntw_17, uint64_t, ()) { return svcntw () * 17; }
@@ -76,8 +76,8 @@ PROTO (cntw_64, uint64_t, ()) { return svcntw () * 64; }
 
 /*
 ** cntw_128:
-**	cntd	(x[0-9]+)
-**	lsl	x0, \1, 8
+**	cntb	(x[0-9]+)
+**	lsl	x0, \1, 5
 **	ret
 */
 PROTO (cntw_128, uint64_t, ()) { return svcntw () * 128; }
@@ -108,8 +108,7 @@ PROTO (cntw_m15, uint64_t, ()) { return -svcntw () * 15; }
 
 /*
 ** cntw_m16:
-**	cntb	(x[0-9]+), all, mul #4
-**	neg	x0, \1
+**	rdvl	(x[0-9]+), #-4
 **	ret
 */
 PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; }
@@ -117,9 +116,8 @@ PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; }
 /* Other sequences would be OK.  */
 /*
 ** cntw_m17:
-**	cntb	x0, all, mul #4
-**	incw	x0
-**	neg	x0, x0
+**	rdvl	(x[0-9]+), #-17
+**	asr	x0, \1, 2
 **	ret
 */
 PROTO (cntw_m17, uint64_t, ()) { return -svcntw () * 17; }
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
index c90730a03..94cd3a066 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
@@ -218,8 +218,8 @@ TEST_PREFETCH (prfb_vnum_31, uint16_t,
 
 /*
 ** prfb_vnum_32:
-**	cntd	(x[0-9]+)
-**	lsl	(x[0-9]+), \1, #?8
+**	cntb	(x[0-9]+)
+**	lsl	(x[0-9]+), \1, #?5
 **	add	(x[0-9]+), (\2, x0|x0, \2)
 **	prfb	pldl1keep, p0, \[\3\]
 **	ret
@@ -240,7 +240,7 @@ TEST_PREFETCH (prfb_vnum_m32, uint16_t,
 /*
 ** prfb_vnum_m33:
 **	...
-**	prfb	pldl1keep, p0, \[x[0-9]+\]
+**	prfb	pldl1keep, p0, \[x[0-9]+(, x[0-9]+)?\]
 **	ret
 */
 TEST_PREFETCH (prfb_vnum_m33, uint16_t,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
index 869ef3d3e..b7a116cf0 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
@@ -218,8 +218,8 @@ TEST_PREFETCH (prfd_vnum_31, uint16_t,
 
 /*
 ** prfd_vnum_32:
-**	cntd	(x[0-9]+)
-**	lsl	(x[0-9]+), \1, #?8
+**	cntb	(x[0-9]+)
+**	lsl	(x[0-9]+), \1, #?5
 **	add	(x[0-9]+), (\2, x0|x0, \2)
 **	prfd	pldl1keep, p0, \[\3\]
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
index 45a735eae..9d3df6bd3 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
@@ -218,8 +218,8 @@ TEST_PREFETCH (prfh_vnum_31, uint16_t,
 
 /*
 ** prfh_vnum_32:
-**	cntd	(x[0-9]+)
-**	lsl	(x[0-9]+), \1, #?8
+**	cntb	(x[0-9]+)
+**	lsl	(x[0-9]+), \1, #?5
 **	add	(x[0-9]+), (\2, x0|x0, \2)
 **	prfh	pldl1keep, p0, \[\3\]
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
index 444187f45..6962abab6 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
@@ -218,8 +218,8 @@ TEST_PREFETCH (prfw_vnum_31, uint16_t,
 
 /*
 ** prfw_vnum_32:
-**	cntd	(x[0-9]+)
-**	lsl	(x[0-9]+), \1, #?8
+**	cntb	(x[0-9]+)
+**	lsl	(x[0-9]+), \1, #?5
 **	add	(x[0-9]+), (\2, x0|x0, \2)
 **	prfw	pldl1keep, p0, \[\3\]
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
index 9ead9c21b..7f02497e8 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
@@ -68,8 +68,7 @@ TEST_ALL (LOOP)
 /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, w[0-9]+, w[0-9]+\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
 /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
-/* 2 for the calculations of -17 and 17.  */
-/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 10 } } */
+/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 8 } } */
 
 /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #16\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #15\n} 1 } } */
@@ -86,8 +85,7 @@ TEST_ALL (LOOP)
 /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, x[0-9]+, x[0-9]+\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
 /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
-/* 2 for the calculations of -17 and 17.  */
-/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 10 } } */
+/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 8 } } */
 
 /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #16\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #15\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
index 110947a6c..5de34fc61 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
@@ -6,8 +6,7 @@
 
 /*
 ** test_1:
-**	cntd	x12, all, mul #9
-**	lsl	x12, x12, #?4
+**	rdvl	x12, #18
 **	mov	x11, sp
 **	...
 **	sub	sp, sp, x12
-- 
2.33.0