Diffstat (limited to '0162-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch')
-rw-r--r-- | 0162-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch | 792
1 files changed, 792 insertions, 0 deletions
diff --git a/0162-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch b/0162-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch
new file mode 100644
index 0000000..f92df2d
--- /dev/null
+++ b/0162-Backport-SME-aarch64-Use-SVE-s-RDVL-instruction.patch
@@ -0,0 +1,792 @@
+From 46310765c05cde8732e07bfb0df9f0ec25a34018 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 5 Dec 2023 10:11:18 +0000
+Subject: [PATCH 063/157] [Backport][SME] aarch64: Use SVE's RDVL instruction
+
+Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=80f47d7bbe38234e1530d27fe5c2f130223ca7a0
+
+We didn't previously use SVE's RDVL instruction, since the CNT*
+forms are preferred and provide most of the range.  However,
+there are some cases that RDVL can handle and CNT* can't,
+and using RDVL-like instructions becomes important for SME.
+
+gcc/
+	* config/aarch64/aarch64-protos.h (aarch64_sve_rdvl_immediate_p)
+	(aarch64_output_sve_rdvl): Declare.
+	* config/aarch64/aarch64.cc (aarch64_sve_cnt_factor_p): New
+	function, split out from...
+	(aarch64_sve_cnt_immediate_p): ...here.
+	(aarch64_sve_rdvl_factor_p): New function.
+	(aarch64_sve_rdvl_immediate_p): Likewise.
+	(aarch64_output_sve_rdvl): Likewise.
+	(aarch64_offset_temporaries): Rewrite the SVE handling to use RDVL
+	for some cases.
+	(aarch64_expand_mov_immediate): Handle RDVL immediates.
+	(aarch64_mov_operand_p): Likewise.
+	* config/aarch64/constraints.md (Usr): New constraint.
+	* config/aarch64/aarch64.md (*mov<SHORT:mode>_aarch64): Add an RDVL
+	alternative.
+	(*movsi_aarch64, *movdi_aarch64): Likewise.
+
+gcc/testsuite/
+	* gcc.target/aarch64/sve/acle/asm/cntb.c: Tweak expected output.
+	* gcc.target/aarch64/sve/acle/asm/cnth.c: Likewise.
+	* gcc.target/aarch64/sve/acle/asm/cntw.c: Likewise.
+	* gcc.target/aarch64/sve/acle/asm/cntd.c: Likewise.
+	* gcc.target/aarch64/sve/acle/asm/prfb.c: Likewise.
+	* gcc.target/aarch64/sve/acle/asm/prfh.c: Likewise.
+	* gcc.target/aarch64/sve/acle/asm/prfw.c: Likewise.
+	* gcc.target/aarch64/sve/acle/asm/prfd.c: Likewise.
+	* gcc.target/aarch64/sve/loop_add_4.c: Expect RDVL to be used
+	to calculate the -17 and 17 factors.
+	* gcc.target/aarch64/sve/pcs/stack_clash_1.c: Likewise the 18 factor.
+---
+ gcc/config/aarch64/aarch64-protos.h         |   2 +
+ gcc/config/aarch64/aarch64.cc               | 191 ++++++++++++------
+ gcc/config/aarch64/aarch64.md               |   3 +
+ gcc/config/aarch64/constraints.md           |   6 +
+ .../gcc.target/aarch64/sve/acle/asm/cntb.c  |  71 +++++--
+ .../gcc.target/aarch64/sve/acle/asm/cntd.c  |  12 +-
+ .../gcc.target/aarch64/sve/acle/asm/cnth.c  |  20 +-
+ .../gcc.target/aarch64/sve/acle/asm/cntw.c  |  16 +-
+ .../gcc.target/aarch64/sve/acle/asm/prfb.c  |   6 +-
+ .../gcc.target/aarch64/sve/acle/asm/prfd.c  |   4 +-
+ .../gcc.target/aarch64/sve/acle/asm/prfh.c  |   4 +-
+ .../gcc.target/aarch64/sve/acle/asm/prfw.c  |   4 +-
+ .../gcc.target/aarch64/sve/loop_add_4.c     |   6 +-
+ .../aarch64/sve/pcs/stack_clash_1.c         |   3 +-
+ 14 files changed, 225 insertions(+), 123 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
+index 3ff1a0163..14a568140 100644
+--- a/gcc/config/aarch64/aarch64-protos.h
++++ b/gcc/config/aarch64/aarch64-protos.h
+@@ -802,6 +802,7 @@ bool aarch64_sve_mode_p (machine_mode);
+ HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int);
+ bool aarch64_sve_cnt_immediate_p (rtx);
+ bool aarch64_sve_scalar_inc_dec_immediate_p (rtx);
++bool aarch64_sve_rdvl_immediate_p (rtx);
+ bool aarch64_sve_addvl_addpl_immediate_p (rtx);
+ bool aarch64_sve_vector_inc_dec_immediate_p (rtx);
+ int aarch64_add_offset_temporaries (rtx);
+@@ -814,6 +815,7 @@ char *aarch64_output_sve_prefetch (const char *, rtx, const char *);
+ char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
+ char *aarch64_output_sve_cnt_pat_immediate (const char *, const char *, rtx *);
+ char *aarch64_output_sve_scalar_inc_dec (rtx);
++char *aarch64_output_sve_rdvl (rtx);
+ char *aarch64_output_sve_addvl_addpl (rtx);
+ char *aarch64_output_sve_vector_inc_dec (const char *, rtx);
+ char *aarch64_output_scalar_simd_mov_immediate (rtx, scalar_int_mode);
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index acb659f53..4194dfc70 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -5520,6 +5520,18 @@ aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
+   return -1;
+ }
+
++/* Return true if a single CNT[BHWD] instruction can multiply FACTOR
++   by the number of 128-bit quadwords in an SVE vector.  */
++
++static bool
++aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
++{
++  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
++  return (IN_RANGE (factor, 2, 16 * 16)
++          && (factor & 1) == 0
++          && factor <= 16 * (factor & -factor));
++}
++
+ /* Return true if we can move VALUE into a register using a single
+    CNT[BHWD] instruction.  */
+
+@@ -5527,11 +5539,7 @@ static bool
+ aarch64_sve_cnt_immediate_p (poly_int64 value)
+ {
+   HOST_WIDE_INT factor = value.coeffs[0];
+-  /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
+-  return (value.coeffs[1] == factor
+-          && IN_RANGE (factor, 2, 16 * 16)
+-          && (factor & 1) == 0
+-          && factor <= 16 * (factor & -factor));
++  return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
+ }
+
+ /* Likewise for rtx X.  */
+@@ -5647,6 +5655,50 @@ aarch64_output_sve_scalar_inc_dec (rtx offset)
+                                              -offset_value.coeffs[1], 0);
+ }
+
++/* Return true if a single RDVL instruction can multiply FACTOR by the
++   number of 128-bit quadwords in an SVE vector.  */
++
++static bool
++aarch64_sve_rdvl_factor_p (HOST_WIDE_INT factor)
++{
++  return (multiple_p (factor, 16)
++          && IN_RANGE (factor, -32 * 16, 31 * 16));
++}
++
++/* Return true if we can move VALUE into a register using a single
++   RDVL instruction.  */
++
++static bool
++aarch64_sve_rdvl_immediate_p (poly_int64 value)
++{
++  HOST_WIDE_INT factor = value.coeffs[0];
++  return value.coeffs[1] == factor && aarch64_sve_rdvl_factor_p (factor);
++}
++
++/* Likewise for rtx X.  */
++
++bool
++aarch64_sve_rdvl_immediate_p (rtx x)
++{
++  poly_int64 value;
++  return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
++}
++
++/* Return the asm string for moving RDVL immediate OFFSET into register
++   operand 0.  */
++
++char *
++aarch64_output_sve_rdvl (rtx offset)
++{
++  static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
++  poly_int64 offset_value = rtx_to_poly_int64 (offset);
++  gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
++
++  int factor = offset_value.coeffs[1];
++  snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
++  return buffer;
++}
++
+ /* Return true if we can add VALUE to a register using a single ADDVL
+    or ADDPL instruction.  */
+
+@@ -6227,13 +6279,13 @@ aarch64_offset_temporaries (bool add_p, poly_int64 offset)
+     count += 1;
+   else if (factor != 0)
+     {
+-      factor = abs (factor);
+-      if (factor > 16 * (factor & -factor))
+-        /* Need one register for the CNT result and one for the multiplication
+-           factor.  If necessary, the second temporary can be reused for the
+-           constant part of the offset.  */
++      factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
++      if (!IN_RANGE (factor, -32, 31))
++        /* Need one register for the CNT or RDVL result and one for the
++           multiplication factor.  If necessary, the second temporary
++           can be reused for the constant part of the offset.  */
+         return 2;
+-      /* Need one register for the CNT result (which might then
++      /* Need one register for the CNT or RDVL result (which might then
+          be shifted).  */
+       count += 1;
+     }
+@@ -6322,85 +6374,100 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+   /* Otherwise use a CNT-based sequence.  */
+   else if (factor != 0)
+     {
+-      /* Use a subtraction if we have a negative factor.  */
+-      rtx_code code = PLUS;
+-      if (factor < 0)
+-        {
+-          factor = -factor;
+-          code = MINUS;
+-        }
++      /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
++         with negative shifts indicating a shift right.  */
++      HOST_WIDE_INT low_bit = least_bit_hwi (factor);
++      HOST_WIDE_INT rel_factor = factor / low_bit;
++      int shift = exact_log2 (low_bit) - 4;
++      gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
++
++      /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
++         equal to CNTB * FACTOR / 16, with CODE being the [+-].
+
+-      /* Calculate CNTD * FACTOR / 2.  First try to fold the division
+-         into the multiplication.  */
++         We can avoid a multiplication if REL_FACTOR is in the range
++         of RDVL, although there are then various optimizations that
++         we can try on top.  */
++      rtx_code code = PLUS;
+       rtx val;
+-      int shift = 0;
+-      if (factor & 1)
+-        /* Use a right shift by 1.  */
+-        shift = -1;
+-      else
+-        factor /= 2;
+-      HOST_WIDE_INT low_bit = factor & -factor;
+-      if (factor <= 16 * low_bit)
++      if (IN_RANGE (rel_factor, -32, 31))
+         {
+-          if (factor > 16 * 8)
++          /* Try to use an unshifted CNT[BHWD] or RDVL.  */
++          if (aarch64_sve_cnt_factor_p (factor)
++              || aarch64_sve_rdvl_factor_p (factor))
++            {
++              val = gen_int_mode (poly_int64 (factor, factor), mode);
++              shift = 0;
++            }
++          /* Try to subtract an unshifted CNT[BHWD].  */
++          else if (aarch64_sve_cnt_factor_p (-factor))
+             {
+-              /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
+-                 the value with the minimum multiplier and shift it into
+-                 position.  */
+-              int extra_shift = exact_log2 (low_bit);
+-              shift += extra_shift;
+-              factor >>= extra_shift;
++              code = MINUS;
++              val = gen_int_mode (poly_int64 (-factor, -factor), mode);
++              shift = 0;
+             }
+-          val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
++          /* If subtraction is free, prefer to load a positive constant.
++             In the best case this will fit a shifted CNTB.  */
++          else if (src != const0_rtx && rel_factor < 0)
++            {
++              code = MINUS;
++              val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
++            }
++          /* Otherwise use a shifted RDVL or CNT[BHWD].  */
++          else
++            val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
+         }
+       else
+         {
+-          /* Base the factor on LOW_BIT if we can calculate LOW_BIT
+-             directly, since that should increase the chances of being
+-             able to use a shift and add sequence.  If LOW_BIT itself
+-             is out of range, just use CNTD.  */
+-          if (low_bit <= 16 * 8)
+-            factor /= low_bit;
++          /* If we can calculate CNTB << SHIFT directly, prefer to do that,
++             since it should increase the chances of being able to use
++             a shift and add sequence for the multiplication.
++             If CNTB << SHIFT is out of range, stick with the current
++             shift factor.  */
++          if (IN_RANGE (low_bit, 2, 16 * 16))
++            {
++              val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
++              shift = 0;
++            }
+           else
+-            low_bit = 1;
++            val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
+
+-          val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
+           val = aarch64_force_temporary (mode, temp1, val);
+
++          /* Prefer to multiply by a positive factor and subtract rather
++             than multiply by a negative factor and add, since positive
++             values are usually easier to move.  */
++          if (rel_factor < 0 && src != const0_rtx)
++            {
++              rel_factor = -rel_factor;
++              code = MINUS;
++            }
++
+           if (can_create_pseudo_p ())
+             {
+-              rtx coeff1 = gen_int_mode (factor, mode);
++              rtx coeff1 = gen_int_mode (rel_factor, mode);
+               val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
+             }
+           else
+             {
+-              /* Go back to using a negative multiplication factor if we have
+-                 no register from which to subtract.  */
+-              if (code == MINUS && src == const0_rtx)
+-                {
+-                  factor = -factor;
+-                  code = PLUS;
+-                }
+-              rtx coeff1 = gen_int_mode (factor, mode);
++              rtx coeff1 = gen_int_mode (rel_factor, mode);
+               coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
+               val = gen_rtx_MULT (mode, val, coeff1);
+             }
+         }
+
++      /* Multiply by 2 ** SHIFT.  */
+       if (shift > 0)
+         {
+-          /* Multiply by 1 << SHIFT.  */
+           val = aarch64_force_temporary (mode, temp1, val);
+           val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
+         }
+-      else if (shift == -1)
++      else if (shift < 0)
+         {
+-          /* Divide by 2.  */
+           val = aarch64_force_temporary (mode, temp1, val);
+-          val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
++          val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
+         }
+
+-      /* Calculate SRC +/- CNTD * FACTOR / 2.  */
++      /* Add the result to SRC or subtract the result from SRC.  */
+       if (src != const0_rtx)
+         {
+           val = aarch64_force_temporary (mode, temp1, val);
+@@ -7045,7 +7112,9 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+               aarch64_report_sve_required ();
+               return;
+             }
+-          if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
++          if (base == const0_rtx
++              && (aarch64_sve_cnt_immediate_p (offset)
++                  || aarch64_sve_rdvl_immediate_p (offset)))
+             emit_insn (gen_rtx_SET (dest, imm));
+           else
+             {
+@@ -21751,7 +21820,9 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
+   if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
+     return true;
+
+-  if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
++  if (TARGET_SVE
++      && (aarch64_sve_cnt_immediate_p (x)
++          || aarch64_sve_rdvl_immediate_p (x)))
+     return true;
+
+   return aarch64_classify_symbolic_expression (x)
+diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
+index 5d02da42f..c0977a3da 100644
+--- a/gcc/config/aarch64/aarch64.md
++++ b/gcc/config/aarch64/aarch64.md
+@@ -1207,6 +1207,7 @@
+     [w, D<hq>; neon_move , simd ] << aarch64_output_scalar_simd_mov_immediate (operands[1], <MODE>mode);
+     /* The "mov_imm" type for CNT is just a placeholder.  */
+     [r, Usv  ; mov_imm   , sve  ] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
++    [r, Usr  ; mov_imm   , sve  ] << aarch64_output_sve_rdvl (operands[1]);
+     [r, m    ; load_4    , *    ] ldr<size>\t%w0, %1
+     [w, m    ; load_4    , *    ] ldr\t%<size>0, %1
+     [m, r Z  ; store_4   , *    ] str<size>\\t%w1, %0
+@@ -1265,6 +1266,7 @@
+     [r , n  ; mov_imm  , *   ,16] #
+     /* The "mov_imm" type for CNT is just a placeholder.  */
+     [r , Usv; mov_imm  , sve , 4] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
++    [r , Usr; mov_imm  , sve, 4] << aarch64_output_sve_rdvl (operands[1]);
+     [r , m  ; load_4   , *   , 4] ldr\t%w0, %1
+     [w , m  ; load_4   , fp  , 4] ldr\t%s0, %1
+     [m , r Z; store_4  , *   , 4] str\t%w1, %0
+@@ -1299,6 +1301,7 @@
+     [r, n  ; mov_imm  , *   ,16] #
+     /* The "mov_imm" type for CNT is just a placeholder.  */
+     [r, Usv; mov_imm  , sve , 4] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
++    [r, Usr; mov_imm  , sve, 4] << aarch64_output_sve_rdvl (operands[1]);
+     [r, m  ; load_8   , *   , 4] ldr\t%x0, %1
+     [w, m  ; load_8   , fp  , 4] ldr\t%d0, %1
+     [m, r Z; store_8  , *   , 4] str\t%x1, %0
+diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
+index 750a42fb1..212a73416 100644
+--- a/gcc/config/aarch64/constraints.md
++++ b/gcc/config/aarch64/constraints.md
+@@ -214,6 +214,12 @@
+   (and (match_code "const_int")
+        (match_test "aarch64_high_bits_all_ones_p (ival)")))
+
++(define_constraint "Usr"
++  "@internal
++   A constraint that matches a value produced by RDVL."
++  (and (match_code "const_poly_int")
++       (match_test "aarch64_sve_rdvl_immediate_p (op)")))
++
+ (define_constraint "Usv"
+   "@internal
+    A constraint that matches a VG-based constant that can be loaded by
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
+index 8b8fe8e4f..a22d8a28d 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntb.c
+@@ -51,19 +51,24 @@ PROTO (cntb_15, uint64_t, ()) { return svcntb () * 15; }
+ */
+ PROTO (cntb_16, uint64_t, ()) { return svcntb () * 16; }
+
+-/* Other sequences would be OK.  */
+ /*
+ ** cntb_17:
+-**	cntb	x0, all, mul #16
+-**	incb	x0
++**	rdvl	x0, #17
+ **	ret
+ */
+ PROTO (cntb_17, uint64_t, ()) { return svcntb () * 17; }
+
++/*
++** cntb_31:
++**	rdvl	x0, #31
++**	ret
++*/
++PROTO (cntb_31, uint64_t, ()) { return svcntb () * 31; }
++
+ /*
+ ** cntb_32:
+-**	cntd	(x[0-9]+)
+-**	lsl	x0, \1, 8
++**	cntb	(x[0-9]+)
++**	lsl	x0, \1, 5
+ **	ret
+ */
+ PROTO (cntb_32, uint64_t, ()) { return svcntb () * 32; }
+@@ -80,16 +85,16 @@ PROTO (cntb_33, uint64_t, ()) { return svcntb () * 33; }
+
+ /*
+ ** cntb_64:
+-**	cntd	(x[0-9]+)
+-**	lsl	x0, \1, 9
++**	cntb	(x[0-9]+)
++**	lsl	x0, \1, 6
+ **	ret
+ */
+ PROTO (cntb_64, uint64_t, ()) { return svcntb () * 64; }
+
+ /*
+ ** cntb_128:
+-**	cntd	(x[0-9]+)
+-**	lsl	x0, \1, 10
++**	cntb	(x[0-9]+)
++**	lsl	x0, \1, 7
+ **	ret
+ */
+ PROTO (cntb_128, uint64_t, ()) { return svcntb () * 128; }
+@@ -106,46 +111,70 @@ PROTO (cntb_129, uint64_t, ()) { return svcntb () * 129; }
+
+ /*
+ ** cntb_m1:
+-**	cntb	(x[0-9]+)
+-**	neg	x0, \1
++**	rdvl	x0, #-1
+ **	ret
+ */
+ PROTO (cntb_m1, uint64_t, ()) { return -svcntb (); }
+
+ /*
+ ** cntb_m13:
+-**	cntb	(x[0-9]+), all, mul #13
+-**	neg	x0, \1
++**	rdvl	x0, #-13
+ **	ret
+ */
+ PROTO (cntb_m13, uint64_t, ()) { return -svcntb () * 13; }
+
+ /*
+ ** cntb_m15:
+-**	cntb	(x[0-9]+), all, mul #15
+-**	neg	x0, \1
++**	rdvl	x0, #-15
+ **	ret
+ */
+ PROTO (cntb_m15, uint64_t, ()) { return -svcntb () * 15; }
+
+ /*
+ ** cntb_m16:
+-**	cntb	(x[0-9]+), all, mul #16
+-**	neg	x0, \1
++**	rdvl	x0, #-16
+ **	ret
+ */
+ PROTO (cntb_m16, uint64_t, ()) { return -svcntb () * 16; }
+
+-/* Other sequences would be OK.  */
+ /*
+ ** cntb_m17:
+-**	cntb	x0, all, mul #16
+-**	incb	x0
+-**	neg	x0, x0
++**	rdvl	x0, #-17
+ **	ret
+ */
+ PROTO (cntb_m17, uint64_t, ()) { return -svcntb () * 17; }
+
++/*
++** cntb_m32:
++**	rdvl	x0, #-32
++**	ret
++*/
++PROTO (cntb_m32, uint64_t, ()) { return -svcntb () * 32; }
++
++/*
++** cntb_m33:
++**	rdvl	x0, #-32
++**	decb	x0
++**	ret
++*/
++PROTO (cntb_m33, uint64_t, ()) { return -svcntb () * 33; }
++
++/*
++** cntb_m34:
++**	rdvl	(x[0-9]+), #-17
++**	lsl	x0, \1, #?1
++**	ret
++*/
++PROTO (cntb_m34, uint64_t, ()) { return -svcntb () * 34; }
++
++/*
++** cntb_m64:
++**	rdvl	(x[0-9]+), #-1
++**	lsl	x0, \1, #?6
++**	ret
++*/
++PROTO (cntb_m64, uint64_t, ()) { return -svcntb () * 64; }
++
+ /*
+ ** incb_1:
+ **	incb	x0
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
+index 0d0ed4849..090a643b4 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntd.c
+@@ -54,8 +54,8 @@ PROTO (cntd_16, uint64_t, ()) { return svcntd () * 16; }
+ /* Other sequences would be OK.  */
+ /*
+ ** cntd_17:
+-**	cntb	x0, all, mul #2
+-**	incd	x0
++**	rdvl	(x[0-9]+), #17
++**	asr	x0, \1, 3
+ **	ret
+ */
+ PROTO (cntd_17, uint64_t, ()) { return svcntd () * 17; }
+@@ -107,8 +107,7 @@ PROTO (cntd_m15, uint64_t, ()) { return -svcntd () * 15; }
+
+ /*
+ ** cntd_m16:
+-**	cntb	(x[0-9]+), all, mul #2
+-**	neg	x0, \1
++**	rdvl	x0, #-2
+ **	ret
+ */
+ PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; }
+@@ -116,9 +115,8 @@ PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; }
+
+ /* Other sequences would be OK.  */
+ /*
+ ** cntd_m17:
+-**	cntb	x0, all, mul #2
+-**	incd	x0
+-**	neg	x0, x0
++**	rdvl	(x[0-9]+), #-17
++**	asr	x0, \1, 3
+ **	ret
+ */
+ PROTO (cntd_m17, uint64_t, ()) { return -svcntd () * 17; }
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
+index c29930f15..1a4e7dc0e 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cnth.c
+@@ -54,8 +54,8 @@ PROTO (cnth_16, uint64_t, ()) { return svcnth () * 16; }
+ /* Other sequences would be OK.  */
+ /*
+ ** cnth_17:
+-**	cntb	x0, all, mul #8
+-**	inch	x0
++**	rdvl	(x[0-9]+), #17
++**	asr	x0, \1, 1
+ **	ret
+ */
+ PROTO (cnth_17, uint64_t, ()) { return svcnth () * 17; }
+@@ -69,16 +69,16 @@ PROTO (cnth_32, uint64_t, ()) { return svcnth () * 32; }
+
+ /*
+ ** cnth_64:
+-**	cntd	(x[0-9]+)
+-**	lsl	x0, \1, 8
++**	cntb	(x[0-9]+)
++**	lsl	x0, \1, 5
+ **	ret
+ */
+ PROTO (cnth_64, uint64_t, ()) { return svcnth () * 64; }
+
+ /*
+ ** cnth_128:
+-**	cntd	(x[0-9]+)
+-**	lsl	x0, \1, 9
++**	cntb	(x[0-9]+)
++**	lsl	x0, \1, 6
+ **	ret
+ */
+ PROTO (cnth_128, uint64_t, ()) { return svcnth () * 128; }
+@@ -109,8 +109,7 @@ PROTO (cnth_m15, uint64_t, ()) { return -svcnth () * 15; }
+
+ /*
+ ** cnth_m16:
+-**	cntb	(x[0-9]+), all, mul #8
+-**	neg	x0, \1
++**	rdvl	x0, #-8
+ **	ret
+ */
+ PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; }
+@@ -118,9 +117,8 @@ PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; }
+ /* Other sequences would be OK.  */
+ /*
+ ** cnth_m17:
+-**	cntb	x0, all, mul #8
+-**	inch	x0
+-**	neg	x0, x0
++**	rdvl	(x[0-9]+), #-17
++**	asr	x0, \1, 1
+ **	ret
+ */
+ PROTO (cnth_m17, uint64_t, ()) { return -svcnth () * 17; }
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
+index e26cc67a4..9d1697690 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/cntw.c
+@@ -54,8 +54,8 @@ PROTO (cntw_16, uint64_t, ()) { return svcntw () * 16; }
+ /* Other sequences would be OK.  */
+ /*
+ ** cntw_17:
+-**	cntb	x0, all, mul #4
+-**	incw	x0
++**	rdvl	(x[0-9]+), #17
++**	asr	x0, \1, 2
+ **	ret
+ */
+ PROTO (cntw_17, uint64_t, ()) { return svcntw () * 17; }
+@@ -76,8 +76,8 @@ PROTO (cntw_64, uint64_t, ()) { return svcntw () * 64; }
+
+ /*
+ ** cntw_128:
+-**	cntd	(x[0-9]+)
+-**	lsl	x0, \1, 8
++**	cntb	(x[0-9]+)
++**	lsl	x0, \1, 5
+ **	ret
+ */
+ PROTO (cntw_128, uint64_t, ()) { return svcntw () * 128; }
+@@ -108,8 +108,7 @@ PROTO (cntw_m15, uint64_t, ()) { return -svcntw () * 15; }
+
+ /*
+ ** cntw_m16:
+-**	cntb	(x[0-9]+), all, mul #4
+-**	neg	x0, \1
++**	rdvl	(x[0-9]+), #-4
+ **	ret
+ */
+ PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; }
+@@ -117,9 +116,8 @@ PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; }
+
+ /* Other sequences would be OK.  */
+ /*
+ ** cntw_m17:
+-**	cntb	x0, all, mul #4
+-**	incw	x0
+-**	neg	x0, x0
++**	rdvl	(x[0-9]+), #-17
++**	asr	x0, \1, 2
+ **	ret
+ */
+ PROTO (cntw_m17, uint64_t, ()) { return -svcntw () * 17; }
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
+index c90730a03..94cd3a066 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfb.c
+@@ -218,8 +218,8 @@ TEST_PREFETCH (prfb_vnum_31, uint16_t,
+
+ /*
+ ** prfb_vnum_32:
+-**	cntd	(x[0-9]+)
+-**	lsl	(x[0-9]+), \1, #?8
++**	cntb	(x[0-9]+)
++**	lsl	(x[0-9]+), \1, #?5
+ **	add	(x[0-9]+), (\2, x0|x0, \2)
+ **	prfb	pldl1keep, p0, \[\3\]
+ **	ret
+@@ -240,7 +240,7 @@ TEST_PREFETCH (prfb_vnum_m32, uint16_t,
+ /*
+ ** prfb_vnum_m33:
+ **	...
+-**	prfb	pldl1keep, p0, \[x[0-9]+\]
++**	prfb	pldl1keep, p0, \[x[0-9]+(, x[0-9]+)?\]
+ **	ret
+ */
+ TEST_PREFETCH (prfb_vnum_m33, uint16_t,
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
+index 869ef3d3e..b7a116cf0 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfd.c
+@@ -218,8 +218,8 @@ TEST_PREFETCH (prfd_vnum_31, uint16_t,
+
+ /*
+ ** prfd_vnum_32:
+-**	cntd	(x[0-9]+)
+-**	lsl	(x[0-9]+), \1, #?8
++**	cntb	(x[0-9]+)
++**	lsl	(x[0-9]+), \1, #?5
+ **	add	(x[0-9]+), (\2, x0|x0, \2)
+ **	prfd	pldl1keep, p0, \[\3\]
+ **	ret
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
+index 45a735eae..9d3df6bd3 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfh.c
+@@ -218,8 +218,8 @@ TEST_PREFETCH (prfh_vnum_31, uint16_t,
+
+ /*
+ ** prfh_vnum_32:
+-**	cntd	(x[0-9]+)
+-**	lsl	(x[0-9]+), \1, #?8
++**	cntb	(x[0-9]+)
++**	lsl	(x[0-9]+), \1, #?5
+ **	add	(x[0-9]+), (\2, x0|x0, \2)
+ **	prfh	pldl1keep, p0, \[\3\]
+ **	ret
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
+index 444187f45..6962abab6 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/prfw.c
+@@ -218,8 +218,8 @@ TEST_PREFETCH (prfw_vnum_31, uint16_t,
+
+ /*
+ ** prfw_vnum_32:
+-**	cntd	(x[0-9]+)
+-**	lsl	(x[0-9]+), \1, #?8
++**	cntb	(x[0-9]+)
++**	lsl	(x[0-9]+), \1, #?5
+ **	add	(x[0-9]+), (\2, x0|x0, \2)
+ **	prfw	pldl1keep, p0, \[\3\]
+ **	ret
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
+index 9ead9c21b..7f02497e8 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/loop_add_4.c
+@@ -68,8 +68,7 @@ TEST_ALL (LOOP)
+ /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, w[0-9]+, w[0-9]+\n} 3 } } */
+ /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
+ /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
+-/* 2 for the calculations of -17 and 17.  */
+-/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 10 } } */
++/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 8 } } */
+
+ /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #16\n} 1 } } */
+ /* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #15\n} 1 } } */
+@@ -86,8 +85,7 @@ TEST_ALL (LOOP)
+ /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, x[0-9]+, x[0-9]+\n} 3 } } */
+ /* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
+ /* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
+-/* 2 for the calculations of -17 and 17.  */
+-/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 10 } } */
++/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 8 } } */
+
+ /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #16\n} 1 } } */
+ /* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #15\n} 1 } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
+index 110947a6c..5de34fc61 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_1.c
+@@ -6,8 +6,7 @@
+
+ /*
+ ** test_1:
+-**	cntd	x12, all, mul #9
+-**	lsl	x12, x12, #?4
++**	rdvl	x12, #18
+ **	mov	x11, sp
+ **	...
+ **	sub	sp, sp, x12
+--
+2.33.0
+
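The immediate ranges behind the patch's two new predicates can be checked outside the compiler. The following standalone C sketch is not part of the patch: cnt_factor_p, rdvl_factor_p and main are hypothetical stand-ins that mirror the tests in aarch64_sve_cnt_factor_p and aarch64_sve_rdvl_factor_p, where FACTOR is the multiple of the number of 128-bit quadwords in an SVE vector that a single instruction must produce.

#include <stdbool.h>
#include <stdio.h>

/* Mirror of aarch64_sve_cnt_factor_p: a single CNT[BHWD] ..., MUL #1..16
   covers factors of the form [1, 16] * {2, 4, 8, 16}.  */
static bool cnt_factor_p (long factor)
{
  return (factor >= 2 && factor <= 16 * 16
          && (factor & 1) == 0
          && factor <= 16 * (factor & -factor));
}

/* Mirror of aarch64_sve_rdvl_factor_p: RDVL takes a vector-length multiple
   in [-32, 31], so the factor must be a multiple of 16 in [-32*16, 31*16].  */
static bool rdvl_factor_p (long factor)
{
  return factor % 16 == 0 && factor >= -32 * 16 && factor <= 31 * 16;
}

int main (void)
{
  /* 17 * 16 (i.e. svcntb () * 17) is the motivating case from the tests:
     too big for a single CNTB ..., MUL #imm, but within RDVL's range.  */
  long factors[] = { 2, 16, 16 * 16, 17 * 16, -17 * 16, 32 * 16 };
  for (unsigned i = 0; i < sizeof factors / sizeof factors[0]; i++)
    printf ("factor %5ld: cnt=%d rdvl=%d\n", factors[i],
            cnt_factor_p (factors[i]), rdvl_factor_p (factors[i]));
  return 0;
}

Run under those assumptions, factor 272 (svcntb () * 17) is rejected by the CNT test but accepted by the RDVL test, matching the new "rdvl x0, #17" expectations in cntb.c, while 512 fails both and still needs the cntb-plus-lsl sequence.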