diff --git a/0203-Backport-SME-aarch64-Use-vecs-to-store-register-save.patch b/0203-Backport-SME-aarch64-Use-vecs-to-store-register-save.patch
new file mode 100644
index 0000000..b9e9c93
--- /dev/null
+++ b/0203-Backport-SME-aarch64-Use-vecs-to-store-register-save.patch
@@ -0,0 +1,709 @@
+From 554c83414c10909c39e0ad30026ffa4821dd9698 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 17 Oct 2023 23:46:33 +0100
+Subject: [PATCH 104/157] [Backport][SME] aarch64: Use vecs to store register
+ save order
+
+Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=575858508090b18dcbc176db285c9f55227ca4c0
+
+aarch64_save/restore_callee_saves looped over registers in register
+number order. This in turn meant that we could only use LDP and STP
+for registers that were consecutive both number-wise and
+offset-wise (after unsaved registers are excluded).
+
+This patch instead builds lists of the registers that we've decided to
+save, in offset order. We can then form LDP/STP pairs regardless of
+register number order, which in turn means that we can put the LR save
+slot first without losing LDP/STP opportunities.
+
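+For illustration only (an editor's sketch, not part of the patch): the
+pairing idea can be expressed roughly as below, assuming a hypothetical
+list of (regno, offset) entries already sorted by increasing offset.
+The printed STR/STP mnemonics are purely illustrative; the real code
+also handles FPRs, SVE registers and CFI notes.
+
+    // Sketch: pair consecutive entries whose slots are exactly one
+    // save-size apart, regardless of register-number order.
+    #include <cstddef>
+    #include <cstdio>
+    #include <vector>
+
+    struct saved_reg { unsigned regno; long offset; };
+
+    static void
+    emit_saves (const std::vector<saved_reg> &regs, long save_size)
+    {
+      for (std::size_t i = 0; i < regs.size (); ++i)
+        {
+          if (i + 1 < regs.size ()
+              && regs[i + 1].offset - regs[i].offset == save_size)
+            {
+              // Two adjacent slots: emit a single store-pair.
+              std::printf ("stp x%u, x%u, [sp, %ld]\n", regs[i].regno,
+                           regs[i + 1].regno, regs[i].offset);
+              ++i;  // The second register of the pair is consumed.
+            }
+          else
+            std::printf ("str x%u, [sp, %ld]\n", regs[i].regno,
+                         regs[i].offset);
+        }
+    }
+
+    int main ()
+    {
+      // LR (x30) saved first, then x29, x24, x25: offset order rather
+      // than register-number order still yields store-pairs.
+      emit_saves ({{30, 0}, {29, 8}, {24, 16}, {25, 24}}, 8);
+    }
+
+This mirrors how the patched aarch64_save/restore_callee_saves pair
+regs[i] with regs[i + 1] when their frame offsets differ by exactly the
+save mode's size.
+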
+gcc/
+ * config/aarch64/aarch64.h (aarch64_frame): Add vectors that
+ store the lists of saved GPRs, FPRs and predicate registers.
+ * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize
+ the lists of saved registers. Use them to choose push candidates.
+ Invalidate pop candidates if we're not going to do a pop.
+ (aarch64_next_callee_save): Delete.
+ (aarch64_save_callee_saves): Take a list of registers,
+ rather than a range. Make !skip_wb select only write-back
+ candidates.
+ (aarch64_expand_prologue): Update calls accordingly.
+ (aarch64_restore_callee_saves): Take a list of registers,
+ rather than a range. Always skip pop candidates. Also skip
+ LR if shadow call stacks are enabled.
+ (aarch64_expand_epilogue): Update calls accordingly.
+
+gcc/testsuite/
+ * gcc.target/aarch64/sve/pcs/stack_clash_2.c: Expect restores
+ to happen in offset order.
+ * gcc.target/aarch64/sve/pcs/stack_clash_2_128.c: Likewise.
+ * gcc.target/aarch64/sve/pcs/stack_clash_2_256.c: Likewise.
+ * gcc.target/aarch64/sve/pcs/stack_clash_2_512.c: Likewise.
+ * gcc.target/aarch64/sve/pcs/stack_clash_2_1024.c: Likewise.
+ * gcc.target/aarch64/sve/pcs/stack_clash_2_2048.c: Likewise.
+---
+ gcc/config/aarch64/aarch64.cc | 203 +++++++++---------
+ gcc/config/aarch64/aarch64.h | 9 +-
+ .../aarch64/sve/pcs/stack_clash_2.c | 6 +-
+ .../aarch64/sve/pcs/stack_clash_2_1024.c | 6 +-
+ .../aarch64/sve/pcs/stack_clash_2_128.c | 6 +-
+ .../aarch64/sve/pcs/stack_clash_2_2048.c | 6 +-
+ .../aarch64/sve/pcs/stack_clash_2_256.c | 6 +-
+ .../aarch64/sve/pcs/stack_clash_2_512.c | 6 +-
+ 8 files changed, 128 insertions(+), 120 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 8d4dd2891..e10c9d763 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8753,13 +8753,17 @@ aarch64_save_regs_above_locals_p ()
+ static void
+ aarch64_layout_frame (void)
+ {
+- int regno, last_fp_reg = INVALID_REGNUM;
++ unsigned regno, last_fp_reg = INVALID_REGNUM;
+ machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
+ poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
+ bool frame_related_fp_reg_p = false;
+ aarch64_frame &frame = cfun->machine->frame;
+ poly_int64 top_of_locals = -1;
+
++ vec_safe_truncate (frame.saved_gprs, 0);
++ vec_safe_truncate (frame.saved_fprs, 0);
++ vec_safe_truncate (frame.saved_prs, 0);
++
+ frame.emit_frame_chain = aarch64_needs_frame_chain ();
+
+ /* Adjust the outgoing arguments size if required. Keep it in sync with what
+@@ -8844,6 +8848,7 @@ aarch64_layout_frame (void)
+ for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
+ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+ {
++ vec_safe_push (frame.saved_prs, regno);
+ if (frame.sve_save_and_probe == INVALID_REGNUM)
+ frame.sve_save_and_probe = regno;
+ frame.reg_offset[regno] = offset;
+@@ -8865,7 +8870,7 @@ aarch64_layout_frame (void)
+ If we don't have any vector registers to save, and we know how
+ big the predicate save area is, we can just round it up to the
+ next 16-byte boundary. */
+- if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
++ if (last_fp_reg == INVALID_REGNUM && offset.is_constant ())
+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+ else
+ {
+@@ -8879,10 +8884,11 @@ aarch64_layout_frame (void)
+ }
+
+ /* If we need to save any SVE vector registers, add them next. */
+- if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
++ if (last_fp_reg != INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
+ for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+ {
++ vec_safe_push (frame.saved_fprs, regno);
+ if (frame.sve_save_and_probe == INVALID_REGNUM)
+ frame.sve_save_and_probe = regno;
+ frame.reg_offset[regno] = offset;
+@@ -8903,13 +8909,8 @@ aarch64_layout_frame (void)
+
+ auto allocate_gpr_slot = [&](unsigned int regno)
+ {
+- if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
+- frame.hard_fp_save_and_probe = regno;
++ vec_safe_push (frame.saved_gprs, regno);
+ frame.reg_offset[regno] = offset;
+- if (frame.wb_push_candidate1 == INVALID_REGNUM)
+- frame.wb_push_candidate1 = regno;
+- else if (frame.wb_push_candidate2 == INVALID_REGNUM)
+- frame.wb_push_candidate2 = regno;
+ offset += UNITS_PER_WORD;
+ };
+
+@@ -8938,8 +8939,7 @@ aarch64_layout_frame (void)
+ for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+ {
+- if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
+- frame.hard_fp_save_and_probe = regno;
++ vec_safe_push (frame.saved_fprs, regno);
+ /* If there is an alignment gap between integer and fp callee-saves,
+ allocate the last fp register to it if possible. */
+ if (regno == last_fp_reg
+@@ -8952,21 +8952,25 @@ aarch64_layout_frame (void)
+ }
+
+ frame.reg_offset[regno] = offset;
+- if (frame.wb_push_candidate1 == INVALID_REGNUM)
+- frame.wb_push_candidate1 = regno;
+- else if (frame.wb_push_candidate2 == INVALID_REGNUM
+- && frame.wb_push_candidate1 >= V0_REGNUM)
+- frame.wb_push_candidate2 = regno;
+ offset += vector_save_size;
+ }
+
+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+-
+ auto saved_regs_size = offset - frame.bytes_below_saved_regs;
+- gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size)
+- || (frame.hard_fp_save_and_probe != INVALID_REGNUM
+- && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
+- frame.bytes_below_hard_fp)));
++
++ array_slice<unsigned int> push_regs = (!vec_safe_is_empty (frame.saved_gprs)
++ ? frame.saved_gprs
++ : frame.saved_fprs);
++ if (!push_regs.empty ()
++ && known_eq (frame.reg_offset[push_regs[0]], frame.bytes_below_hard_fp))
++ {
++ frame.hard_fp_save_and_probe = push_regs[0];
++ frame.wb_push_candidate1 = push_regs[0];
++ if (push_regs.size () > 1)
++ frame.wb_push_candidate2 = push_regs[1];
++ }
++ else
++ gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size));
+
+ /* With stack-clash, a register must be saved in non-leaf functions.
+ The saving of the bottommost register counts as an implicit probe,
+@@ -9130,12 +9134,14 @@ aarch64_layout_frame (void)
+ + frame.sve_callee_adjust
+ + frame.final_adjust, frame.frame_size));
+
+- if (!frame.emit_frame_chain && frame.callee_adjust == 0)
++ if (frame.callee_adjust == 0)
+ {
+- /* We've decided not to associate any register saves with the initial
+- stack allocation. */
+- frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM;
+- frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM;
++ /* We've decided not to do a "real" push and pop. However,
++ setting up the frame chain is treated as being essentially
++ a multi-instruction push. */
++ frame.wb_pop_candidate1 = frame.wb_pop_candidate2 = INVALID_REGNUM;
++ if (!frame.emit_frame_chain)
++ frame.wb_push_candidate1 = frame.wb_push_candidate2 = INVALID_REGNUM;
+ }
+
+ frame.laid_out = true;
+@@ -9150,17 +9156,6 @@ aarch64_register_saved_on_entry (int regno)
+ return known_ge (cfun->machine->frame.reg_offset[regno], 0);
+ }
+
+-/* Return the next register up from REGNO up to LIMIT for the callee
+- to save. */
+-
+-static unsigned
+-aarch64_next_callee_save (unsigned regno, unsigned limit)
+-{
+- while (regno <= limit && !aarch64_register_saved_on_entry (regno))
+- regno ++;
+- return regno;
+-}
+-
+ /* Push the register number REGNO of mode MODE to the stack with write-back
+ adjusting the stack by ADJUSTMENT. */
+
+@@ -9424,41 +9419,46 @@ aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
+ add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
+ }
+
+-/* Emit code to save the callee-saved registers from register number START
+- to LIMIT to the stack. The stack pointer is currently BYTES_BELOW_SP
+- bytes above the bottom of the static frame. Skip any write-back
+- candidates if SKIP_WB is true. HARD_FP_VALID_P is true if the hard
+- frame pointer has been set up. */
++/* Emit code to save the callee-saved registers in REGS. Skip any
++ write-back candidates if SKIP_WB is true, otherwise consider only
++ write-back candidates.
++
++ The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
++ of the static frame. HARD_FP_VALID_P is true if the hard frame pointer
++ has been set up. */
+
+ static void
+ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
+- unsigned start, unsigned limit, bool skip_wb,
++ array_slice<unsigned int> regs, bool skip_wb,
+ bool hard_fp_valid_p)
+ {
+ aarch64_frame &frame = cfun->machine->frame;
+ rtx_insn *insn;
+- unsigned regno;
+- unsigned regno2;
+ rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
+
+- for (regno = aarch64_next_callee_save (start, limit);
+- regno <= limit;
+- regno = aarch64_next_callee_save (regno + 1, limit))
++ auto skip_save_p = [&](unsigned int regno)
++ {
++ if (cfun->machine->reg_is_wrapped_separately[regno])
++ return true;
++
++ if (skip_wb == (regno == frame.wb_push_candidate1
++ || regno == frame.wb_push_candidate2))
++ return true;
++
++ return false;
++ };
++
++ for (unsigned int i = 0; i < regs.size (); ++i)
+ {
+- rtx reg, mem;
++ unsigned int regno = regs[i];
+ poly_int64 offset;
+ bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
+
+- if (skip_wb
+- && (regno == frame.wb_push_candidate1
+- || regno == frame.wb_push_candidate2))
+- continue;
+-
+- if (cfun->machine->reg_is_wrapped_separately[regno])
++ if (skip_save_p (regno))
+ continue;
+
+ machine_mode mode = aarch64_reg_save_mode (regno);
+- reg = gen_rtx_REG (mode, regno);
++ rtx reg = gen_rtx_REG (mode, regno);
+ offset = frame.reg_offset[regno] - bytes_below_sp;
+ rtx base_rtx = stack_pointer_rtx;
+ poly_int64 sp_offset = offset;
+@@ -9485,12 +9485,13 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
+ }
+ offset -= fp_offset;
+ }
+- mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
++ rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
+ bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
+
++ unsigned int regno2;
+ if (!aarch64_sve_mode_p (mode)
+- && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
+- && !cfun->machine->reg_is_wrapped_separately[regno2]
++ && i + 1 < regs.size ()
++ && (regno2 = regs[i + 1], !skip_save_p (regno2))
+ && known_eq (GET_MODE_SIZE (mode),
+ frame.reg_offset[regno2] - frame.reg_offset[regno]))
+ {
+@@ -9516,6 +9517,7 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
+ }
+
+ regno = regno2;
++ ++i;
+ }
+ else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ {
+@@ -9533,49 +9535,57 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
+ }
+ }
+
+-/* Emit code to restore the callee registers from register number START
+- up to and including LIMIT. The stack pointer is currently BYTES_BELOW_SP
+- bytes above the bottom of the static frame. Skip any write-back
+- candidates if SKIP_WB is true. Write the appropriate REG_CFA_RESTORE
+- notes into CFI_OPS. */
++/* Emit code to restore the callee registers in REGS, ignoring pop candidates
++ and any other registers that are handled separately. Write the appropriate
++ REG_CFA_RESTORE notes into CFI_OPS.
++
++ The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
++ of the static frame. */
+
+ static void
+-aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
+- unsigned limit, bool skip_wb, rtx *cfi_ops)
++aarch64_restore_callee_saves (poly_int64 bytes_below_sp,
++ array_slice<unsigned int> regs, rtx *cfi_ops)
+ {
+ aarch64_frame &frame = cfun->machine->frame;
+- unsigned regno;
+- unsigned regno2;
+ poly_int64 offset;
+ rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
+
+- for (regno = aarch64_next_callee_save (start, limit);
+- regno <= limit;
+- regno = aarch64_next_callee_save (regno + 1, limit))
++ auto skip_restore_p = [&](unsigned int regno)
+ {
+- bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
+ if (cfun->machine->reg_is_wrapped_separately[regno])
+- continue;
++ return true;
++
++ if (regno == frame.wb_pop_candidate1
++ || regno == frame.wb_pop_candidate2)
++ return true;
+
+- rtx reg, mem;
++ /* The shadow call stack code restores LR separately. */
++ if (frame.is_scs_enabled && regno == LR_REGNUM)
++ return true;
+
+- if (skip_wb
+- && (regno == frame.wb_pop_candidate1
+- || regno == frame.wb_pop_candidate2))
++ return false;
++ };
++
++ for (unsigned int i = 0; i < regs.size (); ++i)
++ {
++ unsigned int regno = regs[i];
++ bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
++ if (skip_restore_p (regno))
+ continue;
+
+ machine_mode mode = aarch64_reg_save_mode (regno);
+- reg = gen_rtx_REG (mode, regno);
++ rtx reg = gen_rtx_REG (mode, regno);
+ offset = frame.reg_offset[regno] - bytes_below_sp;
+ rtx base_rtx = stack_pointer_rtx;
+ if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
+ offset, ptrue);
+- mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
++ rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
+
++ unsigned int regno2;
+ if (!aarch64_sve_mode_p (mode)
+- && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
+- && !cfun->machine->reg_is_wrapped_separately[regno2]
++ && i + 1 < regs.size ()
++ && (regno2 = regs[i + 1], !skip_restore_p (regno2))
+ && known_eq (GET_MODE_SIZE (mode),
+ frame.reg_offset[regno2] - frame.reg_offset[regno]))
+ {
+@@ -9588,6 +9598,7 @@ aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
+
+ *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
+ regno = regno2;
++ ++i;
+ }
+ else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
+@@ -10409,13 +10420,10 @@ aarch64_expand_prologue (void)
+ - frame.bytes_above_hard_fp);
+ gcc_assert (known_ge (chain_offset, 0));
+
++ gcc_assert (reg1 == R29_REGNUM && reg2 == R30_REGNUM);
+ if (callee_adjust == 0)
+- {
+- reg1 = R29_REGNUM;
+- reg2 = R30_REGNUM;
+- aarch64_save_callee_saves (bytes_below_sp, reg1, reg2,
+- false, false);
+- }
++ aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs,
++ false, false);
+ else
+ gcc_assert (known_eq (chain_offset, 0));
+ aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
+@@ -10453,8 +10461,7 @@ aarch64_expand_prologue (void)
+ aarch64_emit_stack_tie (hard_frame_pointer_rtx);
+ }
+
+- aarch64_save_callee_saves (bytes_below_sp, R0_REGNUM, R30_REGNUM,
+- callee_adjust != 0 || emit_frame_chain,
++ aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs, true,
+ emit_frame_chain);
+ if (maybe_ne (sve_callee_adjust, 0))
+ {
+@@ -10465,10 +10472,9 @@ aarch64_expand_prologue (void)
+ !frame_pointer_needed, false);
+ bytes_below_sp -= sve_callee_adjust;
+ }
+- aarch64_save_callee_saves (bytes_below_sp, P0_REGNUM, P15_REGNUM,
+- false, emit_frame_chain);
+- aarch64_save_callee_saves (bytes_below_sp, V0_REGNUM, V31_REGNUM,
+- callee_adjust != 0 || emit_frame_chain,
++ aarch64_save_callee_saves (bytes_below_sp, frame.saved_prs, true,
++ emit_frame_chain);
++ aarch64_save_callee_saves (bytes_below_sp, frame.saved_fprs, true,
+ emit_frame_chain);
+
+ /* We may need to probe the final adjustment if it is larger than the guard
+@@ -10514,8 +10520,6 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+ poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
+ unsigned reg1 = frame.wb_pop_candidate1;
+ unsigned reg2 = frame.wb_pop_candidate2;
+- unsigned int last_gpr = (frame.is_scs_enabled
+- ? R29_REGNUM : R30_REGNUM);
+ rtx cfi_ops = NULL;
+ rtx_insn *insn;
+ /* A stack clash protection prologue may not have left EP0_REGNUM or
+@@ -10579,10 +10583,8 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+
+ /* Restore the vector registers before the predicate registers,
+ so that we can use P4 as a temporary for big-endian SVE frames. */
+- aarch64_restore_callee_saves (final_adjust, V0_REGNUM, V31_REGNUM,
+- callee_adjust != 0, &cfi_ops);
+- aarch64_restore_callee_saves (final_adjust, P0_REGNUM, P15_REGNUM,
+- false, &cfi_ops);
++ aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
++ aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
+ if (maybe_ne (sve_callee_adjust, 0))
+ aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
+
+@@ -10590,8 +10592,7 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+ restore x30, we don't need to restore x30 again in the traditional
+ way. */
+ aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
+- R0_REGNUM, last_gpr,
+- callee_adjust != 0, &cfi_ops);
++ frame.saved_gprs, &cfi_ops);
+
+ if (need_barrier_p)
+ aarch64_emit_stack_tie (stack_pointer_rtx);
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index 292ef2eec..1591cde8b 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -787,7 +787,7 @@ extern enum aarch64_processor aarch64_tune;
+
+ #define DEFAULT_PCC_STRUCT_RETURN 0
+
+-#ifdef HAVE_POLY_INT_H
++#if defined(HAVE_POLY_INT_H) && defined(GCC_VEC_H)
+ struct GTY (()) aarch64_frame
+ {
+ /* The offset from the bottom of the static frame (the bottom of the
+@@ -795,6 +795,13 @@ struct GTY (()) aarch64_frame
+ needed. */
+ poly_int64 reg_offset[LAST_SAVED_REGNUM + 1];
+
++ /* The list of GPRs, FPRs and predicate registers that have nonnegative
++ entries in reg_offset. The registers are listed in order of
++ increasing offset (rather than increasing register number). */
++ vec<unsigned, va_gc_atomic> *saved_gprs;
++ vec<unsigned, va_gc_atomic> *saved_fprs;
++ vec<unsigned, va_gc_atomic> *saved_prs;
++
+ /* The number of extra stack bytes taken up by register varargs.
+ This area is allocated by the callee at the very top of the
+ frame. This value is rounded up to a multiple of
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2.c
+index 4622a1eed..bbb45d266 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2.c
+@@ -215,9 +215,9 @@ test_7 (void)
+ ** add sp, sp, #?16
+ ** ldr p4, \[sp\]
+ ** addvl sp, sp, #1
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** mov x12, #?4144
+ ** add sp, sp, x12
+ ** ret
+@@ -283,9 +283,9 @@ test_9 (int n)
+ ** addvl sp, x29, #-1
+ ** ldr p4, \[sp\]
+ ** addvl sp, sp, #1
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** mov x12, #?4144
+ ** add sp, sp, x12
+ ** ret
+@@ -319,9 +319,9 @@ test_10 (int n)
+ ** addvl sp, x29, #-1
+ ** ldr p4, \[sp\]
+ ** addvl sp, sp, #1
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** add sp, sp, #?3008
+ ** add sp, sp, #?126976
+ ** ret
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_1024.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_1024.c
+index e31200fc2..9437c7a85 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_1024.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_1024.c
+@@ -176,9 +176,9 @@ test_7 (void)
+ ** add sp, sp, #?16
+ ** ldr z16, \[sp\]
+ ** add sp, sp, #?128
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** mov x12, #?4144
+ ** add sp, sp, x12
+ ** ret
+@@ -234,9 +234,9 @@ test_9 (int n)
+ ** sub sp, x29, #128
+ ** ldr z16, \[sp\]
+ ** add sp, sp, #?128
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** mov x12, #?4144
+ ** add sp, sp, x12
+ ** ret
+@@ -268,9 +268,9 @@ test_10 (int n)
+ ** sub sp, x29, #128
+ ** ldr z16, \[sp\]
+ ** add sp, sp, #?128
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** add sp, sp, #?3008
+ ** add sp, sp, #?126976
+ ** ret
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_128.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_128.c
+index 41193b411..b4e1627fa 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_128.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_128.c
+@@ -176,9 +176,9 @@ test_7 (void)
+ ** add sp, sp, #?16
+ ** ldr p4, \[sp\]
+ ** add sp, sp, #?16
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** mov x12, #?4144
+ ** add sp, sp, x12
+ ** ret
+@@ -234,9 +234,9 @@ test_9 (int n)
+ ** sub sp, x29, #16
+ ** ldr p4, \[sp\]
+ ** add sp, sp, #?16
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** mov x12, #?4144
+ ** add sp, sp, x12
+ ** ret
+@@ -267,9 +267,9 @@ test_10 (int n)
+ ** sub sp, x29, #16
+ ** ldr p4, \[sp\]
+ ** add sp, sp, #?16
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** add sp, sp, #?3008
+ ** add sp, sp, #?126976
+ ** ret
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_2048.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_2048.c
+index f63751678..921209379 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_2048.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_2048.c
+@@ -176,9 +176,9 @@ test_7 (void)
+ ** add sp, sp, #?16
+ ** ldr z16, \[sp\]
+ ** add sp, sp, #?256
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** mov x12, #?4144
+ ** add sp, sp, x12
+ ** ret
+@@ -234,9 +234,9 @@ test_9 (int n)
+ ** sub sp, x29, #256
+ ** ldr z16, \[sp\]
+ ** add sp, sp, #?256
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** mov x12, #?4144
+ ** add sp, sp, x12
+ ** ret
+@@ -268,9 +268,9 @@ test_10 (int n)
+ ** sub sp, x29, #256
+ ** ldr z16, \[sp\]
+ ** add sp, sp, #?256
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** add sp, sp, #?3008
+ ** add sp, sp, #?126976
+ ** ret
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_256.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_256.c
+index 6bcbb5772..bd8bef0f0 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_256.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_256.c
+@@ -176,9 +176,9 @@ test_7 (void)
+ ** add sp, sp, #?16
+ ** ldr z16, \[sp\]
+ ** add sp, sp, #?32
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** mov x12, #?4144
+ ** add sp, sp, x12
+ ** ret
+@@ -234,9 +234,9 @@ test_9 (int n)
+ ** sub sp, x29, #32
+ ** ldr z16, \[sp\]
+ ** add sp, sp, #?32
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** mov x12, #?4144
+ ** add sp, sp, x12
+ ** ret
+@@ -267,9 +267,9 @@ test_10 (int n)
+ ** sub sp, x29, #32
+ ** ldr z16, \[sp\]
+ ** add sp, sp, #?32
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** add sp, sp, #?3008
+ ** add sp, sp, #?126976
+ ** ret
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_512.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_512.c
+index dc7df8e6b..2c76ccecd 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_512.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_2_512.c
+@@ -176,9 +176,9 @@ test_7 (void)
+ ** add sp, sp, #?16
+ ** ldr z16, \[sp\]
+ ** add sp, sp, #?64
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** mov x12, #?4144
+ ** add sp, sp, x12
+ ** ret
+@@ -234,9 +234,9 @@ test_9 (int n)
+ ** sub sp, x29, #64
+ ** ldr z16, \[sp\]
+ ** add sp, sp, #?64
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** mov x12, #?4144
+ ** add sp, sp, x12
+ ** ret
+@@ -268,9 +268,9 @@ test_10 (int n)
+ ** sub sp, x29, #64
+ ** ldr z16, \[sp\]
+ ** add sp, sp, #?64
++** ldp x29, x30, \[sp\]
+ ** ldp x24, x25, \[sp, 16\]
+ ** ldr x26, \[sp, 32\]
+-** ldp x29, x30, \[sp\]
+ ** add sp, sp, #?3008
+ ** add sp, sp, #?126976
+ ** ret
+--
+2.33.0
+