summaryrefslogtreecommitdiff
path: root/0199-Backport-SME-aarch64-Explicitly-record-probe-registe.patch
diff options
context:
space:
mode:
authorCoprDistGit <infra@openeuler.org>2025-02-28 10:03:49 +0000
committerCoprDistGit <infra@openeuler.org>2025-02-28 10:03:49 +0000
commit73127104a245052cd5cf29cdaaca3e5c32c70348 (patch)
tree8e28b63e478c43c252f18b49836dff7313affe54 /0199-Backport-SME-aarch64-Explicitly-record-probe-registe.patch
parent49d3feaf4665cdb07576fc1a2382a4d82a612d35 (diff)
automatic import of gccopeneuler24.03_LTS_SP1
Diffstat (limited to '0199-Backport-SME-aarch64-Explicitly-record-probe-registe.patch')
-rw-r--r--0199-Backport-SME-aarch64-Explicitly-record-probe-registe.patch280
1 files changed, 280 insertions, 0 deletions
diff --git a/0199-Backport-SME-aarch64-Explicitly-record-probe-registe.patch b/0199-Backport-SME-aarch64-Explicitly-record-probe-registe.patch
new file mode 100644
index 0000000..6fe7afa
--- /dev/null
+++ b/0199-Backport-SME-aarch64-Explicitly-record-probe-registe.patch
@@ -0,0 +1,280 @@
+From 1bf3e9a04411b483c89d2e2f9096ab66800c3b3f Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 12 Sep 2023 16:05:12 +0100
+Subject: [PATCH 100/157] [Backport][SME] aarch64: Explicitly record probe
+ registers in frame info
+
+Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=5ce957484eea15f09503fcffa4dfdfb70ad82f8f
+
+The stack frame is currently divided into three areas:
+
+A: the area above the hard frame pointer
+B: the SVE saves below the hard frame pointer
+C: the outgoing arguments
+
+If the stack frame is allocated in one chunk, the allocation needs a
+probe if the frame size is >= guard_size - 1KiB. In addition, if the
+function is not a leaf function, it must probe an address no more than
+1KiB above the outgoing SP. We ensured the second condition by
+
+(1) using single-chunk allocations for non-leaf functions only if
+ the link register save slot is within 512 bytes of the bottom
+ of the frame; and
+
+(2) using the link register save as a probe (meaning, for instance,
+ that it can't be individually shrink wrapped)
+
+If instead the stack is allocated in multiple chunks, then:
+
+* an allocation involving only the outgoing arguments (C above) requires
+ a probe if the allocation size is > 1KiB
+
+* any other allocation requires a probe if the allocation size
+ is >= guard_size - 1KiB
+
+* second and subsequent allocations require the previous allocation
+ to probe at the bottom of the allocated area, regardless of the size
+ of that previous allocation
+
+The final point means that, unlike for single allocations,
+it can be necessary to have both a non-SVE register probe and
+an SVE register probe. For example:
+
+* allocate A, probe using a non-SVE register save
+* allocate B, probe using an SVE register save
+* allocate C
+
+The non-SVE register used in this case was again the link register.
+It was previously used even if the link register save slot was some
+bytes above the bottom of the non-SVE register saves, but an earlier
+patch avoided that by putting the link register save slot first.
+
+As a belt-and-braces fix, this patch explicitly records which
+probe registers we're using and allows the non-SVE probe to be
+whichever register comes first (as for SVE).
+
+The patch also avoids unnecessary probes in sve/pcs/stack_clash_3.c.
+
+gcc/
+ * config/aarch64/aarch64.h (aarch64_frame::sve_save_and_probe)
+ (aarch64_frame::hard_fp_save_and_probe): New fields.
+ * config/aarch64/aarch64.cc (aarch64_layout_frame): Initialize them.
+ Rather than asserting that a leaf function saves LR, instead assert
+ that a leaf function saves something.
+ (aarch64_get_separate_components): Prevent the chosen probe
+ registers from being individually shrink-wrapped.
+ (aarch64_allocate_and_probe_stack_space): Remove workaround for
+ probe registers that aren't at the bottom of the previous allocation.
+
+gcc/testsuite/
+ * gcc.target/aarch64/sve/pcs/stack_clash_3.c: Avoid redundant probes.
+---
+ gcc/config/aarch64/aarch64.cc | 68 +++++++++++++++----
+ gcc/config/aarch64/aarch64.h | 8 +++
+ .../aarch64/sve/pcs/stack_clash_3.c | 6 +-
+ 3 files changed, 64 insertions(+), 18 deletions(-)
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 6d835dc8f..dd80ceba8 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -8810,15 +8810,11 @@ aarch64_layout_frame (void)
+ && !crtl->abi->clobbers_full_reg_p (regno))
+ frame.reg_offset[regno] = SLOT_REQUIRED;
+
+- /* With stack-clash, LR must be saved in non-leaf functions. The saving of
+- LR counts as an implicit probe which allows us to maintain the invariant
+- described in the comment at expand_prologue. */
+- gcc_assert (crtl->is_leaf
+- || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
+
+ poly_int64 offset = crtl->outgoing_args_size;
+ gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
+ frame.bytes_below_saved_regs = offset;
++ frame.sve_save_and_probe = INVALID_REGNUM;
+
+ /* Now assign stack slots for the registers. Start with the predicate
+ registers, since predicate LDR and STR have a relatively small
+@@ -8826,6 +8822,8 @@ aarch64_layout_frame (void)
+ for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
+ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+ {
++ if (frame.sve_save_and_probe == INVALID_REGNUM)
++ frame.sve_save_and_probe = regno;
+ frame.reg_offset[regno] = offset;
+ offset += BYTES_PER_SVE_PRED;
+ }
+@@ -8863,6 +8861,8 @@ aarch64_layout_frame (void)
+ for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+ {
++ if (frame.sve_save_and_probe == INVALID_REGNUM)
++ frame.sve_save_and_probe = regno;
+ frame.reg_offset[regno] = offset;
+ offset += vector_save_size;
+ }
+@@ -8872,10 +8872,18 @@ aarch64_layout_frame (void)
+ frame.below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
+ bool saves_below_hard_fp_p
+ = maybe_ne (frame.below_hard_fp_saved_regs_size, 0);
++ gcc_assert (!saves_below_hard_fp_p
++ || (frame.sve_save_and_probe != INVALID_REGNUM
++ && known_eq (frame.reg_offset[frame.sve_save_and_probe],
++ frame.bytes_below_saved_regs)));
++
+ frame.bytes_below_hard_fp = offset;
++ frame.hard_fp_save_and_probe = INVALID_REGNUM;
+
+ auto allocate_gpr_slot = [&](unsigned int regno)
+ {
++ if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
++ frame.hard_fp_save_and_probe = regno;
+ frame.reg_offset[regno] = offset;
+ if (frame.wb_push_candidate1 == INVALID_REGNUM)
+ frame.wb_push_candidate1 = regno;
+@@ -8909,6 +8917,8 @@ aarch64_layout_frame (void)
+ for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+ if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+ {
++ if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
++ frame.hard_fp_save_and_probe = regno;
+ /* If there is an alignment gap between integer and fp callee-saves,
+ allocate the last fp register to it if possible. */
+ if (regno == last_fp_reg
+@@ -8932,6 +8942,17 @@ aarch64_layout_frame (void)
+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+
+ frame.saved_regs_size = offset - frame.bytes_below_saved_regs;
++ gcc_assert (known_eq (frame.saved_regs_size,
++ frame.below_hard_fp_saved_regs_size)
++ || (frame.hard_fp_save_and_probe != INVALID_REGNUM
++ && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
++ frame.bytes_below_hard_fp)));
++
++ /* With stack-clash, a register must be saved in non-leaf functions.
++ The saving of the bottommost register counts as an implicit probe,
++ which allows us to maintain the invariant described in the comment
++ at expand_prologue. */
++ gcc_assert (crtl->is_leaf || maybe_ne (frame.saved_regs_size, 0));
+
+ offset += get_frame_size ();
+ offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+@@ -9062,6 +9083,25 @@ aarch64_layout_frame (void)
+ frame.final_adjust = frame.bytes_below_saved_regs;
+ }
+
++ /* The frame is allocated in pieces, with each non-final piece
++ including a register save at offset 0 that acts as a probe for
++ the following piece. In addition, the save of the bottommost register
++ acts as a probe for callees and allocas. Roll back any probes that
++ aren't needed.
++
++ A probe isn't needed if it is associated with the final allocation
++ (including callees and allocas) that happens before the epilogue is
++ executed. */
++ if (crtl->is_leaf
++ && !cfun->calls_alloca
++ && known_eq (frame.final_adjust, 0))
++ {
++ if (maybe_ne (frame.sve_callee_adjust, 0))
++ frame.sve_save_and_probe = INVALID_REGNUM;
++ else
++ frame.hard_fp_save_and_probe = INVALID_REGNUM;
++ }
++
+ /* Make sure the individual adjustments add up to the full frame size. */
+ gcc_assert (known_eq (frame.initial_adjust
+ + frame.callee_adjust
+@@ -9639,13 +9679,6 @@ aarch64_get_separate_components (void)
+
+ poly_int64 offset = frame.reg_offset[regno];
+
+- /* If the register is saved in the first SVE save slot, we use
+- it as a stack probe for -fstack-clash-protection. */
+- if (flag_stack_clash_protection
+- && maybe_ne (frame.below_hard_fp_saved_regs_size, 0)
+- && known_eq (offset, frame.bytes_below_saved_regs))
+- continue;
+-
+ /* Get the offset relative to the register we'll use. */
+ if (frame_pointer_needed)
+ offset -= frame.bytes_below_hard_fp;
+@@ -9680,6 +9713,13 @@ aarch64_get_separate_components (void)
+
+ bitmap_clear_bit (components, LR_REGNUM);
+ bitmap_clear_bit (components, SP_REGNUM);
++ if (flag_stack_clash_protection)
++ {
++ if (frame.sve_save_and_probe != INVALID_REGNUM)
++ bitmap_clear_bit (components, frame.sve_save_and_probe);
++ if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
++ bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
++ }
+
+ return components;
+ }
+@@ -10226,8 +10266,8 @@ aarch64_epilogue_uses (int regno)
+ When probing is needed, we emit a probe at the start of the prologue
+ and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
+
+- We have to track how much space has been allocated and the only stores
+- to the stack we track as implicit probes are the FP/LR stores.
++ We can also use register saves as probes. These are stored in
++ sve_save_and_probe and hard_fp_save_and_probe.
+
+ For outgoing arguments we probe if the size is larger than 1KB, such that
+ the ABI specified buffer is maintained for the next callee.
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index f340237d0..af480d9e8 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -887,6 +887,14 @@ struct GTY (()) aarch64_frame
+ This is the register they should use. */
+ unsigned spare_pred_reg;
+
++ /* An SVE register that is saved below the hard frame pointer and that acts
++ as a probe for later allocations, or INVALID_REGNUM if none. */
++ unsigned sve_save_and_probe;
++
++ /* A register that is saved at the hard frame pointer and that acts
++ as a probe for later allocations, or INVALID_REGNUM if none. */
++ unsigned hard_fp_save_and_probe;
++
+ bool laid_out;
+
+ /* True if shadow call stack should be enabled for the current function. */
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
+index 3e01ec36c..3530a0d50 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
++++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/stack_clash_3.c
+@@ -11,11 +11,10 @@
+ ** mov x11, sp
+ ** ...
+ ** sub sp, sp, x13
+-** str p4, \[sp\]
+ ** cbz w0, [^\n]*
++** str p4, \[sp\]
+ ** ...
+ ** ptrue p0\.b, all
+-** ldr p4, \[sp\]
+ ** addvl sp, sp, #1
+ ** ldr x24, \[sp\], 32
+ ** ret
+@@ -39,13 +38,12 @@ test_1 (int n)
+ ** mov x11, sp
+ ** ...
+ ** sub sp, sp, x13
+-** str p4, \[sp\]
+ ** cbz w0, [^\n]*
++** str p4, \[sp\]
+ ** str p5, \[sp, #1, mul vl\]
+ ** str p6, \[sp, #2, mul vl\]
+ ** ...
+ ** ptrue p0\.b, all
+-** ldr p4, \[sp\]
+ ** addvl sp, sp, #1
+ ** ldr x24, \[sp\], 32
+ ** ret
+--
+2.33.0
+