Diffstat (limited to '0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch')
 -rw-r--r--  0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch | 1748
 1 file changed, 1748 insertions(+), 0 deletions(-)
diff --git a/0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch b/0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch
new file mode 100644
index 0000000..72576e3
--- /dev/null
+++ b/0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch
@@ -0,0 +1,1748 @@
+From 0ad41f11bea5c303ff39c54cae8e46afdfae6070 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 5 Dec 2023 10:11:29 +0000
+Subject: [PATCH 113/157] [Backport][SME] aarch64: Add support for
+ __arm_locally_streaming
+
+Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=3f6e5991fab507aa79121dc44d1afcd622c78744
+
+This patch adds support for the __arm_locally_streaming attribute,
+which allows a function to use SME internally without changing
+the function's ABI. The attribute is valid but redundant for
+__arm_streaming functions.
+
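As a usage sketch (the function names here are illustrative, not part of the patch), a locally-streaming function is declared like an ordinary function and keeps an ordinary ABI:

    __arm_locally_streaming void f (void)
    {
      /* The body runs with PSTATE.SM == 1: the compiler enables
         streaming mode in the prologue and disables it again in the
         epilogue, so callers see a normal non-streaming function.  */
    }

    /* Valid but redundant: a streaming function already runs with
       PSTATE.SM == 1 throughout its body.  */
    __arm_locally_streaming void g (void) __arm_streaming
    {
    }
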
+gcc/
+ * config/aarch64/aarch64.cc (aarch64_arm_attribute_table): Add
+ arm::locally_streaming.
+ (aarch64_fndecl_is_locally_streaming): New function.
+ (aarch64_fndecl_sm_state): Handle locally-streaming functions.
+ (aarch64_cfun_enables_pstate_sm): New function.
+ (aarch64_add_offset): Add an argument that specifies whether
+ the streaming vector length should be used instead of the
+ prevailing one.
+ (aarch64_split_add_offset, aarch64_add_sp, aarch64_sub_sp): Likewise.
+ (aarch64_allocate_and_probe_stack_space): Likewise.
+ (aarch64_expand_mov_immediate): Update calls accordingly.
+ (aarch64_need_old_pstate_sm): Return true for locally-streaming
+ streaming-compatible functions.
+ (aarch64_layout_frame): Force all call-preserved Z and P registers
+ to be saved and restored if the function switches PSTATE.SM in the
+ prologue.
+ (aarch64_get_separate_components): Disable shrink-wrapping of
+ such Z and P saves and restores.
+ (aarch64_use_late_prologue_epilogue): New function.
+ (aarch64_expand_prologue): Measure SVE lengths in the streaming
+ vector length for locally-streaming functions, then emit code
+ to enable streaming mode.
+ (aarch64_expand_epilogue): Likewise in reverse.
+ (TARGET_USE_LATE_PROLOGUE_EPILOGUE): Define.
+ * config/aarch64/aarch64-c.cc (aarch64_define_unconditional_macros):
+ Define __arm_locally_streaming.
+
+gcc/testsuite/
+ * gcc.target/aarch64/sme/locally_streaming_1.c: New test.
+ * gcc.target/aarch64/sme/locally_streaming_2.c: Likewise.
+ * gcc.target/aarch64/sme/locally_streaming_3.c: Likewise.
+ * gcc.target/aarch64/sme/locally_streaming_4.c: Likewise.
+ * gcc.target/aarch64/sme/keyword_macros_1.c: Add
+ __arm_locally_streaming.
+ * g++.target/aarch64/sme/keyword_macros_1.C: Likewise.
+---
+ gcc/config/aarch64/aarch64-c.cc | 1 +
+ gcc/config/aarch64/aarch64.cc | 233 +++++++--
+ .../g++.target/aarch64/sme/keyword_macros_1.C | 1 +
+ .../gcc.target/aarch64/sme/keyword_macros_1.c | 1 +
+ .../aarch64/sme/locally_streaming_1.c | 466 ++++++++++++++++++
+ .../aarch64/sme/locally_streaming_2.c | 177 +++++++
+ .../aarch64/sme/locally_streaming_3.c | 273 ++++++++++
+ .../aarch64/sme/locally_streaming_4.c | 145 ++++++
+ 8 files changed, 1259 insertions(+), 38 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
+
+diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
+index cb8a6c2fc..745719d8b 100644
+--- a/gcc/config/aarch64/aarch64-c.cc
++++ b/gcc/config/aarch64/aarch64-c.cc
+@@ -86,6 +86,7 @@ aarch64_define_unconditional_macros (cpp_reader *pfile)
+
+ DEFINE_ARM_KEYWORD_MACRO ("streaming");
+ DEFINE_ARM_KEYWORD_MACRO ("streaming_compatible");
++ DEFINE_ARM_KEYWORD_MACRO ("locally_streaming");
+
+ #undef DEFINE_ARM_KEYWORD_MACRO
+
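With this definition in place, the keyword spelling and the attribute spelling are interchangeable; a sketch (the declaration is illustrative):

    __arm_locally_streaming void f (void);    /* keyword macro form */
    [[arm::locally_streaming]] void f (void); /* what the macro expands to */
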
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 113784e31..4cb43c2e2 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -3283,6 +3283,7 @@ static const attribute_spec aarch64_arm_attributes[] =
+ NULL, attr_streaming_exclusions },
+ { "streaming_compatible", 0, 0, false, true, true, true,
+ NULL, attr_streaming_exclusions },
++ { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
+ { "new", 1, -1, true, false, false, false,
+ handle_arm_new, NULL },
+ { "preserves", 1, -1, false, true, true, true,
+@@ -4657,6 +4658,16 @@ aarch64_fntype_isa_mode (const_tree fntype)
+ | aarch64_fntype_pstate_za (fntype));
+ }
+
++/* Return true if FNDECL uses streaming mode internally, as an
++ implementation choice. */
++
++static bool
++aarch64_fndecl_is_locally_streaming (const_tree fndecl)
++{
++ return lookup_attribute ("arm", "locally_streaming",
++ DECL_ATTRIBUTES (fndecl));
++}
++
+ /* Return the state of PSTATE.SM when compiling the body of
+ function FNDECL. This might be different from the state of
+ PSTATE.SM on entry. */
+@@ -4664,6 +4675,9 @@ aarch64_fntype_isa_mode (const_tree fntype)
+ static aarch64_feature_flags
+ aarch64_fndecl_pstate_sm (const_tree fndecl)
+ {
++ if (aarch64_fndecl_is_locally_streaming (fndecl))
++ return AARCH64_FL_SM_ON;
++
+ return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
+ }
+
+@@ -4739,6 +4753,16 @@ aarch64_cfun_has_new_state (const char *state_name)
+ return aarch64_fndecl_has_new_state (cfun->decl, state_name);
+ }
+
++/* Return true if PSTATE.SM is 1 in the body of the current function,
++ but is not guaranteed to be 1 on entry. */
++
++static bool
++aarch64_cfun_enables_pstate_sm ()
++{
++ return (aarch64_fndecl_is_locally_streaming (cfun->decl)
++ && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON);
++}
++
+ /* Return true if the current function has state STATE_NAME, either by
+ creating new state itself or by sharing state with callers. */
+
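The SMSTART/SMSTOP pair is only needed when PSTATE.SM might be 0 on entry, which is what this predicate captures. A sketch of the three cases (names are illustrative; compare the n_ls, s_ls and sc_ls tests added below):

    [[arm::locally_streaming]] void
    n (void)                     /* enables PSTATE.SM: SM is 0 on entry */
    {
    }

    [[arm::locally_streaming]] void
    s (void) [[arm::streaming]]  /* does not: SM is already 1 on entry */
    {
    }

    [[arm::locally_streaming]] void
    sc (void) [[arm::streaming_compatible]]  /* enables: SM unknown on entry */
    {
    }
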
+@@ -6931,6 +6955,10 @@ aarch64_add_offset_temporaries (rtx x)
+ TEMP2, if nonnull, is a second temporary register that doesn't
+ overlap either DEST or REG.
+
++ FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
++ is measured relative to the SME vector length instead of the current
++ prevailing vector length. It is 0 otherwise.
++
+ Since this function may be used to adjust the stack pointer, we must
+ ensure that it cannot cause transient stack deallocation (for example
+ by first incrementing SP and then decrementing when adjusting by a
+@@ -6939,6 +6967,7 @@ aarch64_add_offset_temporaries (rtx x)
+ static void
+ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+ poly_int64 offset, rtx temp1, rtx temp2,
++ aarch64_feature_flags force_isa_mode,
+ bool frame_related_p, bool emit_move_imm = true)
+ {
+ gcc_assert (emit_move_imm || temp1 != NULL_RTX);
+@@ -6951,9 +6980,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+ /* Try using ADDVL or ADDPL to add the whole value. */
+ if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
+ {
+- rtx offset_rtx = gen_int_mode (offset, mode);
++ gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
++ rtx offset_rtx;
++ if (force_isa_mode == 0)
++ offset_rtx = gen_int_mode (offset, mode);
++ else
++ offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
+ rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
+ RTX_FRAME_RELATED_P (insn) = frame_related_p;
++ if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
++ add_reg_note (insn, REG_CFA_ADJUST_CFA,
++ gen_rtx_SET (dest, plus_constant (Pmode, src,
++ offset)));
+ return;
+ }
+
+@@ -6969,11 +7007,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+ if (src != const0_rtx
+ && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
+ {
+- rtx offset_rtx = gen_int_mode (poly_offset, mode);
++ rtx offset_rtx;
++ if (force_isa_mode == 0)
++ offset_rtx = gen_int_mode (poly_offset, mode);
++ else
++ offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
+ if (frame_related_p)
+ {
+ rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
+ RTX_FRAME_RELATED_P (insn) = true;
++ if (force_isa_mode & AARCH64_FL_SM_ON)
++ add_reg_note (insn, REG_CFA_ADJUST_CFA,
++ gen_rtx_SET (dest, plus_constant (Pmode, src,
++ poly_offset)));
+ src = dest;
+ }
+ else
+@@ -7004,9 +7050,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+ rtx val;
+ if (IN_RANGE (rel_factor, -32, 31))
+ {
++ if (force_isa_mode & AARCH64_FL_SM_ON)
++ {
++ /* Try to use an unshifted RDSVL, otherwise fall back on
++ a shifted RDSVL #1. */
++ if (aarch64_sve_rdvl_addvl_factor_p (factor))
++ shift = 0;
++ else
++ factor = rel_factor * 16;
++ val = aarch64_sme_vq_immediate (mode, factor, 0);
++ }
+ /* Try to use an unshifted CNT[BHWD] or RDVL. */
+- if (aarch64_sve_cnt_factor_p (factor)
+- || aarch64_sve_rdvl_addvl_factor_p (factor))
++ else if (aarch64_sve_cnt_factor_p (factor)
++ || aarch64_sve_rdvl_addvl_factor_p (factor))
+ {
+ val = gen_int_mode (poly_int64 (factor, factor), mode);
+ shift = 0;
+@@ -7036,11 +7092,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+ a shift and add sequence for the multiplication.
+ If CNTB << SHIFT is out of range, stick with the current
+ shift factor. */
+- if (IN_RANGE (low_bit, 2, 16 * 16))
++ if (force_isa_mode == 0
++ && IN_RANGE (low_bit, 2, 16 * 16))
+ {
+ val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
+ shift = 0;
+ }
++ else if ((force_isa_mode & AARCH64_FL_SM_ON)
++ && aarch64_sve_rdvl_addvl_factor_p (low_bit))
++ {
++ val = aarch64_sme_vq_immediate (mode, low_bit, 0);
++ shift = 0;
++ }
+ else
+ val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
+
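The effect is visible whenever a locally-streaming function has SVE-sized frame slots: the frame is laid out in streaming vector lengths, but SP is adjusted while PSTATE.SM is still 0, so the adjustment must use RDSVL/ADDSVL rather than CNT[BHWD] or RDVL/ADDVL. A minimal trigger, modelled on the n_ls_v0 test added below:

    /* Sketch: the slot for V is sized in streaming vector lengths, so
       the expected prologue and epilogue use "addsvl sp, sp, #-1" and
       "addsvl sp, sp, #1" (cf. n_ls_v0 in locally_streaming_1.c).  */
    [[arm::locally_streaming]] void
    use_sve_local (void)
    {
      __SVInt32_t v;
      asm ("" :: "r" (&v));  /* force V to live in memory */
    }
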
+@@ -7128,30 +7191,34 @@ aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+ rtx offset_rtx, rtx temp1, rtx temp2)
+ {
+ aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
+- temp1, temp2, false);
++ temp1, temp2, 0, false);
+ }
+
+ /* Add DELTA to the stack pointer, marking the instructions frame-related.
+- TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
+- if TEMP1 already contains abs (DELTA). */
++ TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
++ for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
++ contains abs (DELTA). */
+
+ static inline void
+-aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
++aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
++ aarch64_feature_flags force_isa_mode, bool emit_move_imm)
+ {
+ aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
+- temp1, temp2, true, emit_move_imm);
++ temp1, temp2, force_isa_mode, true, emit_move_imm);
+ }
+
+ /* Subtract DELTA from the stack pointer, marking the instructions
+- frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
+- if nonnull. */
++ frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
++ aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
+
+ static inline void
+-aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
+- bool emit_move_imm = true)
++aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
++ aarch64_feature_flags force_isa_mode,
++ bool frame_related_p, bool emit_move_imm = true)
+ {
+ aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
+- temp1, temp2, frame_related_p, emit_move_imm);
++ temp1, temp2, force_isa_mode, frame_related_p,
++ emit_move_imm);
+ }
+
+ /* A streaming-compatible function needs to switch temporarily to the known
+@@ -8176,11 +8243,11 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+ {
+ base = aarch64_force_temporary (int_mode, dest, base);
+ aarch64_add_offset (int_mode, dest, base, offset,
+- NULL_RTX, NULL_RTX, false);
++ NULL_RTX, NULL_RTX, 0, false);
+ }
+ else
+ aarch64_add_offset (int_mode, dest, base, offset,
+- dest, NULL_RTX, false);
++ dest, NULL_RTX, 0, false);
+ }
+ return;
+ }
+@@ -8207,7 +8274,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+ gcc_assert (can_create_pseudo_p ());
+ base = aarch64_force_temporary (int_mode, dest, base);
+ aarch64_add_offset (int_mode, dest, base, const_offset,
+- NULL_RTX, NULL_RTX, false);
++ NULL_RTX, NULL_RTX, 0, false);
+ return;
+ }
+
+@@ -8247,7 +8314,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+ gcc_assert(can_create_pseudo_p ());
+ base = aarch64_force_temporary (int_mode, dest, base);
+ aarch64_add_offset (int_mode, dest, base, const_offset,
+- NULL_RTX, NULL_RTX, false);
++ NULL_RTX, NULL_RTX, 0, false);
+ return;
+ }
+ /* FALLTHRU */
+@@ -9755,6 +9822,9 @@ aarch64_need_old_pstate_sm ()
+ if (aarch64_cfun_incoming_pstate_sm () != 0)
+ return false;
+
++ if (aarch64_cfun_enables_pstate_sm ())
++ return true;
++
+ if (cfun->machine->call_switches_pstate_sm)
+ for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
+ if (auto *call = dyn_cast<rtx_call_insn *> (insn))
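For a locally-streaming streaming-compatible function the incoming PSTATE.SM is unknown, so the prologue must read SVCR, save it, and branch around the SMSTART SM when the caller was already streaming; the epilogue reuses the saved value to guard the SMSTOP SM. A sketch of a function that needs this (cf. the sc_ls test below):

    [[arm::locally_streaming]] void
    sc (void) [[arm::streaming_compatible]]
    {
      asm ("");  /* non-empty body, as in the tests */
    }
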
+@@ -9781,6 +9851,7 @@ aarch64_layout_frame (void)
+ bool frame_related_fp_reg_p = false;
+ aarch64_frame &frame = cfun->machine->frame;
+ poly_int64 top_of_locals = -1;
++ bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
+
+ vec_safe_truncate (frame.saved_gprs, 0);
+ vec_safe_truncate (frame.saved_fprs, 0);
+@@ -9818,7 +9889,7 @@ aarch64_layout_frame (void)
+ frame.reg_offset[regno] = SLOT_REQUIRED;
+
+ for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+- if (df_regs_ever_live_p (regno)
++ if ((enables_pstate_sm || df_regs_ever_live_p (regno))
+ && !fixed_regs[regno]
+ && !crtl->abi->clobbers_full_reg_p (regno))
+ {
+@@ -9847,7 +9918,7 @@ aarch64_layout_frame (void)
+ }
+
+ for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
+- if (df_regs_ever_live_p (regno)
++ if ((enables_pstate_sm || df_regs_ever_live_p (regno))
+ && !fixed_regs[regno]
+ && !crtl->abi->clobbers_full_reg_p (regno))
+ frame.reg_offset[regno] = SLOT_REQUIRED;
+@@ -9964,7 +10035,8 @@ aarch64_layout_frame (void)
+ /* If the current function changes the SVE vector length, ensure that the
+ old value of the DWARF VG register is saved and available in the CFI,
+ so that outer frames with VL-sized offsets can be processed correctly. */
+- if (cfun->machine->call_switches_pstate_sm)
++ if (cfun->machine->call_switches_pstate_sm
++ || aarch64_cfun_enables_pstate_sm ())
+ {
+ frame.reg_offset[VG_REGNUM] = offset;
+ offset += UNITS_PER_WORD;
+@@ -10749,9 +10821,16 @@ aarch64_get_separate_components (void)
+ bitmap_clear (components);
+
+ /* The registers we need saved to the frame. */
++ bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
+ for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
+ if (aarch64_register_saved_on_entry (regno))
+ {
++ /* Disallow shrink wrapping for registers that will be clobbered
++ by an SMSTART SM in the prologue. */
++ if (enables_pstate_sm
++ && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
++ continue;
++
+ /* Punt on saves and restores that use ST1D and LD1D. We could
+ try to be smarter, but it would involve making sure that the
+ spare predicate register itself is safe to use at the save
+@@ -11070,11 +11149,16 @@ aarch64_emit_stack_tie (rtx reg)
+ events, e.g. if we were to allow the stack to be dropped by more than a page
+ and then have multiple probes up and we take a signal somewhere in between
+ then the signal handler doesn't know the state of the stack and can make no
+- assumptions about which pages have been probed. */
++ assumptions about which pages have been probed.
++
++ FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
++ is measured relative to the SME vector length instead of the current
++ prevailing vector length. It is 0 otherwise. */
+
+ static void
+ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+ poly_int64 poly_size,
++ aarch64_feature_flags force_isa_mode,
+ bool frame_related_p,
+ bool final_adjustment_p)
+ {
+@@ -11116,7 +11200,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+ if (known_lt (poly_size, min_probe_threshold)
+ || !flag_stack_clash_protection)
+ {
+- aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
++ aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
++ frame_related_p);
+ return;
+ }
+
+@@ -11133,7 +11218,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+
+ /* First calculate the amount of bytes we're actually spilling. */
+ aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
+- poly_size, temp1, temp2, false, true);
++ poly_size, temp1, temp2, force_isa_mode,
++ false, true);
+
+ rtx_insn *insn = get_last_insn ();
+
+@@ -11191,7 +11277,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+ {
+ for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
+ {
+- aarch64_sub_sp (NULL, temp2, guard_size, true);
++ aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+ guard_used_by_caller));
+ emit_insn (gen_blockage ());
+@@ -11202,7 +11288,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+ {
+ /* Compute the ending address. */
+ aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
+- temp1, NULL, false, true);
++ temp1, NULL, force_isa_mode, false, true);
+ rtx_insn *insn = get_last_insn ();
+
+ /* For the initial allocation, we don't have a frame pointer
+@@ -11268,7 +11354,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+ if (final_adjustment_p && rounded_size != 0)
+ min_probe_threshold = 0;
+
+- aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
++ aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
+ if (residual >= min_probe_threshold)
+ {
+ if (dump_file)
+@@ -11333,6 +11419,14 @@ aarch64_epilogue_uses (int regno)
+ return 0;
+ }
+
++/* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */
++
++static bool
++aarch64_use_late_prologue_epilogue ()
++{
++ return aarch64_cfun_enables_pstate_sm ();
++}
++
+ /* The current function's frame has a save slot for the incoming state
+ of SVCR. Return a legitimate memory for the slot, based on the hard
+ frame pointer. */
+@@ -11469,6 +11563,9 @@ aarch64_expand_prologue (void)
+ unsigned reg2 = frame.wb_push_candidate2;
+ bool emit_frame_chain = frame.emit_frame_chain;
+ rtx_insn *insn;
++ aarch64_feature_flags force_isa_mode = 0;
++ if (aarch64_cfun_enables_pstate_sm ())
++ force_isa_mode = AARCH64_FL_SM_ON;
+
+ if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
+ {
+@@ -11530,7 +11627,7 @@ aarch64_expand_prologue (void)
+ less the amount of the guard reserved for use by the caller's
+ outgoing args. */
+ aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
+- true, false);
++ force_isa_mode, true, false);
+
+ if (callee_adjust != 0)
+ aarch64_push_regs (reg1, reg2, callee_adjust);
+@@ -11553,7 +11650,8 @@ aarch64_expand_prologue (void)
+ gcc_assert (known_eq (chain_offset, 0));
+ aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
+ stack_pointer_rtx, chain_offset,
+- tmp1_rtx, tmp0_rtx, frame_pointer_needed);
++ tmp1_rtx, tmp0_rtx, force_isa_mode,
++ frame_pointer_needed);
+ if (frame_pointer_needed && !frame_size.is_constant ())
+ {
+ /* Variable-sized frames need to describe the save slot
+@@ -11600,6 +11698,7 @@ aarch64_expand_prologue (void)
+ || known_eq (initial_adjust, 0));
+ aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
+ sve_callee_adjust,
++ force_isa_mode,
+ !frame_pointer_needed, false);
+ bytes_below_sp -= sve_callee_adjust;
+ }
+@@ -11612,12 +11711,15 @@ aarch64_expand_prologue (void)
+ that is assumed by the caller. */
+ gcc_assert (known_eq (bytes_below_sp, final_adjust));
+ aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
++ force_isa_mode,
+ !frame_pointer_needed, true);
+ if (emit_frame_chain && maybe_ne (final_adjust, 0))
+ aarch64_emit_stack_tie (hard_frame_pointer_rtx);
+
+- /* Save the incoming value of PSTATE.SM, if required. */
+- if (known_ge (frame.old_svcr_offset, 0))
++ /* Save the incoming value of PSTATE.SM, if required. Code further
++ down does this for locally-streaming functions. */
++ if (known_ge (frame.old_svcr_offset, 0)
++ && !aarch64_cfun_enables_pstate_sm ())
+ {
+ rtx mem = aarch64_old_svcr_mem ();
+ MEM_VOLATILE_P (mem) = 1;
+@@ -11649,6 +11751,34 @@ aarch64_expand_prologue (void)
+ emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
+ }
+ }
++
++ /* Enable PSTATE.SM, if required. */
++ if (aarch64_cfun_enables_pstate_sm ())
++ {
++ rtx_insn *guard_label = nullptr;
++ if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
++ {
++ /* The current function is streaming-compatible. Save the
++ original state of PSTATE.SM. */
++ rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
++ emit_insn (gen_aarch64_read_svcr (svcr));
++ emit_move_insn (aarch64_old_svcr_mem (), svcr);
++ guard_label = aarch64_guard_switch_pstate_sm (svcr,
++ aarch64_isa_flags);
++ }
++ aarch64_sme_mode_switch_regs args_switch;
++ auto &args = crtl->args.info;
++ for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
++ {
++ rtx x = args.sme_mode_switch_args[i];
++ args_switch.add_reg (GET_MODE (x), REGNO (x));
++ }
++ args_switch.emit_prologue ();
++ emit_insn (gen_aarch64_smstart_sm ());
++ args_switch.emit_epilogue ();
++ if (guard_label)
++ emit_label (guard_label);
++ }
+ }
+
+ /* Return TRUE if we can use a simple_return insn.
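The args_switch code above is needed because entering streaming mode does not preserve the FP/SIMD and SVE registers, including any that carry incoming arguments. A sketch of a function whose argument must survive the switch (cf. test_d0 in locally_streaming_3.c, where the expected code is "fmov x10, d0" before the SMSTART SM and "fmov d0, x10" after it):

    [[arm::locally_streaming]] void
    takes_double (double d0)   /* d0 arrives in a clobbered register */
    {
      asm ("");  /* non-empty body, as in the tests */
    }
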
+@@ -11695,6 +11825,9 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+ HOST_WIDE_INT guard_size
+ = 1 << param_stack_clash_protection_guard_size;
+ HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
++ aarch64_feature_flags force_isa_mode = 0;
++ if (aarch64_cfun_enables_pstate_sm ())
++ force_isa_mode = AARCH64_FL_SM_ON;
+
+ /* We can re-use the registers when:
+
+@@ -11719,6 +11852,24 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+ = maybe_ne (get_frame_size ()
+ + frame.saved_varargs_size, 0);
+
++ /* Reset PSTATE.SM, if required. */
++ if (aarch64_cfun_enables_pstate_sm ())
++ {
++ rtx_insn *guard_label = nullptr;
++ if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
++ guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
++ aarch64_isa_flags);
++ aarch64_sme_mode_switch_regs return_switch;
++ if (crtl->return_rtx && REG_P (crtl->return_rtx))
++ return_switch.add_reg (GET_MODE (crtl->return_rtx),
++ REGNO (crtl->return_rtx));
++ return_switch.emit_prologue ();
++ emit_insn (gen_aarch64_smstop_sm ());
++ return_switch.emit_epilogue ();
++ if (guard_label)
++ emit_label (guard_label);
++ }
++
+ /* Emit a barrier to prevent loads from a deallocated stack. */
+ if (maybe_gt (final_adjust, crtl->outgoing_args_size)
+ || cfun->calls_alloca
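Symmetrically, the SMSTOP SM in the epilogue would clobber a return value held in an FP/SIMD or SVE register, hence the return_switch wrapper above. A sketch (cf. test_d0 in locally_streaming_2.c, where the value is expected to round-trip through x10 around the SMSTOP SM):

    [[arm::locally_streaming]] double
    returns_double (void)
    {
      asm ("");  /* non-empty body, as in the tests */
      return 1.0;
    }
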
+@@ -11739,19 +11890,21 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+ aarch64_add_offset (Pmode, stack_pointer_rtx,
+ hard_frame_pointer_rtx,
+ -bytes_below_hard_fp + final_adjust,
+- tmp1_rtx, tmp0_rtx, callee_adjust == 0);
++ tmp1_rtx, tmp0_rtx, force_isa_mode,
++ callee_adjust == 0);
+ else
+ /* The case where we need to re-use the register here is very rare, so
+ avoid the complicated condition and just always emit a move if the
+ immediate doesn't fit. */
+- aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
++ aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
+
+ /* Restore the vector registers before the predicate registers,
+ so that we can use P4 as a temporary for big-endian SVE frames. */
+ aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
+ aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
+ if (maybe_ne (sve_callee_adjust, 0))
+- aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
++ aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
++ force_isa_mode, true);
+
+ /* When shadow call stack is enabled, the scs_pop in the epilogue will
+ restore x30, we don't need to restore x30 again in the traditional
+@@ -11781,7 +11934,7 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+
+ /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
+ add restriction on emit_move optimization to leaf functions. */
+- aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
++ aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
+ (!can_inherit_p || !crtl->is_leaf
+ || df_regs_ever_live_p (EP0_REGNUM)));
+
+@@ -11914,7 +12067,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+ temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
+
+ if (vcall_offset == 0)
+- aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
++ aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
++ 0, false);
+ else
+ {
+ gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
+@@ -11927,7 +12081,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+ plus_constant (Pmode, this_rtx, delta));
+ else
+ aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
+- temp1, temp0, false);
++ temp1, temp0, 0, false);
+ }
+
+ if (Pmode == ptr_mode)
+@@ -30962,6 +31116,9 @@ aarch64_libgcc_floating_mode_supported_p
+ #undef TARGET_EXTRA_LIVE_ON_ENTRY
+ #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
+
++#undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
++#define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
++
+ #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
+ #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
+
+diff --git a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
+index 8b0755014..dc5c097bd 100644
+--- a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
++++ b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
+@@ -7,3 +7,4 @@ void f4 () __arm_out("za");
+ void f5 () __arm_inout("za");
+ void f6 () __arm_preserves("za");
+ __arm_new("za") void f7 () {}
++__arm_locally_streaming void f8 () {}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
+index fcabe3edc..22f5facfd 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
++++ b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
+@@ -7,3 +7,4 @@ void f4 () __arm_out("za");
+ void f5 () __arm_inout("za");
+ void f6 () __arm_preserves("za");
+ __arm_new("za") void f7 () {}
++__arm_locally_streaming void f8 () {}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
+new file mode 100644
+index 000000000..20ff4b87d
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
+@@ -0,0 +1,466 @@
++// { dg-options "-O -fomit-frame-pointer" }
++// { dg-final { check-function-bodies "**" "" } }
++
++void consume_za () [[arm::streaming, arm::inout("za")]];
++
++/*
++** n_ls:
++** sub sp, sp, #?80
++** cntd x16
++** str x16, \[sp\]
++** stp d8, d9, \[sp, #?16\]
++** stp d10, d11, \[sp, #?32\]
++** stp d12, d13, \[sp, #?48\]
++** stp d14, d15, \[sp, #?64\]
++** smstart sm
++** smstop sm
++** ldp d8, d9, \[sp, #?16\]
++** ldp d10, d11, \[sp, #?32\]
++** ldp d12, d13, \[sp, #?48\]
++** ldp d14, d15, \[sp, #?64\]
++** add sp, sp, #?80
++** ret
++*/
++[[arm::locally_streaming]] void
++n_ls ()
++{
++ asm ("");
++}
++
++/*
++** s_ls:
++** ret
++*/
++[[arm::locally_streaming]] void
++s_ls () [[arm::streaming]]
++{
++ asm ("");
++}
++
++/*
++** sc_ls:
++** stp x29, x30, \[sp, #?-96\]!
++** mov x29, sp
++** cntd x16
++** str x16, \[sp, #?24\]
++** stp d8, d9, \[sp, #?32\]
++** stp d10, d11, \[sp, #?48\]
++** stp d12, d13, \[sp, #?64\]
++** stp d14, d15, \[sp, #?80\]
++** mrs x16, svcr
++** str x16, \[x29, #?16\]
++** tbnz x16, 0, [^\n]+
++** smstart sm
++** ldr x16, \[x29, #?16\]
++** tbnz x16, 0, [^\n]+
++** smstop sm
++** ldp d8, d9, \[sp, #?32\]
++** ldp d10, d11, \[sp, #?48\]
++** ldp d12, d13, \[sp, #?64\]
++** ldp d14, d15, \[sp, #?80\]
++** ldp x29, x30, \[sp\], #?96
++** ret
++*/
++[[arm::locally_streaming]] void
++sc_ls () [[arm::streaming_compatible]]
++{
++ asm ("");
++}
++
++/*
++** n_ls_new_za:
++** str x30, \[sp, #?-80\]!
++** cntd x16
++** str x16, \[sp, #?8\]
++** stp d8, d9, \[sp, #?16\]
++** stp d10, d11, \[sp, #?32\]
++** stp d12, d13, \[sp, #?48\]
++** stp d14, d15, \[sp, #?64\]
++** smstart sm
++** mrs (x[0-9]+), tpidr2_el0
++** cbz \1, [^\n]+
++** bl __arm_tpidr2_save
++** msr tpidr2_el0, xzr
++** zero { za }
++** smstart za
++** bl consume_za
++** smstop za
++** smstop sm
++** ldp d8, d9, \[sp, #?16\]
++** ldp d10, d11, \[sp, #?32\]
++** ldp d12, d13, \[sp, #?48\]
++** ldp d14, d15, \[sp, #?64\]
++** ldr x30, \[sp\], #?80
++** ret
++*/
++[[arm::locally_streaming, arm::new("za")]] void
++n_ls_new_za ()
++{
++ consume_za ();
++ asm ("");
++}
++
++/*
++** s_ls_new_za:
++** str x30, \[sp, #?-16\]!
++** mrs (x[0-9]+), tpidr2_el0
++** cbz \1, [^\n]+
++** bl __arm_tpidr2_save
++** msr tpidr2_el0, xzr
++** zero { za }
++** smstart za
++** bl consume_za
++** smstop za
++** ldr x30, \[sp\], #?16
++** ret
++*/
++[[arm::locally_streaming, arm::new("za")]] void
++s_ls_new_za () [[arm::streaming]]
++{
++ consume_za ();
++ asm ("");
++}
++
++/*
++** sc_ls_new_za:
++** stp x29, x30, \[sp, #?-96\]!
++** mov x29, sp
++** cntd x16
++** str x16, \[sp, #?24\]
++** stp d8, d9, \[sp, #?32\]
++** stp d10, d11, \[sp, #?48\]
++** stp d12, d13, \[sp, #?64\]
++** stp d14, d15, \[sp, #?80\]
++** mrs x16, svcr
++** str x16, \[x29, #?16\]
++** tbnz x16, 0, [^\n]+
++** smstart sm
++** mrs (x[0-9]+), tpidr2_el0
++** cbz \1, [^\n]+
++** bl __arm_tpidr2_save
++** msr tpidr2_el0, xzr
++** zero { za }
++** smstart za
++** bl consume_za
++** smstop za
++** ldr x16, \[x29, #?16\]
++** tbnz x16, 0, [^\n]+
++** smstop sm
++** ldp d8, d9, \[sp, #?32\]
++** ldp d10, d11, \[sp, #?48\]
++** ldp d12, d13, \[sp, #?64\]
++** ldp d14, d15, \[sp, #?80\]
++** ldp x29, x30, \[sp\], #?96
++** ret
++*/
++[[arm::locally_streaming, arm::new("za")]] void
++sc_ls_new_za () [[arm::streaming_compatible]]
++{
++ consume_za ();
++ asm ("");
++}
++
++/*
++** n_ls_shared_za:
++** str x30, \[sp, #?-80\]!
++** cntd x16
++** str x16, \[sp, #?8\]
++** stp d8, d9, \[sp, #?16\]
++** stp d10, d11, \[sp, #?32\]
++** stp d12, d13, \[sp, #?48\]
++** stp d14, d15, \[sp, #?64\]
++** smstart sm
++** bl consume_za
++** smstop sm
++** ldp d8, d9, \[sp, #?16\]
++** ldp d10, d11, \[sp, #?32\]
++** ldp d12, d13, \[sp, #?48\]
++** ldp d14, d15, \[sp, #?64\]
++** ldr x30, \[sp\], #?80
++** ret
++*/
++[[arm::locally_streaming]] void
++n_ls_shared_za () [[arm::inout("za")]]
++{
++ consume_za ();
++ asm ("");
++}
++
++/*
++** s_ls_shared_za:
++** str x30, \[sp, #?-16\]!
++** bl consume_za
++** ldr x30, \[sp\], #?16
++** ret
++*/
++[[arm::locally_streaming]] void
++s_ls_shared_za () [[arm::streaming, arm::inout("za")]]
++{
++ consume_za ();
++ asm ("");
++}
++
++/*
++** sc_ls_shared_za:
++** stp x29, x30, \[sp, #?-96\]!
++** mov x29, sp
++** cntd x16
++** str x16, \[sp, #?24\]
++** stp d8, d9, \[sp, #?32\]
++** stp d10, d11, \[sp, #?48\]
++** stp d12, d13, \[sp, #?64\]
++** stp d14, d15, \[sp, #?80\]
++** mrs x16, svcr
++** str x16, \[x29, #?16\]
++** tbnz x16, 0, [^\n]+
++** smstart sm
++** bl consume_za
++** ldr x16, \[x29, #?16\]
++** tbnz x16, 0, [^\n]+
++** smstop sm
++** ldp d8, d9, \[sp, #?32\]
++** ldp d10, d11, \[sp, #?48\]
++** ldp d12, d13, \[sp, #?64\]
++** ldp d14, d15, \[sp, #?80\]
++** ldp x29, x30, \[sp\], #?96
++** ret
++*/
++[[arm::locally_streaming]] void
++sc_ls_shared_za () [[arm::streaming_compatible, arm::inout("za")]]
++{
++ consume_za ();
++ asm ("");
++}
++
++/*
++** n_ls_vector_pcs:
++** sub sp, sp, #?272
++** cntd x16
++** str x16, \[sp\]
++** stp q8, q9, \[sp, #?16\]
++** stp q10, q11, \[sp, #?48\]
++** stp q12, q13, \[sp, #?80\]
++** stp q14, q15, \[sp, #?112\]
++** stp q16, q17, \[sp, #?144\]
++** stp q18, q19, \[sp, #?176\]
++** stp q20, q21, \[sp, #?208\]
++** stp q22, q23, \[sp, #?240\]
++** smstart sm
++** smstop sm
++** ldp q8, q9, \[sp, #?16\]
++** ldp q10, q11, \[sp, #?48\]
++** ldp q12, q13, \[sp, #?80\]
++** ldp q14, q15, \[sp, #?112\]
++** ldp q16, q17, \[sp, #?144\]
++** ldp q18, q19, \[sp, #?176\]
++** ldp q20, q21, \[sp, #?208\]
++** ldp q22, q23, \[sp, #?240\]
++** add sp, sp, #?272
++** ret
++*/
++[[arm::locally_streaming]] void __attribute__((aarch64_vector_pcs))
++n_ls_vector_pcs ()
++{
++ asm ("");
++}
++
++/*
++** n_ls_sve_pcs:
++** sub sp, sp, #?16
++** cntd x16
++** str x16, \[sp\]
++** addsvl sp, sp, #-18
++** str p4, \[sp\]
++** str p5, \[sp, #1, mul vl\]
++** str p6, \[sp, #2, mul vl\]
++** str p7, \[sp, #3, mul vl\]
++** str p8, \[sp, #4, mul vl\]
++** str p9, \[sp, #5, mul vl\]
++** str p10, \[sp, #6, mul vl\]
++** str p11, \[sp, #7, mul vl\]
++** str p12, \[sp, #8, mul vl\]
++** str p13, \[sp, #9, mul vl\]
++** str p14, \[sp, #10, mul vl\]
++** str p15, \[sp, #11, mul vl\]
++** str z8, \[sp, #2, mul vl\]
++** str z9, \[sp, #3, mul vl\]
++** str z10, \[sp, #4, mul vl\]
++** str z11, \[sp, #5, mul vl\]
++** str z12, \[sp, #6, mul vl\]
++** str z13, \[sp, #7, mul vl\]
++** str z14, \[sp, #8, mul vl\]
++** str z15, \[sp, #9, mul vl\]
++** str z16, \[sp, #10, mul vl\]
++** str z17, \[sp, #11, mul vl\]
++** str z18, \[sp, #12, mul vl\]
++** str z19, \[sp, #13, mul vl\]
++** str z20, \[sp, #14, mul vl\]
++** str z21, \[sp, #15, mul vl\]
++** str z22, \[sp, #16, mul vl\]
++** str z23, \[sp, #17, mul vl\]
++** addvl sp, sp, #-1
++** str p0, \[sp\]
++** smstart sm
++** ldr p0, \[sp\]
++** addvl sp, sp, #1
++** smstop sm
++** ldr z8, \[sp, #2, mul vl\]
++** ldr z9, \[sp, #3, mul vl\]
++** ldr z10, \[sp, #4, mul vl\]
++** ldr z11, \[sp, #5, mul vl\]
++** ldr z12, \[sp, #6, mul vl\]
++** ldr z13, \[sp, #7, mul vl\]
++** ldr z14, \[sp, #8, mul vl\]
++** ldr z15, \[sp, #9, mul vl\]
++** ldr z16, \[sp, #10, mul vl\]
++** ldr z17, \[sp, #11, mul vl\]
++** ldr z18, \[sp, #12, mul vl\]
++** ldr z19, \[sp, #13, mul vl\]
++** ldr z20, \[sp, #14, mul vl\]
++** ldr z21, \[sp, #15, mul vl\]
++** ldr z22, \[sp, #16, mul vl\]
++** ldr z23, \[sp, #17, mul vl\]
++** ldr p4, \[sp\]
++** ldr p5, \[sp, #1, mul vl\]
++** ldr p6, \[sp, #2, mul vl\]
++** ldr p7, \[sp, #3, mul vl\]
++** ldr p8, \[sp, #4, mul vl\]
++** ldr p9, \[sp, #5, mul vl\]
++** ldr p10, \[sp, #6, mul vl\]
++** ldr p11, \[sp, #7, mul vl\]
++** ldr p12, \[sp, #8, mul vl\]
++** ldr p13, \[sp, #9, mul vl\]
++** ldr p14, \[sp, #10, mul vl\]
++** ldr p15, \[sp, #11, mul vl\]
++** addsvl sp, sp, #18
++** add sp, sp, #?16
++** ret
++*/
++[[arm::locally_streaming]] void
++n_ls_sve_pcs (__SVBool_t x)
++{
++ asm ("");
++}
++
++/*
++** n_ls_v0:
++** addsvl sp, sp, #-1
++** ...
++** smstart sm
++** add x[0-9]+, [^\n]+
++** smstop sm
++** ...
++** addsvl sp, sp, #1
++** ...
++*/
++#define TEST(VN) __SVInt32_t VN; asm ("" :: "r" (&VN));
++[[arm::locally_streaming]] void
++n_ls_v0 ()
++{
++ TEST (v0);
++}
++
++/*
++** n_ls_v32:
++** addsvl sp, sp, #-32
++** ...
++** smstart sm
++** ...
++** smstop sm
++** ...
++** rdsvl (x[0-9]+), #1
++** lsl (x[0-9]+), \1, #?5
++** add sp, sp, \2
++** ...
++*/
++[[arm::locally_streaming]] void
++n_ls_v32 ()
++{
++ TEST (v0);
++ TEST (v1);
++ TEST (v2);
++ TEST (v3);
++ TEST (v4);
++ TEST (v5);
++ TEST (v6);
++ TEST (v7);
++ TEST (v8);
++ TEST (v9);
++ TEST (v10);
++ TEST (v11);
++ TEST (v12);
++ TEST (v13);
++ TEST (v14);
++ TEST (v15);
++ TEST (v16);
++ TEST (v17);
++ TEST (v18);
++ TEST (v19);
++ TEST (v20);
++ TEST (v21);
++ TEST (v22);
++ TEST (v23);
++ TEST (v24);
++ TEST (v25);
++ TEST (v26);
++ TEST (v27);
++ TEST (v28);
++ TEST (v29);
++ TEST (v30);
++ TEST (v31);
++}
++
++/*
++** n_ls_v33:
++** rdsvl (x[0-9]+), #1
++** mov (x[0-9]+), #?33
++** mul (x[0-9]+), (?:\1, \2|\2, \1)
++** sub sp, sp, \3
++** ...
++** smstart sm
++** ...
++** smstop sm
++** ...
++** rdsvl (x[0-9]+), #1
++** mov (x[0-9]+), #?33
++** mul (x[0-9]+), (?:\4, \5|\5, \4)
++** add sp, sp, \6
++** ...
++*/
++[[arm::locally_streaming]] void
++n_ls_v33 ()
++{
++ TEST (v0);
++ TEST (v1);
++ TEST (v2);
++ TEST (v3);
++ TEST (v4);
++ TEST (v5);
++ TEST (v6);
++ TEST (v7);
++ TEST (v8);
++ TEST (v9);
++ TEST (v10);
++ TEST (v11);
++ TEST (v12);
++ TEST (v13);
++ TEST (v14);
++ TEST (v15);
++ TEST (v16);
++ TEST (v17);
++ TEST (v18);
++ TEST (v19);
++ TEST (v20);
++ TEST (v21);
++ TEST (v22);
++ TEST (v23);
++ TEST (v24);
++ TEST (v25);
++ TEST (v26);
++ TEST (v27);
++ TEST (v28);
++ TEST (v29);
++ TEST (v30);
++ TEST (v31);
++ TEST (v32);
++}
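From the caller's side none of this is visible: a locally-streaming function has a normal ABI, so the usual callee-saved guarantees hold even though the callee switches PSTATE.SM internally. A hypothetical caller of the n_ls test above:

    /* D8-D15 remain usable across the call because n_ls saves and
       restores them around its SMSTART/SMSTOP pair.  */
    extern void n_ls (void);

    double
    keeps_value_live (double x)
    {
      n_ls ();
      return x;   /* x may have been kept in d8-d15 across the call */
    }
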
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
+new file mode 100644
+index 000000000..0eba99385
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
+@@ -0,0 +1,177 @@
++// { dg-options "-O -fomit-frame-pointer" }
++// { dg-final { check-function-bodies "**" "" } }
++
++#include <arm_neon.h>
++#include <arm_sve.h>
++
++/*
++** test_d0:
++** ...
++** smstart sm
++** ...
++** fmov x10, d0
++** smstop sm
++** fmov d0, x10
++** ...
++*/
++[[arm::locally_streaming]] double
++test_d0 ()
++{
++ asm ("");
++ return 1.0f;
++}
++
++/*
++** test_d0_vec:
++** ...
++** smstart sm
++** ...
++** (
++** fmov x10, d0
++** |
++** umov x10, v0.d\[0\]
++** )
++** smstop sm
++** fmov d0, x10
++** ...
++*/
++[[arm::locally_streaming]] int8x8_t
++test_d0_vec ()
++{
++ asm ("");
++ return (int8x8_t) {};
++}
++
++/*
++** test_q0:
++** ...
++** smstart sm
++** ...
++** str q0, \[sp, #?-16\]!
++** smstop sm
++** ldr q0, \[sp\], #?16
++** ...
++*/
++[[arm::locally_streaming]] int8x16_t
++test_q0 ()
++{
++ asm ("");
++ return (int8x16_t) {};
++}
++
++/*
++** test_q1:
++** ...
++** smstart sm
++** ...
++** stp q0, q1, \[sp, #?-32\]!
++** smstop sm
++** ldp q0, q1, \[sp\], #?32
++** ...
++*/
++[[arm::locally_streaming]] int8x16x2_t
++test_q1 ()
++{
++ asm ("");
++ return (int8x16x2_t) {};
++}
++
++/*
++** test_q2:
++** ...
++** smstart sm
++** ...
++** stp q0, q1, \[sp, #?-48\]!
++** str q2, \[sp, #?32\]
++** smstop sm
++** ldr q2, \[sp, #?32\]
++** ldp q0, q1, \[sp\], #?48
++** ...
++*/
++[[arm::locally_streaming]] int8x16x3_t
++test_q2 ()
++{
++ asm ("");
++ return (int8x16x3_t) {};
++}
++
++/*
++** test_q3:
++** ...
++** smstart sm
++** ...
++** stp q0, q1, \[sp, #?-64\]!
++** stp q2, q3, \[sp, #?32\]
++** smstop sm
++** ldp q2, q3, \[sp, #?32\]
++** ldp q0, q1, \[sp\], #?64
++** ...
++*/
++[[arm::locally_streaming]] int8x16x4_t
++test_q3 ()
++{
++ asm ("");
++ return (int8x16x4_t) {};
++}
++
++/*
++** test_z0:
++** ...
++** smstart sm
++** mov z0\.b, #0
++** addvl sp, sp, #-1
++** str z0, \[sp\]
++** smstop sm
++** ldr z0, \[sp\]
++** addvl sp, sp, #1
++** ...
++*/
++[[arm::locally_streaming]] svint8_t
++test_z0 ()
++{
++ asm ("");
++ return (svint8_t) {};
++}
++
++/*
++** test_z3:
++** ...
++** smstart sm
++** ...
++** addvl sp, sp, #-4
++** str z0, \[sp\]
++** str z1, \[sp, #1, mul vl\]
++** str z2, \[sp, #2, mul vl\]
++** str z3, \[sp, #3, mul vl\]
++** smstop sm
++** ldr z0, \[sp\]
++** ldr z1, \[sp, #1, mul vl\]
++** ldr z2, \[sp, #2, mul vl\]
++** ldr z3, \[sp, #3, mul vl\]
++** ...
++*/
++[[arm::locally_streaming]] svint8x4_t
++test_z3 ()
++{
++ asm ("");
++ return (svint8x4_t) {};
++}
++
++/*
++** test_p0:
++** ...
++** smstart sm
++** pfalse p0\.b
++** addvl sp, sp, #-1
++** str p0, \[sp\]
++** smstop sm
++** ldr p0, \[sp\]
++** addvl sp, sp, #1
++** ...
++*/
++[[arm::locally_streaming]] svbool_t
++test_p0 ()
++{
++ asm ("");
++ return (svbool_t) {};
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
+new file mode 100644
+index 000000000..2bdea6ac6
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
+@@ -0,0 +1,273 @@
++// { dg-options "-O -fomit-frame-pointer" }
++// { dg-final { check-function-bodies "**" "" } }
++
++#include <arm_neon.h>
++#include <arm_sve.h>
++
++/*
++** test_d0:
++** ...
++** fmov x10, d0
++** smstart sm
++** fmov d0, x10
++** smstop sm
++** ...
++*/
++[[arm::locally_streaming]] void
++test_d0 (double d0)
++{
++ asm ("");
++}
++
++/*
++** test_d7:
++** ...
++** fmov x10, d0
++** fmov x11, d1
++** fmov x12, d2
++** fmov x13, d3
++** fmov x14, d4
++** fmov x15, d5
++** fmov x16, d6
++** fmov x17, d7
++** smstart sm
++** fmov d0, x10
++** fmov d1, x11
++** fmov d2, x12
++** fmov d3, x13
++** fmov d4, x14
++** fmov d5, x15
++** fmov d6, x16
++** fmov d7, x17
++** smstop sm
++** ...
++*/
++[[arm::locally_streaming]] void
++test_d7 (double d0, double d1, double d2, double d3,
++ double d4, double d5, double d6, double d7)
++{
++ asm ("");
++}
++
++/*
++** test_d0_vec:
++** ...
++** (
++** fmov x10, d0
++** |
++** umov x10, v0.d\[0\]
++** )
++** smstart sm
++** fmov d0, x10
++** smstop sm
++** ...
++*/
++[[arm::locally_streaming]] void
++test_d0_vec (int8x8_t d0)
++{
++ asm ("");
++}
++
++/*
++** test_d7_vec:
++** ...
++** (
++** fmov x10, d0
++** fmov x11, d1
++** fmov x12, d2
++** fmov x13, d3
++** fmov x14, d4
++** fmov x15, d5
++** fmov x16, d6
++** fmov x17, d7
++** |
++** umov x10, v0.d\[0\]
++** umov x11, v1.d\[0\]
++** umov x12, v2.d\[0\]
++** umov x13, v3.d\[0\]
++** umov x14, v4.d\[0\]
++** umov x15, v5.d\[0\]
++** umov x16, v6.d\[0\]
++** umov x17, v7.d\[0\]
++** )
++** smstart sm
++** fmov d0, x10
++** fmov d1, x11
++** fmov d2, x12
++** fmov d3, x13
++** fmov d4, x14
++** fmov d5, x15
++** fmov d6, x16
++** fmov d7, x17
++** smstop sm
++** ...
++*/
++[[arm::locally_streaming]] void
++test_d7_vec (int8x8_t d0, int8x8_t d1, int8x8_t d2, int8x8_t d3,
++ int8x8_t d4, int8x8_t d5, int8x8_t d6, int8x8_t d7)
++{
++ asm ("");
++}
++
++/*
++** test_q0:
++** ...
++** str q0, \[sp, #?-16\]!
++** smstart sm
++** ldr q0, \[sp\], #?16
++** smstop sm
++** ...
++*/
++[[arm::locally_streaming]] void
++test_q0 (int8x16_t q0)
++{
++ asm ("");
++}
++
++/*
++** test_q7:
++** ...
++** stp q0, q1, \[sp, #?-128\]!
++** stp q2, q3, \[sp, #?32\]
++** stp q4, q5, \[sp, #?64\]
++** stp q6, q7, \[sp, #?96\]
++** smstart sm
++** ldp q2, q3, \[sp, #?32\]
++** ldp q4, q5, \[sp, #?64\]
++** ldp q6, q7, \[sp, #?96\]
++** ldp q0, q1, \[sp\], #?128
++** smstop sm
++** ...
++*/
++[[arm::locally_streaming]] void
++test_q7 (int8x16x4_t q0, int8x16x4_t q4)
++{
++ asm ("");
++}
++
++/*
++** test_z0:
++** ...
++** addvl sp, sp, #-1
++** str z0, \[sp\]
++** smstart sm
++** ldr z0, \[sp\]
++** addvl sp, sp, #1
++** smstop sm
++** ...
++*/
++[[arm::locally_streaming]] void
++test_z0 (svint8_t z0)
++{
++ asm ("");
++}
++
++/*
++** test_z7:
++** ...
++** addvl sp, sp, #-8
++** str z0, \[sp\]
++** str z1, \[sp, #1, mul vl\]
++** str z2, \[sp, #2, mul vl\]
++** str z3, \[sp, #3, mul vl\]
++** str z4, \[sp, #4, mul vl\]
++** str z5, \[sp, #5, mul vl\]
++** str z6, \[sp, #6, mul vl\]
++** str z7, \[sp, #7, mul vl\]
++** smstart sm
++** ldr z0, \[sp\]
++** ldr z1, \[sp, #1, mul vl\]
++** ldr z2, \[sp, #2, mul vl\]
++** ldr z3, \[sp, #3, mul vl\]
++** ldr z4, \[sp, #4, mul vl\]
++** ldr z5, \[sp, #5, mul vl\]
++** ldr z6, \[sp, #6, mul vl\]
++** ldr z7, \[sp, #7, mul vl\]
++** addvl sp, sp, #8
++** smstop sm
++** ...
++*/
++[[arm::locally_streaming]] void
++test_z7 (svint8x4_t z0, svint8x4_t z4)
++{
++ asm ("");
++}
++
++/*
++** test_p0:
++** ...
++** addvl sp, sp, #-1
++** str p0, \[sp\]
++** smstart sm
++** ldr p0, \[sp\]
++** addvl sp, sp, #1
++** smstop sm
++** ...
++*/
++[[arm::locally_streaming]] void
++test_p0 (svbool_t p0)
++{
++ asm ("");
++}
++
++/*
++** test_p3:
++** ...
++** addvl sp, sp, #-1
++** str p0, \[sp\]
++** str p1, \[sp, #1, mul vl\]
++** str p2, \[sp, #2, mul vl\]
++** str p3, \[sp, #3, mul vl\]
++** smstart sm
++** ldr p0, \[sp\]
++** ldr p1, \[sp, #1, mul vl\]
++** ldr p2, \[sp, #2, mul vl\]
++** ldr p3, \[sp, #3, mul vl\]
++** addvl sp, sp, #1
++** smstop sm
++** ...
++*/
++[[arm::locally_streaming]] void
++test_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
++{
++ asm ("");
++}
++
++/*
++** test_mixed:
++** ...
++** addvl sp, sp, #-3
++** str p0, \[sp\]
++** str p1, \[sp, #1, mul vl\]
++** str p2, \[sp, #2, mul vl\]
++** str p3, \[sp, #3, mul vl\]
++** str z3, \[sp, #1, mul vl\]
++** str z7, \[sp, #2, mul vl\]
++** stp q2, q6, \[sp, #?-32\]!
++** fmov w10, s0
++** fmov x11, d1
++** fmov w12, s4
++** fmov x13, d5
++** smstart sm
++** fmov s0, w10
++** fmov d1, x11
++** fmov s4, w12
++** fmov d5, x13
++** ldp q2, q6, \[sp\], #?32
++** ldr p0, \[sp\]
++** ldr p1, \[sp, #1, mul vl\]
++** ldr p2, \[sp, #2, mul vl\]
++** ldr p3, \[sp, #3, mul vl\]
++** ldr z3, \[sp, #1, mul vl\]
++** ldr z7, \[sp, #2, mul vl\]
++** addvl sp, sp, #3
++** smstop sm
++** ...
++*/
++[[arm::locally_streaming]] void
++test_mixed (float s0, double d1, float32x4_t q2, svfloat32_t z3,
++ float s4, double d5, float64x2_t q6, svfloat64_t z7,
++ svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
++{
++ asm ("");
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
+new file mode 100644
+index 000000000..42adeb152
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
+@@ -0,0 +1,145 @@
++// { dg-options "-O -fomit-frame-pointer" }
++/* { dg-final { check-function-bodies "**" "" } } */
++
++#include <arm_neon.h>
++#include <arm_sve.h>
++
++/*
++** test_d0:
++** ...
++** smstart sm
++** ...
++** fmov x10, d0
++** smstop sm
++** fmov d0, x10
++** ...
++** smstart sm
++** ...
++** smstop sm
++** ...
++*/
++void consume_d0 (double d0);
++
++__arm_locally_streaming void
++test_d0 ()
++{
++ asm ("");
++ consume_d0 (1.0);
++ asm ("");
++}
++
++/*
++** test_d7:
++** ...
++** fmov x10, d0
++** fmov x11, d1
++** fmov x12, d2
++** fmov x13, d3
++** fmov x14, d4
++** fmov x15, d5
++** fmov x16, d6
++** fmov x17, d7
++** smstop sm
++** fmov d0, x10
++** fmov d1, x11
++** fmov d2, x12
++** fmov d3, x13
++** fmov d4, x14
++** fmov d5, x15
++** fmov d6, x16
++** fmov d7, x17
++** ...
++*/
++void consume_d7 (double d0, double d1, double d2, double d3,
++ double d4, double d5, double d6, double d7);
++__arm_locally_streaming void
++test_d7 ()
++{
++ asm ("");
++ consume_d7 (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
++ asm ("");
++}
++
++/*
++** test_q7:
++** ...
++** stp q0, q1, \[sp, #?-128\]!
++** stp q2, q3, \[sp, #?32\]
++** stp q4, q5, \[sp, #?64\]
++** stp q6, q7, \[sp, #?96\]
++** smstop sm
++** ldp q2, q3, \[sp, #?32\]
++** ldp q4, q5, \[sp, #?64\]
++** ldp q6, q7, \[sp, #?96\]
++** ldp q0, q1, \[sp\], #?128
++** ...
++*/
++void consume_q7 (int8x16x4_t q0, int8x16x4_t q4);
++
++__arm_locally_streaming void
++test_q7 (int8x16x4_t *ptr)
++{
++ asm ("");
++ consume_q7 (ptr[0], ptr[1]);
++ asm ("");
++}
++
++/*
++** test_z7:
++** ...
++** addvl sp, sp, #-8
++** str z0, \[sp\]
++** str z1, \[sp, #1, mul vl\]
++** str z2, \[sp, #2, mul vl\]
++** str z3, \[sp, #3, mul vl\]
++** str z4, \[sp, #4, mul vl\]
++** str z5, \[sp, #5, mul vl\]
++** str z6, \[sp, #6, mul vl\]
++** str z7, \[sp, #7, mul vl\]
++** smstop sm
++** ldr z0, \[sp\]
++** ldr z1, \[sp, #1, mul vl\]
++** ldr z2, \[sp, #2, mul vl\]
++** ldr z3, \[sp, #3, mul vl\]
++** ldr z4, \[sp, #4, mul vl\]
++** ldr z5, \[sp, #5, mul vl\]
++** ldr z6, \[sp, #6, mul vl\]
++** ldr z7, \[sp, #7, mul vl\]
++** addvl sp, sp, #8
++** ...
++*/
++void consume_z7 (svint8x4_t z0, svint8x4_t z4);
++
++__arm_locally_streaming void
++test_z7 (svint8x4_t *ptr1, svint8x4_t *ptr2)
++{
++ asm ("");
++ consume_z7 (*ptr1, *ptr2);
++ asm ("");
++}
++
++/*
++** test_p3:
++** ...
++** addvl sp, sp, #-1
++** str p0, \[sp\]
++** str p1, \[sp, #1, mul vl\]
++** str p2, \[sp, #2, mul vl\]
++** str p3, \[sp, #3, mul vl\]
++** smstop sm
++** ldr p0, \[sp\]
++** ldr p1, \[sp, #1, mul vl\]
++** ldr p2, \[sp, #2, mul vl\]
++** ldr p3, \[sp, #3, mul vl\]
++** addvl sp, sp, #1
++** ...
++*/
++void consume_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3);
++
++__arm_locally_streaming void
++test_p3 (svbool_t *ptr1, svbool_t *ptr2, svbool_t *ptr3, svbool_t *ptr4)
++{
++ asm ("");
++ consume_p3 (*ptr1, *ptr2, *ptr3, *ptr4);
++ asm ("");
++}
+--
+2.33.0
+