author | CoprDistGit <infra@openeuler.org> | 2025-02-28 10:03:49 +0000
---|---|---
committer | CoprDistGit <infra@openeuler.org> | 2025-02-28 10:03:49 +0000
commit | 73127104a245052cd5cf29cdaaca3e5c32c70348 (patch) |
tree | 8e28b63e478c43c252f18b49836dff7313affe54 /0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch |
parent | 49d3feaf4665cdb07576fc1a2382a4d82a612d35 (diff) |
automatic import of gcc (openeuler24.03_LTS_SP1)
Diffstat (limited to '0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch')
-rw-r--r-- | 0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch | 1748 |
1 file changed, 1748 insertions, 0 deletions
diff --git a/0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch b/0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch
new file mode 100644
index 0000000..72576e3
--- /dev/null
+++ b/0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch
@@ -0,0 +1,1748 @@
+From 0ad41f11bea5c303ff39c54cae8e46afdfae6070 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 5 Dec 2023 10:11:29 +0000
+Subject: [PATCH 113/157] [Backport][SME] aarch64: Add support for
+ __arm_locally_streaming
+
+Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=3f6e5991fab507aa79121dc44d1afcd622c78744
+
+This patch adds support for the __arm_locally_streaming attribute,
+which allows a function to use SME internally without changing
+the function's ABI.  The attribute is valid but redundant for
+__arm_streaming functions.
+
+gcc/
+	* config/aarch64/aarch64.cc (aarch64_arm_attribute_table): Add
+	arm::locally_streaming.
+	(aarch64_fndecl_is_locally_streaming): New function.
+	(aarch64_fndecl_sm_state): Handle locally-streaming functions.
+	(aarch64_cfun_enables_pstate_sm): New function.
+	(aarch64_add_offset): Add an argument that specifies whether
+	the streaming vector length should be used instead of the
+	prevailing one.
+	(aarch64_split_add_offset, aarch64_add_sp, aarch64_sub_sp): Likewise.
+	(aarch64_allocate_and_probe_stack_space): Likewise.
+	(aarch64_expand_mov_immediate): Update calls accordingly.
+	(aarch64_need_old_pstate_sm): Return true for locally-streaming
+	streaming-compatible functions.
+	(aarch64_layout_frame): Force all call-preserved Z and P registers
+	to be saved and restored if the function switches PSTATE.SM in the
+	prologue.
+	(aarch64_get_separate_components): Disable shrink-wrapping of
+	such Z and P saves and restores.
+	(aarch64_use_late_prologue_epilogue): New function.
+	(aarch64_expand_prologue): Measure SVE lengths in the streaming
+	vector length for locally-streaming functions, then emit code
+	to enable streaming mode.
+	(aarch64_expand_epilogue): Likewise in reverse.
+	(TARGET_USE_LATE_PROLOGUE_EPILOGUE): Define.
+	* config/aarch64/aarch64-c.cc (aarch64_define_unconditional_macros):
+	Define __arm_locally_streaming.
+
+gcc/testsuite/
+	* gcc.target/aarch64/sme/locally_streaming_1.c: New test.
+	* gcc.target/aarch64/sme/locally_streaming_2.c: Likewise.
+	* gcc.target/aarch64/sme/locally_streaming_3.c: Likewise.
+	* gcc.target/aarch64/sme/locally_streaming_4.c: Likewise.
+	* gcc.target/aarch64/sme/keyword_macros_1.c: Add
+	__arm_locally_streaming.
+	* g++.target/aarch64/sme/keyword_macros_1.C: Likewise.
+---
+ gcc/config/aarch64/aarch64-c.cc               |   1 +
+ gcc/config/aarch64/aarch64.cc                 | 233 +++++++--
+ .../g++.target/aarch64/sme/keyword_macros_1.C |   1 +
+ .../gcc.target/aarch64/sme/keyword_macros_1.c |   1 +
+ .../aarch64/sme/locally_streaming_1.c         | 466 ++++++++++++++++++
+ .../aarch64/sme/locally_streaming_2.c         | 177 +++++++
+ .../aarch64/sme/locally_streaming_3.c         | 273 ++++++++++
+ .../aarch64/sme/locally_streaming_4.c         | 145 ++++++
+ 8 files changed, 1259 insertions(+), 38 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
+
+diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
+index cb8a6c2fc..745719d8b 100644
+--- a/gcc/config/aarch64/aarch64-c.cc
++++ b/gcc/config/aarch64/aarch64-c.cc
+@@ -86,6 +86,7 @@ aarch64_define_unconditional_macros (cpp_reader *pfile)
+
+   DEFINE_ARM_KEYWORD_MACRO ("streaming");
+   DEFINE_ARM_KEYWORD_MACRO ("streaming_compatible");
++  DEFINE_ARM_KEYWORD_MACRO ("locally_streaming");
+
+ #undef DEFINE_ARM_KEYWORD_MACRO
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 113784e31..4cb43c2e2 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -3283,6 +3283,7 @@ static const attribute_spec aarch64_arm_attributes[] =
+     NULL, attr_streaming_exclusions },
+   { "streaming_compatible", 0, 0, false, true, true, true,
+     NULL, attr_streaming_exclusions },
++  { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
+   { "new", 1, -1, true, false, false, false,
+     handle_arm_new, NULL },
+   { "preserves", 1, -1, false, true, true, true,
+@@ -4657,6 +4658,16 @@ aarch64_fntype_isa_mode (const_tree fntype)
+	  | aarch64_fntype_pstate_za (fntype));
+ }
+
++/* Return true if FNDECL uses streaming mode internally, as an
++   implementation choice.  */
++
++static bool
++aarch64_fndecl_is_locally_streaming (const_tree fndecl)
++{
++  return lookup_attribute ("arm", "locally_streaming",
++			   DECL_ATTRIBUTES (fndecl));
++}
++
+ /* Return the state of PSTATE.SM when compiling the body of
+    function FNDECL.  This might be different from the state of
+    PSTATE.SM on entry.  */
+@@ -4664,6 +4675,9 @@ aarch64_fntype_isa_mode (const_tree fntype)
+ static aarch64_feature_flags
+ aarch64_fndecl_pstate_sm (const_tree fndecl)
+ {
++  if (aarch64_fndecl_is_locally_streaming (fndecl))
++    return AARCH64_FL_SM_ON;
++
+   return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
+ }
+
+@@ -4739,6 +4753,16 @@ aarch64_cfun_has_new_state (const char *state_name)
+   return aarch64_fndecl_has_new_state (cfun->decl, state_name);
+ }
+
++/* Return true if PSTATE.SM is 1 in the body of the current function,
++   but is not guaranteed to be 1 on entry.  */
++
++static bool
++aarch64_cfun_enables_pstate_sm ()
++{
++  return (aarch64_fndecl_is_locally_streaming (cfun->decl)
++	  && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON);
++}
++
+ /* Return true if the current function has state STATE_NAME, either by
+    creating new state itself or by sharing state with callers.  */
+
+@@ -6931,6 +6955,10 @@ aarch64_add_offset_temporaries (rtx x)
+    TEMP2, if nonnull, is a second temporary register that doesn't
+    overlap either DEST or REG.
+
++   FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
++   is measured relative to the SME vector length instead of the current
++   prevailing vector length.  It is 0 otherwise.
++
+    Since this function may be used to adjust the stack pointer, we must
+    ensure that it cannot cause transient stack deallocation (for example
+    by first incrementing SP and then decrementing when adjusting by a
+@@ -6939,6 +6967,7 @@ aarch64_add_offset_temporaries (rtx x)
+ static void
+ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+		    poly_int64 offset, rtx temp1, rtx temp2,
++		    aarch64_feature_flags force_isa_mode,
+		    bool frame_related_p, bool emit_move_imm = true)
+ {
+   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
+@@ -6951,9 +6980,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+   /* Try using ADDVL or ADDPL to add the whole value.  */
+   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
+     {
+-      rtx offset_rtx = gen_int_mode (offset, mode);
++      gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
++      rtx offset_rtx;
++      if (force_isa_mode == 0)
++	offset_rtx = gen_int_mode (offset, mode);
++      else
++	offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
+       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
+       RTX_FRAME_RELATED_P (insn) = frame_related_p;
++      if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
++	add_reg_note (insn, REG_CFA_ADJUST_CFA,
++		      gen_rtx_SET (dest, plus_constant (Pmode, src,
++							offset)));
+       return;
+     }
+
+@@ -6969,11 +7007,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+   if (src != const0_rtx
+       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
+     {
+-      rtx offset_rtx = gen_int_mode (poly_offset, mode);
++      rtx offset_rtx;
++      if (force_isa_mode == 0)
++	offset_rtx = gen_int_mode (poly_offset, mode);
++      else
++	offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
+       if (frame_related_p)
+	{
+	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
+	  RTX_FRAME_RELATED_P (insn) = true;
++	  if (force_isa_mode & AARCH64_FL_SM_ON)
++	    add_reg_note (insn, REG_CFA_ADJUST_CFA,
++			  gen_rtx_SET (dest, plus_constant (Pmode, src,
++							    poly_offset)));
+	  src = dest;
+	}
+       else
+@@ -7004,9 +7050,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+       rtx val;
+       if (IN_RANGE (rel_factor, -32, 31))
+	{
++	  if (force_isa_mode & AARCH64_FL_SM_ON)
++	    {
++	      /* Try to use an unshifted RDSVL, otherwise fall back on
++		 a shifted RDSVL #1.  */
++	      if (aarch64_sve_rdvl_addvl_factor_p (factor))
++		shift = 0;
++	      else
++		factor = rel_factor * 16;
++	      val = aarch64_sme_vq_immediate (mode, factor, 0);
++	    }
+	  /* Try to use an unshifted CNT[BHWD] or RDVL.  */
+-	  if (aarch64_sve_cnt_factor_p (factor)
+-	      || aarch64_sve_rdvl_addvl_factor_p (factor))
++	  else if (aarch64_sve_cnt_factor_p (factor)
++		   || aarch64_sve_rdvl_addvl_factor_p (factor))
+	    {
+	      val = gen_int_mode (poly_int64 (factor, factor), mode);
+	      shift = 0;
+@@ -7036,11 +7092,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+	     a shift and add sequence for the multiplication.
+	     If CNTB << SHIFT is out of range, stick with the current
+	     shift factor.  */
+-	  if (IN_RANGE (low_bit, 2, 16 * 16))
++	  if (force_isa_mode == 0
++	      && IN_RANGE (low_bit, 2, 16 * 16))
+	    {
+	      val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
+	      shift = 0;
+	    }
++	  else if ((force_isa_mode & AARCH64_FL_SM_ON)
++		   && aarch64_sve_rdvl_addvl_factor_p (low_bit))
++	    {
++	      val = aarch64_sme_vq_immediate (mode, low_bit, 0);
++	      shift = 0;
++	    }
+	  else
+	    val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
+
+@@ -7128,30 +7191,34 @@ aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+			  rtx offset_rtx, rtx temp1, rtx temp2)
+ {
+   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
+-		      temp1, temp2, false);
++		      temp1, temp2, 0, false);
+ }
+
+ /* Add DELTA to the stack pointer, marking the instructions frame-related.
+-   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
+-   if TEMP1 already contains abs (DELTA).  */
++   TEMP1 is available as a temporary if nonnull.  FORCE_ISA_MODE is as
++   for aarch64_add_offset.  EMIT_MOVE_IMM is false if TEMP1 already
++   contains abs (DELTA).  */
+
+ static inline void
+-aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
++aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
++		aarch64_feature_flags force_isa_mode, bool emit_move_imm)
+ {
+   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
+-		      temp1, temp2, true, emit_move_imm);
++		      temp1, temp2, force_isa_mode, true, emit_move_imm);
+ }
+
+ /* Subtract DELTA from the stack pointer, marking the instructions
+-   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
+-   if nonnull.  */
++   frame-related if FRAME_RELATED_P.  FORCE_ISA_MODE is as for
++   aarch64_add_offset.  TEMP1 is available as a temporary if nonnull.  */
+
+ static inline void
+-aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
+-		bool emit_move_imm = true)
++aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
++		aarch64_feature_flags force_isa_mode,
++		bool frame_related_p, bool emit_move_imm = true)
+ {
+   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
+-		      temp1, temp2, frame_related_p, emit_move_imm);
++		      temp1, temp2, force_isa_mode, frame_related_p,
++		      emit_move_imm);
+ }
+
+ /* A streaming-compatible function needs to switch temporarily to the known
+@@ -8176,11 +8243,11 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+	    {
+	      base = aarch64_force_temporary (int_mode, dest, base);
+	      aarch64_add_offset (int_mode, dest, base, offset,
+-				  NULL_RTX, NULL_RTX, false);
++				  NULL_RTX, NULL_RTX, 0, false);
+	    }
+	  else
+	    aarch64_add_offset (int_mode, dest, base, offset,
+-				dest, NULL_RTX, false);
++				dest, NULL_RTX, 0, false);
+	}
+       return;
+     }
+@@ -8207,7 +8274,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+	  gcc_assert (can_create_pseudo_p ());
+	  base = aarch64_force_temporary (int_mode, dest, base);
+	  aarch64_add_offset (int_mode, dest, base, const_offset,
+-			      NULL_RTX, NULL_RTX, false);
++			      NULL_RTX, NULL_RTX, 0, false);
+	  return;
+	}
+
+@@ -8247,7 +8314,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+	  gcc_assert(can_create_pseudo_p ());
+	  base = aarch64_force_temporary (int_mode, dest, base);
+	  aarch64_add_offset (int_mode, dest, base, const_offset,
+-			      NULL_RTX, NULL_RTX, false);
++			      NULL_RTX, NULL_RTX, 0, false);
+	  return;
+	}
+       /* FALLTHRU */
+@@ -9755,6 +9822,9 @@ aarch64_need_old_pstate_sm ()
+   if (aarch64_cfun_incoming_pstate_sm () != 0)
+     return false;
+
++  if (aarch64_cfun_enables_pstate_sm ())
++    return true;
++
+   if (cfun->machine->call_switches_pstate_sm)
+     for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
+       if (auto *call = dyn_cast<rtx_call_insn *> (insn))
+@@ -9781,6 +9851,7 @@ aarch64_layout_frame (void)
+   bool frame_related_fp_reg_p = false;
+   aarch64_frame &frame = cfun->machine->frame;
+   poly_int64 top_of_locals = -1;
++  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
+
+   vec_safe_truncate (frame.saved_gprs, 0);
+   vec_safe_truncate (frame.saved_fprs, 0);
+@@ -9818,7 +9889,7 @@ aarch64_layout_frame (void)
+	frame.reg_offset[regno] = SLOT_REQUIRED;
+
+   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+-    if (df_regs_ever_live_p (regno)
++    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
+	&& !fixed_regs[regno]
+	&& !crtl->abi->clobbers_full_reg_p (regno))
+       {
+@@ -9847,7 +9918,7 @@ aarch64_layout_frame (void)
+       }
+
+   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
+-    if (df_regs_ever_live_p (regno)
++    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
+	&& !fixed_regs[regno]
+	&& !crtl->abi->clobbers_full_reg_p (regno))
+       frame.reg_offset[regno] = SLOT_REQUIRED;
+@@ -9964,7 +10035,8 @@ aarch64_layout_frame (void)
+   /* If the current function changes the SVE vector length, ensure that the
+      old value of the DWARF VG register is saved and available in the CFI,
+      so that outer frames with VL-sized offsets can be processed correctly.  */
+-  if (cfun->machine->call_switches_pstate_sm)
++  if (cfun->machine->call_switches_pstate_sm
++      || aarch64_cfun_enables_pstate_sm ())
+     {
+       frame.reg_offset[VG_REGNUM] = offset;
+       offset += UNITS_PER_WORD;
+@@ -10749,9 +10821,16 @@ aarch64_get_separate_components (void)
+   bitmap_clear (components);
+
+   /* The registers we need saved to the frame.  */
++  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
+   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
+     if (aarch64_register_saved_on_entry (regno))
+       {
++	/* Disallow shrink wrapping for registers that will be clobbered
++	   by an SMSTART SM in the prologue.  */
++	if (enables_pstate_sm
++	    && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
++	  continue;
++
+	/* Punt on saves and restores that use ST1D and LD1D.  We could
+	   try to be smarter, but it would involve making sure that the
+	   spare predicate register itself is safe to use at the save
+@@ -11070,11 +11149,16 @@ aarch64_emit_stack_tie (rtx reg)
+    events, e.g. if we were to allow the stack to be dropped by more than a page
+    and then have multiple probes up and we take a signal somewhere in between
+    then the signal handler doesn't know the state of the stack and can make no
+-   assumptions about which pages have been probed.  */
++   assumptions about which pages have been probed.
++
++   FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
++   is measured relative to the SME vector length instead of the current
++   prevailing vector length.  It is 0 otherwise.  */
+
+ static void
+ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+					poly_int64 poly_size,
++					aarch64_feature_flags force_isa_mode,
+					bool frame_related_p,
+					bool final_adjustment_p)
+ {
+@@ -11116,7 +11200,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+   if (known_lt (poly_size, min_probe_threshold)
+       || !flag_stack_clash_protection)
+     {
+-      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
++      aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
++		      frame_related_p);
+       return;
+     }
+
+@@ -11133,7 +11218,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+
+       /* First calculate the amount of bytes we're actually spilling.  */
+       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
+-			  poly_size, temp1, temp2, false, true);
++			  poly_size, temp1, temp2, force_isa_mode,
++			  false, true);
+
+       rtx_insn *insn = get_last_insn ();
+
+@@ -11191,7 +11277,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+     {
+       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
+	{
+-	  aarch64_sub_sp (NULL, temp2, guard_size, true);
++	  aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
+	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+					   guard_used_by_caller));
+	  emit_insn (gen_blockage ());
+@@ -11202,7 +11288,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+     {
+       /* Compute the ending address.  */
+       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
+-			  temp1, NULL, false, true);
++			  temp1, NULL, force_isa_mode, false, true);
+       rtx_insn *insn = get_last_insn ();
+
+       /* For the initial allocation, we don't have a frame pointer
+@@ -11268,7 +11354,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+   if (final_adjustment_p && rounded_size != 0)
+     min_probe_threshold = 0;
+
+-  aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
++  aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
+   if (residual >= min_probe_threshold)
+     {
+       if (dump_file)
+@@ -11333,6 +11419,14 @@ aarch64_epilogue_uses (int regno)
+   return 0;
+ }
+
++/* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE.  */
++
++static bool
++aarch64_use_late_prologue_epilogue ()
++{
++  return aarch64_cfun_enables_pstate_sm ();
++}
++
+ /* The current function's frame has a save slot for the incoming state
+    of SVCR.  Return a legitimate memory for the slot, based on the hard
+    frame pointer.  */
+@@ -11469,6 +11563,9 @@ aarch64_expand_prologue (void)
+   unsigned reg2 = frame.wb_push_candidate2;
+   bool emit_frame_chain = frame.emit_frame_chain;
+   rtx_insn *insn;
++  aarch64_feature_flags force_isa_mode = 0;
++  if (aarch64_cfun_enables_pstate_sm ())
++    force_isa_mode = AARCH64_FL_SM_ON;
+
+   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
+     {
+@@ -11530,7 +11627,7 @@ aarch64_expand_prologue (void)
+      less the amount of the guard reserved for use by the caller's
+      outgoing args.  */
+   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
+-					  true, false);
++					  force_isa_mode, true, false);
+
+   if (callee_adjust != 0)
+     aarch64_push_regs (reg1, reg2, callee_adjust);
+@@ -11553,7 +11650,8 @@ aarch64_expand_prologue (void)
+       gcc_assert (known_eq (chain_offset, 0));
+       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
+			  stack_pointer_rtx, chain_offset,
+-			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
++			  tmp1_rtx, tmp0_rtx, force_isa_mode,
++			  frame_pointer_needed);
+       if (frame_pointer_needed && !frame_size.is_constant ())
+	{
+	  /* Variable-sized frames need to describe the save slot
+@@ -11600,6 +11698,7 @@ aarch64_expand_prologue (void)
+	      || known_eq (initial_adjust, 0));
+   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
+					  sve_callee_adjust,
++					  force_isa_mode,
+					  !frame_pointer_needed, false);
+   bytes_below_sp -= sve_callee_adjust;
+     }
+@@ -11612,12 +11711,15 @@ aarch64_expand_prologue (void)
+      that is assumed by the called.  */
+   gcc_assert (known_eq (bytes_below_sp, final_adjust));
+   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
++					  force_isa_mode,
+					  !frame_pointer_needed, true);
+   if (emit_frame_chain && maybe_ne (final_adjust, 0))
+     aarch64_emit_stack_tie (hard_frame_pointer_rtx);
+
+-  /* Save the incoming value of PSTATE.SM, if required.  */
+-  if (known_ge (frame.old_svcr_offset, 0))
++  /* Save the incoming value of PSTATE.SM, if required.  Code further
++     down does this for locally-streaming functions.  */
++  if (known_ge (frame.old_svcr_offset, 0)
++      && !aarch64_cfun_enables_pstate_sm ())
+     {
+       rtx mem = aarch64_old_svcr_mem ();
+       MEM_VOLATILE_P (mem) = 1;
+@@ -11649,6 +11751,34 @@ aarch64_expand_prologue (void)
+	  emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
+	}
+     }
++
++  /* Enable PSTATE.SM, if required.  */
++  if (aarch64_cfun_enables_pstate_sm ())
++    {
++      rtx_insn *guard_label = nullptr;
++      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
++	{
++	  /* The current function is streaming-compatible.  Save the
++	     original state of PSTATE.SM.  */
++	  rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
++	  emit_insn (gen_aarch64_read_svcr (svcr));
++	  emit_move_insn (aarch64_old_svcr_mem (), svcr);
++	  guard_label = aarch64_guard_switch_pstate_sm (svcr,
++							aarch64_isa_flags);
++	}
++      aarch64_sme_mode_switch_regs args_switch;
++      auto &args = crtl->args.info;
++      for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
++	{
++	  rtx x = args.sme_mode_switch_args[i];
++	  args_switch.add_reg (GET_MODE (x), REGNO (x));
++	}
++      args_switch.emit_prologue ();
++      emit_insn (gen_aarch64_smstart_sm ());
++      args_switch.emit_epilogue ();
++      if (guard_label)
++	emit_label (guard_label);
++    }
+ }
+
+ /* Return TRUE if we can use a simple_return insn.
+@@ -11695,6 +11825,9 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+   HOST_WIDE_INT guard_size
+     = 1 << param_stack_clash_protection_guard_size;
+   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
++  aarch64_feature_flags force_isa_mode = 0;
++  if (aarch64_cfun_enables_pstate_sm ())
++    force_isa_mode = AARCH64_FL_SM_ON;
+
+   /* We can re-use the registers when:
+
+@@ -11719,6 +11852,24 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+     = maybe_ne (get_frame_size ()
+		+ frame.saved_varargs_size, 0);
+
++  /* Reset PSTATE.SM, if required.  */
++  if (aarch64_cfun_enables_pstate_sm ())
++    {
++      rtx_insn *guard_label = nullptr;
++      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
++	guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
++						      aarch64_isa_flags);
++      aarch64_sme_mode_switch_regs return_switch;
++      if (crtl->return_rtx && REG_P (crtl->return_rtx))
++	return_switch.add_reg (GET_MODE (crtl->return_rtx),
++			       REGNO (crtl->return_rtx));
++      return_switch.emit_prologue ();
++      emit_insn (gen_aarch64_smstop_sm ());
++      return_switch.emit_epilogue ();
++      if (guard_label)
++	emit_label (guard_label);
++    }
++
+   /* Emit a barrier to prevent loads from a deallocated stack.  */
+   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
+       || cfun->calls_alloca
+@@ -11739,19 +11890,21 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+     aarch64_add_offset (Pmode, stack_pointer_rtx,
+			hard_frame_pointer_rtx,
+			-bytes_below_hard_fp + final_adjust,
+-			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
++			tmp1_rtx, tmp0_rtx, force_isa_mode,
++			callee_adjust == 0);
+   else
+     /* The case where we need to re-use the register here is very rare, so
+        avoid the complicated condition and just always emit a move if the
+        immediate doesn't fit.  */
+-    aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
++    aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
+
+   /* Restore the vector registers before the predicate registers,
+      so that we can use P4 as a temporary for big-endian SVE frames.  */
+   aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
+   aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
+   if (maybe_ne (sve_callee_adjust, 0))
+-    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
++    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
++		    force_isa_mode, true);
+
+   /* When shadow call stack is enabled, the scs_pop in the epilogue will
+      restore x30, we don't need to restore x30 again in the traditional
+@@ -11781,7 +11934,7 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+
+   /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
+      add restriction on emit_move optimization to leaf functions.  */
+-  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
++  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
+		  (!can_inherit_p || !crtl->is_leaf
+		   || df_regs_ever_live_p (EP0_REGNUM)));
+
+@@ -11914,7 +12067,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
+
+   if (vcall_offset == 0)
+-    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
++    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
++			0, false);
+   else
+     {
+       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
+@@ -11927,7 +12081,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+				       plus_constant (Pmode, this_rtx, delta));
+       else
+	aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
+-			    temp1, temp0, false);
++			    temp1, temp0, 0, false);
+     }
+
+   if (Pmode == ptr_mode)
+@@ -30962,6 +31116,9 @@ aarch64_libgcc_floating_mode_supported_p
+ #undef TARGET_EXTRA_LIVE_ON_ENTRY
+ #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
+
++#undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
++#define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
++
+ #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
+ #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
+
+diff --git a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
+index 8b0755014..dc5c097bd 100644
+--- a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
++++ b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
+@@ -7,3 +7,4 @@ void f4 () __arm_out("za");
+ void f5 () __arm_inout("za");
+ void f6 () __arm_preserves("za");
+ __arm_new("za") void f7 () {}
++__arm_locally_streaming void f8 () {}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
+index fcabe3edc..22f5facfd 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
++++ b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
+@@ -7,3 +7,4 @@ void f4 () __arm_out("za");
+ void f5 () __arm_inout("za");
+ void f6 () __arm_preserves("za");
+ __arm_new("za") void f7 () {}
++__arm_locally_streaming void f8 () {}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
+new file mode 100644
+index 000000000..20ff4b87d
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
+@@ -0,0 +1,466 @@
++// { dg-options "-O -fomit-frame-pointer" }
++// { dg-final { check-function-bodies "**" "" } }
++
++void consume_za () [[arm::streaming, arm::inout("za")]];
++
++/*
++** n_ls:
++**	sub	sp, sp, #?80
++**	cntd	x16
++**	str	x16, \[sp\]
++**	stp	d8, d9, \[sp, #?16\]
++**	stp	d10, d11, \[sp, #?32\]
++**	stp	d12, d13, \[sp, #?48\]
++**	stp	d14, d15, \[sp, #?64\]
++**	smstart	sm
++**	smstop	sm
++**	ldp	d8, d9, \[sp, #?16\]
++**	ldp	d10, d11, \[sp, #?32\]
++**	ldp	d12, d13, \[sp, #?48\]
++**	ldp	d14, d15, \[sp, #?64\]
++**	add	sp, sp, #?80
++**	ret
++*/
++[[arm::locally_streaming]] void
++n_ls ()
++{
++  asm ("");
++}
++
++/*
++** s_ls:
++**	ret
++*/
++[[arm::locally_streaming]] void
++s_ls () [[arm::streaming]]
++{
++  asm ("");
++}
++
++/*
++** sc_ls:
++**	stp	x29, x30, \[sp, #?-96\]!
++**	mov	x29, sp
++**	cntd	x16
++**	str	x16, \[sp, #?24\]
++**	stp	d8, d9, \[sp, #?32\]
++**	stp	d10, d11, \[sp, #?48\]
++**	stp	d12, d13, \[sp, #?64\]
++**	stp	d14, d15, \[sp, #?80\]
++**	mrs	x16, svcr
++**	str	x16, \[x29, #?16\]
++**	tbnz	x16, 0, [^\n]+
++**	smstart	sm
++**	ldr	x16, \[x29, #?16\]
++**	tbnz	x16, 0, [^\n]+
++**	smstop	sm
++**	ldp	d8, d9, \[sp, #?32\]
++**	ldp	d10, d11, \[sp, #?48\]
++**	ldp	d12, d13, \[sp, #?64\]
++**	ldp	d14, d15, \[sp, #?80\]
++**	ldp	x29, x30, \[sp\], #?96
++**	ret
++*/
++[[arm::locally_streaming]] void
++sc_ls () [[arm::streaming_compatible]]
++{
++  asm ("");
++}
++
++/*
++** n_ls_new_za:
++**	str	x30, \[sp, #?-80\]!
++**	cntd	x16
++**	str	x16, \[sp, #?8\]
++**	stp	d8, d9, \[sp, #?16\]
++**	stp	d10, d11, \[sp, #?32\]
++**	stp	d12, d13, \[sp, #?48\]
++**	stp	d14, d15, \[sp, #?64\]
++**	smstart	sm
++**	mrs	(x[0-9]+), tpidr2_el0
++**	cbz	\1, [^\n]+
++**	bl	__arm_tpidr2_save
++**	msr	tpidr2_el0, xzr
++**	zero	{ za }
++**	smstart	za
++**	bl	consume_za
++**	smstop	za
++**	smstop	sm
++**	ldp	d8, d9, \[sp, #?16\]
++**	ldp	d10, d11, \[sp, #?32\]
++**	ldp	d12, d13, \[sp, #?48\]
++**	ldp	d14, d15, \[sp, #?64\]
++**	ldr	x30, \[sp\], #?80
++**	ret
++*/
++[[arm::locally_streaming, arm::new("za")]] void
++n_ls_new_za ()
++{
++  consume_za ();
++  asm ("");
++}
++
++/*
++** s_ls_new_za:
++**	str	x30, \[sp, #?-16\]!
++**	mrs	(x[0-9]+), tpidr2_el0
++**	cbz	\1, [^\n]+
++**	bl	__arm_tpidr2_save
++**	msr	tpidr2_el0, xzr
++**	zero	{ za }
++**	smstart	za
++**	bl	consume_za
++**	smstop	za
++**	ldr	x30, \[sp\], #?16
++**	ret
++*/
++[[arm::locally_streaming, arm::new("za")]] void
++s_ls_new_za () [[arm::streaming]]
++{
++  consume_za ();
++  asm ("");
++}
++
++/*
++** sc_ls_new_za:
++**	stp	x29, x30, \[sp, #?-96\]!
++**	mov	x29, sp
++**	cntd	x16
++**	str	x16, \[sp, #?24\]
++**	stp	d8, d9, \[sp, #?32\]
++**	stp	d10, d11, \[sp, #?48\]
++**	stp	d12, d13, \[sp, #?64\]
++**	stp	d14, d15, \[sp, #?80\]
++**	mrs	x16, svcr
++**	str	x16, \[x29, #?16\]
++**	tbnz	x16, 0, [^\n]+
++**	smstart	sm
++**	mrs	(x[0-9]+), tpidr2_el0
++**	cbz	\1, [^\n]+
++**	bl	__arm_tpidr2_save
++**	msr	tpidr2_el0, xzr
++**	zero	{ za }
++**	smstart	za
++**	bl	consume_za
++**	smstop	za
++**	ldr	x16, \[x29, #?16\]
++**	tbnz	x16, 0, [^\n]+
++**	smstop	sm
++**	ldp	d8, d9, \[sp, #?32\]
++**	ldp	d10, d11, \[sp, #?48\]
++**	ldp	d12, d13, \[sp, #?64\]
++**	ldp	d14, d15, \[sp, #?80\]
++**	ldp	x29, x30, \[sp\], #?96
++**	ret
++*/
++[[arm::locally_streaming, arm::new("za")]] void
++sc_ls_new_za () [[arm::streaming_compatible]]
++{
++  consume_za ();
++  asm ("");
++}
++
++/*
++** n_ls_shared_za:
++**	str	x30, \[sp, #?-80\]!
++**	cntd	x16
++**	str	x16, \[sp, #?8\]
++**	stp	d8, d9, \[sp, #?16\]
++**	stp	d10, d11, \[sp, #?32\]
++**	stp	d12, d13, \[sp, #?48\]
++**	stp	d14, d15, \[sp, #?64\]
++**	smstart	sm
++**	bl	consume_za
++**	smstop	sm
++**	ldp	d8, d9, \[sp, #?16\]
++**	ldp	d10, d11, \[sp, #?32\]
++**	ldp	d12, d13, \[sp, #?48\]
++**	ldp	d14, d15, \[sp, #?64\]
++**	ldr	x30, \[sp\], #?80
++**	ret
++*/
++[[arm::locally_streaming]] void
++n_ls_shared_za () [[arm::inout("za")]]
++{
++  consume_za ();
++  asm ("");
++}
++
++/*
++** s_ls_shared_za:
++**	str	x30, \[sp, #?-16\]!
++**	bl	consume_za
++**	ldr	x30, \[sp\], #?16
++**	ret
++*/
++[[arm::locally_streaming]] void
++s_ls_shared_za () [[arm::streaming, arm::inout("za")]]
++{
++  consume_za ();
++  asm ("");
++}
++
++/*
++** sc_ls_shared_za:
++**	stp	x29, x30, \[sp, #?-96\]!
++**	mov	x29, sp
++**	cntd	x16
++**	str	x16, \[sp, #?24\]
++**	stp	d8, d9, \[sp, #?32\]
++**	stp	d10, d11, \[sp, #?48\]
++**	stp	d12, d13, \[sp, #?64\]
++**	stp	d14, d15, \[sp, #?80\]
++**	mrs	x16, svcr
++**	str	x16, \[x29, #?16\]
++**	tbnz	x16, 0, [^\n]+
++**	smstart	sm
++**	bl	consume_za
++**	ldr	x16, \[x29, #?16\]
++**	tbnz	x16, 0, [^\n]+
++**	smstop	sm
++**	ldp	d8, d9, \[sp, #?32\]
++**	ldp	d10, d11, \[sp, #?48\]
++**	ldp	d12, d13, \[sp, #?64\]
++**	ldp	d14, d15, \[sp, #?80\]
++**	ldp	x29, x30, \[sp\], #?96
++**	ret
++*/
++[[arm::locally_streaming]] void
++sc_ls_shared_za () [[arm::streaming_compatible, arm::inout("za")]]
++{
++  consume_za ();
++  asm ("");
++}
++
++/*
++** n_ls_vector_pcs:
++**	sub	sp, sp, #?272
++**	cntd	x16
++**	str	x16, \[sp\]
++**	stp	q8, q9, \[sp, #?16\]
++**	stp	q10, q11, \[sp, #?48\]
++**	stp	q12, q13, \[sp, #?80\]
++**	stp	q14, q15, \[sp, #?112\]
++**	stp	q16, q17, \[sp, #?144\]
++**	stp	q18, q19, \[sp, #?176\]
++**	stp	q20, q21, \[sp, #?208\]
++**	stp	q22, q23, \[sp, #?240\]
++**	smstart	sm
++**	smstop	sm
++**	ldp	q8, q9, \[sp, #?16\]
++**	ldp	q10, q11, \[sp, #?48\]
++**	ldp	q12, q13, \[sp, #?80\]
++**	ldp	q14, q15, \[sp, #?112\]
++**	ldp	q16, q17, \[sp, #?144\]
++**	ldp	q18, q19, \[sp, #?176\]
++**	ldp	q20, q21, \[sp, #?208\]
++**	ldp	q22, q23, \[sp, #?240\]
++**	add	sp, sp, #?272
++**	ret
++*/
++[[arm::locally_streaming]] void __attribute__((aarch64_vector_pcs))
++n_ls_vector_pcs ()
++{
++  asm ("");
++}
++
++/*
++** n_ls_sve_pcs:
++**	sub	sp, sp, #?16
++**	cntd	x16
++**	str	x16, \[sp\]
++**	addsvl	sp, sp, #-18
++**	str	p4, \[sp\]
++**	str	p5, \[sp, #1, mul vl\]
++**	str	p6, \[sp, #2, mul vl\]
++**	str	p7, \[sp, #3, mul vl\]
++**	str	p8, \[sp, #4, mul vl\]
++**	str	p9, \[sp, #5, mul vl\]
++**	str	p10, \[sp, #6, mul vl\]
++**	str	p11, \[sp, #7, mul vl\]
++**	str	p12, \[sp, #8, mul vl\]
++**	str	p13, \[sp, #9, mul vl\]
++**	str	p14, \[sp, #10, mul vl\]
++**	str	p15, \[sp, #11, mul vl\]
++**	str	z8, \[sp, #2, mul vl\]
++**	str	z9, \[sp, #3, mul vl\]
++**	str	z10, \[sp, #4, mul vl\]
++**	str	z11, \[sp, #5, mul vl\]
++**	str	z12, \[sp, #6, mul vl\]
++**	str	z13, \[sp, #7, mul vl\]
++**	str	z14, \[sp, #8, mul vl\]
++**	str	z15, \[sp, #9, mul vl\]
++**	str	z16, \[sp, #10, mul vl\]
++**	str	z17, \[sp, #11, mul vl\]
++**	str	z18, \[sp, #12, mul vl\]
++**	str	z19, \[sp, #13, mul vl\]
++**	str	z20, \[sp, #14, mul vl\]
++**	str	z21, \[sp, #15, mul vl\]
++**	str	z22, \[sp, #16, mul vl\]
++**	str	z23, \[sp, #17, mul vl\]
++**	addvl	sp, sp, #-1
++**	str	p0, \[sp\]
++**	smstart	sm
++**	ldr	p0, \[sp\]
++**	addvl	sp, sp, #1
++**	smstop	sm
++**	ldr	z8, \[sp, #2, mul vl\]
++**	ldr	z9, \[sp, #3, mul vl\]
++**	ldr	z10, \[sp, #4, mul vl\]
++**	ldr	z11, \[sp, #5, mul vl\]
++**	ldr	z12, \[sp, #6, mul vl\]
++**	ldr	z13, \[sp, #7, mul vl\]
++**	ldr	z14, \[sp, #8, mul vl\]
++**	ldr	z15, \[sp, #9, mul vl\]
++**	ldr	z16, \[sp, #10, mul vl\]
++**	ldr	z17, \[sp, #11, mul vl\]
++**	ldr	z18, \[sp, #12, mul vl\]
++**	ldr	z19, \[sp, #13, mul vl\]
++**	ldr	z20, \[sp, #14, mul vl\]
++**	ldr	z21, \[sp, #15, mul vl\]
++**	ldr	z22, \[sp, #16, mul vl\]
++**	ldr	z23, \[sp, #17, mul vl\]
++**	ldr	p4, \[sp\]
++**	ldr	p5, \[sp, #1, mul vl\]
++**	ldr	p6, \[sp, #2, mul vl\]
++**	ldr	p7, \[sp, #3, mul vl\]
++**	ldr	p8, \[sp, #4, mul vl\]
++**	ldr	p9, \[sp, #5, mul vl\]
++**	ldr	p10, \[sp, #6, mul vl\]
++**	ldr	p11, \[sp, #7, mul vl\]
++**	ldr	p12, \[sp, #8, mul vl\]
++**	ldr	p13, \[sp, #9, mul vl\]
++**	ldr	p14, \[sp, #10, mul vl\]
++**	ldr	p15, \[sp, #11, mul vl\]
++**	addsvl	sp, sp, #18
++**	add	sp, sp, #?16
++**	ret
++*/
++[[arm::locally_streaming]] void
++n_ls_sve_pcs (__SVBool_t x)
++{
++  asm ("");
++}
++
++/*
++** n_ls_v0:
++**	addsvl	sp, sp, #-1
++**	...
++**	smstart	sm
++**	add	x[0-9]+, [^\n]+
++**	smstop	sm
++**	...
++**	addsvl	sp, sp, #1
++**	...
++*/
++#define TEST(VN) __SVInt32_t VN; asm ("" :: "r" (&VN));
++[[arm::locally_streaming]] void
++n_ls_v0 ()
++{
++  TEST (v0);
++}
++
++/*
++** n_ls_v32:
++**	addsvl	sp, sp, #-32
++**	...
++**	smstart	sm
++**	...
++**	smstop	sm
++**	...
++**	rdsvl	(x[0-9]+), #1
++**	lsl	(x[0-9]+), \1, #?5
++**	add	sp, sp, \2
++**	...
++*/
++[[arm::locally_streaming]] void
++n_ls_v32 ()
++{
++  TEST (v0);
++  TEST (v1);
++  TEST (v2);
++  TEST (v3);
++  TEST (v4);
++  TEST (v5);
++  TEST (v6);
++  TEST (v7);
++  TEST (v8);
++  TEST (v9);
++  TEST (v10);
++  TEST (v11);
++  TEST (v12);
++  TEST (v13);
++  TEST (v14);
++  TEST (v15);
++  TEST (v16);
++  TEST (v17);
++  TEST (v18);
++  TEST (v19);
++  TEST (v20);
++  TEST (v21);
++  TEST (v22);
++  TEST (v23);
++  TEST (v24);
++  TEST (v25);
++  TEST (v26);
++  TEST (v27);
++  TEST (v28);
++  TEST (v29);
++  TEST (v30);
++  TEST (v31);
++}
++
++/*
++** n_ls_v33:
++**	rdsvl	(x[0-9]+), #1
++**	mov	(x[0-9]+), #?33
++**	mul	(x[0-9]+), (?:\1, \2|\2, \1)
++**	sub	sp, sp, \3
++**	...
++**	smstart	sm
++**	...
++**	smstop	sm
++**	...
++**	rdsvl	(x[0-9]+), #1
++**	mov	(x[0-9]+), #?33
++**	mul	(x[0-9]+), (?:\4, \5|\5, \4)
++**	add	sp, sp, \6
++**	...
++*/
++[[arm::locally_streaming]] void
++n_ls_v33 ()
++{
++  TEST (v0);
++  TEST (v1);
++  TEST (v2);
++  TEST (v3);
++  TEST (v4);
++  TEST (v5);
++  TEST (v6);
++  TEST (v7);
++  TEST (v8);
++  TEST (v9);
++  TEST (v10);
++  TEST (v11);
++  TEST (v12);
++  TEST (v13);
++  TEST (v14);
++  TEST (v15);
++  TEST (v16);
++  TEST (v17);
++  TEST (v18);
++  TEST (v19);
++  TEST (v20);
++  TEST (v21);
++  TEST (v22);
++  TEST (v23);
++  TEST (v24);
++  TEST (v25);
++  TEST (v26);
++  TEST (v27);
++  TEST (v28);
++  TEST (v29);
++  TEST (v30);
++  TEST (v31);
++  TEST (v32);
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
+new file mode 100644
+index 000000000..0eba99385
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
+@@ -0,0 +1,177 @@
++// { dg-options "-O -fomit-frame-pointer" }
++// { dg-final { check-function-bodies "**" "" } }
++
++#include <arm_neon.h>
++#include <arm_sve.h>
++
++/*
++** test_d0:
++**	...
++**	smstart	sm
++**	...
++**	fmov	x10, d0
++**	smstop	sm
++**	fmov	d0, x10
++**	...
++*/
++[[arm::locally_streaming]] double
++test_d0 ()
++{
++  asm ("");
++  return 1.0f;
++}
++
++/*
++** test_d0_vec:
++**	...
++**	smstart	sm
++**	...
++** (
++**	fmov	x10, d0
++** |
++**	umov	x10, v0.d\[0\]
++** )
++**	smstop	sm
++**	fmov	d0, x10
++**	...
++*/
++[[arm::locally_streaming]] int8x8_t
++test_d0_vec ()
++{
++  asm ("");
++  return (int8x8_t) {};
++}
++
++/*
++** test_q0:
++**	...
++**	smstart	sm
++**	...
++**	str	q0, \[sp, #?-16\]!
++**	smstop	sm
++**	ldr	q0, \[sp\], #?16
++**	...
++*/
++[[arm::locally_streaming]] int8x16_t
++test_q0 ()
++{
++  asm ("");
++  return (int8x16_t) {};
++}
++
++/*
++** test_q1:
++**	...
++**	smstart	sm
++**	...
++**	stp	q0, q1, \[sp, #?-32\]!
++**	smstop	sm
++**	ldp	q0, q1, \[sp\], #?32
++**	...
++*/
++[[arm::locally_streaming]] int8x16x2_t
++test_q1 ()
++{
++  asm ("");
++  return (int8x16x2_t) {};
++}
++
++/*
++** test_q2:
++**	...
++**	smstart	sm
++**	...
++**	stp	q0, q1, \[sp, #?-48\]!
++**	str	q2, \[sp, #?32\]
++**	smstop	sm
++**	ldr	q2, \[sp, #?32\]
++**	ldp	q0, q1, \[sp\], #?48
++**	...
++*/
++[[arm::locally_streaming]] int8x16x3_t
++test_q2 ()
++{
++  asm ("");
++  return (int8x16x3_t) {};
++}
++
++/*
++** test_q3:
++**	...
++**	smstart	sm
++**	...
++**	stp	q0, q1, \[sp, #?-64\]!
++**	stp	q2, q3, \[sp, #?32\]
++**	smstop	sm
++**	ldp	q2, q3, \[sp, #?32\]
++**	ldp	q0, q1, \[sp\], #?64
++**	...
++*/
++[[arm::locally_streaming]] int8x16x4_t
++test_q3 ()
++{
++  asm ("");
++  return (int8x16x4_t) {};
++}
++
++/*
++** test_z0:
++**	...
++**	smstart	sm
++**	mov	z0\.b, #0
++**	addvl	sp, sp, #-1
++**	str	z0, \[sp\]
++**	smstop	sm
++**	ldr	z0, \[sp\]
++**	addvl	sp, sp, #1
++**	...
++*/
++[[arm::locally_streaming]] svint8_t
++test_z0 ()
++{
++  asm ("");
++  return (svint8_t) {};
++}
++
++/*
++** test_z3:
++**	...
++**	smstart	sm
++**	...
++**	addvl	sp, sp, #-4
++**	str	z0, \[sp\]
++**	str	z1, \[sp, #1, mul vl\]
++**	str	z2, \[sp, #2, mul vl\]
++**	str	z3, \[sp, #3, mul vl\]
++**	smstop	sm
++**	ldr	z0, \[sp\]
++**	ldr	z1, \[sp, #1, mul vl\]
++**	ldr	z2, \[sp, #2, mul vl\]
++**	ldr	z3, \[sp, #3, mul vl\]
++**	...
++*/
++[[arm::locally_streaming]] svint8x4_t
++test_z3 ()
++{
++  asm ("");
++  return (svint8x4_t) {};
++}
++
++/*
++** test_p0:
++**	...
++**	smstart	sm
++**	pfalse	p0\.b
++**	addvl	sp, sp, #-1
++**	str	p0, \[sp\]
++**	smstop	sm
++**	ldr	p0, \[sp\]
++**	addvl	sp, sp, #1
++**	...
++*/
++[[arm::locally_streaming]] svbool_t
++test_p0 ()
++{
++  asm ("");
++  return (svbool_t) {};
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
+new file mode 100644
+index 000000000..2bdea6ac6
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
+@@ -0,0 +1,273 @@
++// { dg-options "-O -fomit-frame-pointer" }
++// { dg-final { check-function-bodies "**" "" } }
++
++#include <arm_neon.h>
++#include <arm_sve.h>
++
++/*
++** test_d0:
++**	...
++**	fmov	x10, d0
++**	smstart	sm
++**	fmov	d0, x10
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_d0 (double d0)
++{
++  asm ("");
++}
++
++/*
++** test_d7:
++**	...
++**	fmov	x10, d0
++**	fmov	x11, d1
++**	fmov	x12, d2
++**	fmov	x13, d3
++**	fmov	x14, d4
++**	fmov	x15, d5
++**	fmov	x16, d6
++**	fmov	x17, d7
++**	smstart	sm
++**	fmov	d0, x10
++**	fmov	d1, x11
++**	fmov	d2, x12
++**	fmov	d3, x13
++**	fmov	d4, x14
++**	fmov	d5, x15
++**	fmov	d6, x16
++**	fmov	d7, x17
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_d7 (double d0, double d1, double d2, double d3,
++	 double d4, double d5, double d6, double d7)
++{
++  asm ("");
++}
++
++/*
++** test_d0_vec:
++**	...
++** (
++**	fmov	x10, d0
++** |
++**	umov	x10, v0.d\[0\]
++** )
++**	smstart	sm
++**	fmov	d0, x10
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_d0_vec (int8x8_t d0)
++{
++  asm ("");
++}
++
++/*
++** test_d7_vec:
++**	...
++** (
++**	fmov	x10, d0
++**	fmov	x11, d1
++**	fmov	x12, d2
++**	fmov	x13, d3
++**	fmov	x14, d4
++**	fmov	x15, d5
++**	fmov	x16, d6
++**	fmov	x17, d7
++** |
++**	umov	x10, v0.d\[0\]
++**	umov	x11, v1.d\[0\]
++**	umov	x12, v2.d\[0\]
++**	umov	x13, v3.d\[0\]
++**	umov	x14, v4.d\[0\]
++**	umov	x15, v5.d\[0\]
++**	umov	x16, v6.d\[0\]
++**	umov	x17, v7.d\[0\]
++** )
++**	smstart	sm
++**	fmov	d0, x10
++**	fmov	d1, x11
++**	fmov	d2, x12
++**	fmov	d3, x13
++**	fmov	d4, x14
++**	fmov	d5, x15
++**	fmov	d6, x16
++**	fmov	d7, x17
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_d7_vec (int8x8_t d0, int8x8_t d1, int8x8_t d2, int8x8_t d3,
++	     int8x8_t d4, int8x8_t d5, int8x8_t d6, int8x8_t d7)
++{
++  asm ("");
++}
++
++/*
++** test_q0:
++**	...
++**	str	q0, \[sp, #?-16\]!
++**	smstart	sm
++**	ldr	q0, \[sp\], #?16
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_q0 (int8x16_t q0)
++{
++  asm ("");
++}
++
++/*
++** test_q7:
++**	...
++**	stp	q0, q1, \[sp, #?-128\]!
++**	stp	q2, q3, \[sp, #?32\]
++**	stp	q4, q5, \[sp, #?64\]
++**	stp	q6, q7, \[sp, #?96\]
++**	smstart	sm
++**	ldp	q2, q3, \[sp, #?32\]
++**	ldp	q4, q5, \[sp, #?64\]
++**	ldp	q6, q7, \[sp, #?96\]
++**	ldp	q0, q1, \[sp\], #?128
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_q7 (int8x16x4_t q0, int8x16x4_t q4)
++{
++  asm ("");
++}
++
++/*
++** test_z0:
++**	...
++**	addvl	sp, sp, #-1
++**	str	z0, \[sp\]
++**	smstart	sm
++**	ldr	z0, \[sp\]
++**	addvl	sp, sp, #1
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_z0 (svint8_t z0)
++{
++  asm ("");
++}
++
++/*
++** test_z7:
++**	...
++**	addvl	sp, sp, #-8
++**	str	z0, \[sp\]
++**	str	z1, \[sp, #1, mul vl\]
++**	str	z2, \[sp, #2, mul vl\]
++**	str	z3, \[sp, #3, mul vl\]
++**	str	z4, \[sp, #4, mul vl\]
++**	str	z5, \[sp, #5, mul vl\]
++**	str	z6, \[sp, #6, mul vl\]
++**	str	z7, \[sp, #7, mul vl\]
++**	smstart	sm
++**	ldr	z0, \[sp\]
++**	ldr	z1, \[sp, #1, mul vl\]
++**	ldr	z2, \[sp, #2, mul vl\]
++**	ldr	z3, \[sp, #3, mul vl\]
++**	ldr	z4, \[sp, #4, mul vl\]
++**	ldr	z5, \[sp, #5, mul vl\]
++**	ldr	z6, \[sp, #6, mul vl\]
++**	ldr	z7, \[sp, #7, mul vl\]
++**	addvl	sp, sp, #8
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_z7 (svint8x4_t z0, svint8x4_t z4)
++{
++  asm ("");
++}
++
++/*
++** test_p0:
++**	...
++**	addvl	sp, sp, #-1
++**	str	p0, \[sp\]
++**	smstart	sm
++**	ldr	p0, \[sp\]
++**	addvl	sp, sp, #1
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_p0 (svbool_t p0)
++{
++  asm ("");
++}
++
++/*
++** test_p3:
++**	...
++**	addvl	sp, sp, #-1
++**	str	p0, \[sp\]
++**	str	p1, \[sp, #1, mul vl\]
++**	str	p2, \[sp, #2, mul vl\]
++**	str	p3, \[sp, #3, mul vl\]
++**	smstart	sm
++**	ldr	p0, \[sp\]
++**	ldr	p1, \[sp, #1, mul vl\]
++**	ldr	p2, \[sp, #2, mul vl\]
++**	ldr	p3, \[sp, #3, mul vl\]
++**	addvl	sp, sp, #1
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
++{
++  asm ("");
++}
++
++/*
++** test_mixed:
++**	...
++**	addvl	sp, sp, #-3
++**	str	p0, \[sp\]
++**	str	p1, \[sp, #1, mul vl\]
++**	str	p2, \[sp, #2, mul vl\]
++**	str	p3, \[sp, #3, mul vl\]
++**	str	z3, \[sp, #1, mul vl\]
++**	str	z7, \[sp, #2, mul vl\]
++**	stp	q2, q6, \[sp, #?-32\]!
++**	fmov	w10, s0
++**	fmov	x11, d1
++**	fmov	w12, s4
++**	fmov	x13, d5
++**	smstart	sm
++**	fmov	s0, w10
++**	fmov	d1, x11
++**	fmov	s4, w12
++**	fmov	d5, x13
++**	ldp	q2, q6, \[sp\], #?32
++**	ldr	p0, \[sp\]
++**	ldr	p1, \[sp, #1, mul vl\]
++**	ldr	p2, \[sp, #2, mul vl\]
++**	ldr	p3, \[sp, #3, mul vl\]
++**	ldr	z3, \[sp, #1, mul vl\]
++**	ldr	z7, \[sp, #2, mul vl\]
++**	addvl	sp, sp, #3
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_mixed (float s0, double d1, float32x4_t q2, svfloat32_t z3,
++	    float s4, double d5, float64x2_t q6, svfloat64_t z7,
++	    svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
++{
++  asm ("");
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
+new file mode 100644
+index 000000000..42adeb152
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
+@@ -0,0 +1,145 @@
++// { dg-options "-O -fomit-frame-pointer" }
++/* { dg-final { check-function-bodies "**" "" } } */
++
++#include <arm_neon.h>
++#include <arm_sve.h>
++
++/*
++** test_d0:
++**	...
++**	smstart	sm
++**	...
++**	fmov	x10, d0
++**	smstop	sm
++**	fmov	d0, x10
++**	...
++**	smstart	sm
++**	...
++**	smstop	sm
++**	...
++*/
++void consume_d0 (double d0);
++
++__arm_locally_streaming void
++test_d0 ()
++{
++  asm ("");
++  consume_d0 (1.0);
++  asm ("");
++}
++
++/*
++** test_d7:
++**	...
++**	fmov	x10, d0
++**	fmov	x11, d1
++**	fmov	x12, d2
++**	fmov	x13, d3
++**	fmov	x14, d4
++**	fmov	x15, d5
++**	fmov	x16, d6
++**	fmov	x17, d7
++**	smstop	sm
++**	fmov	d0, x10
++**	fmov	d1, x11
++**	fmov	d2, x12
++**	fmov	d3, x13
++**	fmov	d4, x14
++**	fmov	d5, x15
++**	fmov	d6, x16
++**	fmov	d7, x17
++**	...
++*/
++void consume_d7 (double d0, double d1, double d2, double d3,
++		 double d4, double d5, double d6, double d7);
++__arm_locally_streaming void
++test_d7 ()
++{
++  asm ("");
++  consume_d7 (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
++  asm ("");
++}
++
++/*
++** test_q7:
++**	...
++**	stp	q0, q1, \[sp, #?-128\]!
++**	stp	q2, q3, \[sp, #?32\]
++**	stp	q4, q5, \[sp, #?64\]
++**	stp	q6, q7, \[sp, #?96\]
++**	smstop	sm
++**	ldp	q2, q3, \[sp, #?32\]
++**	ldp	q4, q5, \[sp, #?64\]
++**	ldp	q6, q7, \[sp, #?96\]
++**	ldp	q0, q1, \[sp\], #?128
++**	...
++*/
++void consume_q7 (int8x16x4_t q0, int8x16x4_t q4);
++
++__arm_locally_streaming void
++test_q7 (int8x16x4_t *ptr)
++{
++  asm ("");
++  consume_q7 (ptr[0], ptr[1]);
++  asm ("");
++}
++
++/*
++** test_z7:
++**	...
++**	addvl	sp, sp, #-8
++**	str	z0, \[sp\]
++**	str	z1, \[sp, #1, mul vl\]
++**	str	z2, \[sp, #2, mul vl\]
++**	str	z3, \[sp, #3, mul vl\]
++**	str	z4, \[sp, #4, mul vl\]
++**	str	z5, \[sp, #5, mul vl\]
++**	str	z6, \[sp, #6, mul vl\]
++**	str	z7, \[sp, #7, mul vl\]
++**	smstop	sm
++**	ldr	z0, \[sp\]
++**	ldr	z1, \[sp, #1, mul vl\]
++**	ldr	z2, \[sp, #2, mul vl\]
++**	ldr	z3, \[sp, #3, mul vl\]
++**	ldr	z4, \[sp, #4, mul vl\]
++**	ldr	z5, \[sp, #5, mul vl\]
++**	ldr	z6, \[sp, #6, mul vl\]
++**	ldr	z7, \[sp, #7, mul vl\]
++**	addvl	sp, sp, #8
++**	...
++*/
++void consume_z7 (svint8x4_t z0, svint8x4_t z4);
++
++__arm_locally_streaming void
++test_z7 (svint8x4_t *ptr1, svint8x4_t *ptr2)
++{
++  asm ("");
++  consume_z7 (*ptr1, *ptr2);
++  asm ("");
++}
++
++/*
++** test_p3:
++**	...
++**	addvl	sp, sp, #-1
++**	str	p0, \[sp\]
++**	str	p1, \[sp, #1, mul vl\]
++**	str	p2, \[sp, #2, mul vl\]
++**	str	p3, \[sp, #3, mul vl\]
++**	smstop	sm
++**	ldr	p0, \[sp\]
++**	ldr	p1, \[sp, #1, mul vl\]
++**	ldr	p2, \[sp, #2, mul vl\]
++**	ldr	p3, \[sp, #3, mul vl\]
++**	addvl	sp, sp, #1
++**	...
++*/
++void consume_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3);
++
++__arm_locally_streaming void
++test_p3 (svbool_t *ptr1, svbool_t *ptr2, svbool_t *ptr3, svbool_t *ptr4)
++{
++  asm ("");
++  consume_p3 (*ptr1, *ptr2, *ptr3, *ptr4);
++  asm ("");
++}
+--
+2.33.0
+
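Editor's note, not part of the patch: the commit message above explains that __arm_locally_streaming lets a function run its body in streaming mode while presenting a normal AArch64 ABI to callers. A minimal usage sketch follows, assuming a GCC built with this SME backport series; the arm_sme.h header, the svcntsw() call, and the -march=armv9-a+sme flag are assumptions taken from the ACLE, not from this patch.

/* Illustrative only.  Compile with e.g. -march=armv9-a+sme (assumed flag).  */
#include <stdint.h>
#include <arm_sme.h>   /* assumed ACLE header; declares svcntsw()  */

/* Callers use the normal AArch64 ABI.  Only the body runs with
   PSTATE.SM = 1, so the compiler emits the smstart sm/smstop sm pair
   and the extra Z/P register saves shown in the tests above.  */
__arm_locally_streaming uint64_t
streaming_words (void)
{
  return svcntsw ();   /* streaming vector length in 32-bit words  */
}

From the caller's side, streaming_words () is an ordinary function call; the mode switch is entirely an implementation detail of the callee, which is exactly the property the prologue/epilogue changes in this patch implement.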