Diffstat (limited to '0205-Backport-SME-aarch64-Switch-PSTATE.SM-around-calls.patch')
-rw-r--r-- | 0205-Backport-SME-aarch64-Switch-PSTATE.SM-around-calls.patch | 3270 |
1 file changed, 3270 insertions, 0 deletions
diff --git a/0205-Backport-SME-aarch64-Switch-PSTATE.SM-around-calls.patch b/0205-Backport-SME-aarch64-Switch-PSTATE.SM-around-calls.patch
new file mode 100644
index 0000000..d0a23d2
--- /dev/null
+++ b/0205-Backport-SME-aarch64-Switch-PSTATE.SM-around-calls.patch
@@ -0,0 +1,3270 @@
+From 88a41bc24eb793eee27aa9f4ef6b763b3c3e76e6 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 5 Dec 2023 10:11:25 +0000
+Subject: [PATCH 106/157] [Backport][SME] aarch64: Switch PSTATE.SM around
+ calls
+
+Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=dd8090f40079fa41ee58d9f76b2e50ed4f95c6bf
+
+This patch adds support for switching to the appropriate SME mode
+for each call.  Switching to streaming mode requires an SMSTART SM
+instruction and switching to non-streaming mode requires an SMSTOP SM
+instruction.  If the call is being made from streaming-compatible code,
+these switches are conditional on the current mode being the opposite
+of the one that the call needs.
+
+Since changing PSTATE.SM changes the vector length and effectively
+changes the ISA, the code to do the switching has to be emitted late.
+The patch does this using a new pass that runs next to late prologue/
+epilogue insertion.  (It doesn't use md_reorg because later additions
+need the CFG.)
+
+If a streaming-compatible function needs to switch mode for a call,
+it must restore the original mode afterwards.  The old mode must
+therefore be available immediately after the call.  The easiest
+way of ensuring this is to force the use of a hard frame pointer
+and ensure that the old state is saved at an in-range offset
+from there.
+
+Changing modes clobbers the Z and P registers, so we need to
+save and restore live Z and P state around each mode switch.
+However, mode switches are not expected to be performance
+critical, so it seemed better to err on the side of being
+correct rather than trying to optimise the save and restore
+with surrounding code.
+
+gcc/
+	* config/aarch64/aarch64-passes.def
+	(pass_late_thread_prologue_and_epilogue): New pass.
+	* config/aarch64/aarch64-sme.md: New file.
+	* config/aarch64/aarch64.md: Include it.
+	(*tb<optab><mode>1): Rename to...
+	(@aarch64_tb<optab><mode>): ...this.
+	(call, call_value, sibcall, sibcall_value): Don't require operand 2
+	to be a CONST_INT.
+	* config/aarch64/aarch64-protos.h (aarch64_emit_call_insn): Return
+	the insn.
+	(make_pass_switch_sm_state): Declare.
+	* config/aarch64/aarch64.h (TARGET_STREAMING_COMPATIBLE): New macro.
+	(CALL_USED_REGISTER): Mark VG as call-preserved.
+	(aarch64_frame::old_svcr_offset): New member variable.
+	(machine_function::call_switches_sm_state): Likewise.
+	(CUMULATIVE_ARGS::num_sme_mode_switch_args): Likewise.
+	(CUMULATIVE_ARGS::sme_mode_switch_args): Likewise.
+	* config/aarch64/aarch64.cc: Include tree-pass.h and cfgbuild.h.
+	(aarch64_cfun_incoming_pstate_sm): New function.
+	(aarch64_call_switches_pstate_sm): Likewise.
+	(aarch64_reg_save_mode): Return DImode for VG_REGNUM.
+	(aarch64_callee_isa_mode): New function.
+	(aarch64_insn_callee_isa_mode): Likewise.
+	(aarch64_guard_switch_pstate_sm): Likewise.
+	(aarch64_switch_pstate_sm): Likewise.
+	(aarch64_sme_mode_switch_regs): New class.
+	(aarch64_record_sme_mode_switch_args): New function.
+	(aarch64_finish_sme_mode_switch_args): Likewise.
+	(aarch64_function_arg): Handle the end marker by returning a
+	PARALLEL that contains the ABI cookie that we used previously
+	alongside the result of aarch64_finish_sme_mode_switch_args.
+	(aarch64_init_cumulative_args): Initialize num_sme_mode_switch_args.
+	(aarch64_function_arg_advance): If a call would switch SM state,
+	record all argument registers that would need to be saved around
+	the mode switch.
+	(aarch64_need_old_pstate_sm): New function.
+	(aarch64_layout_frame): Decide whether the frame needs to store the
+	incoming value of PSTATE.SM and allocate a save slot for it if so.
+	If a function switches SME state, arrange to save the old value
+	of the DWARF VG register.  Handle the case where this is the only
+	register save slot above the FP.
+	(aarch64_save_callee_saves): Handles saves of the DWARF VG register.
+	(aarch64_get_separate_components): Prevent such saves from being
+	shrink-wrapped.
+	(aarch64_old_svcr_mem): New function.
+	(aarch64_read_old_svcr): Likewise.
+	(aarch64_guard_switch_pstate_sm): Likewise.
+	(aarch64_expand_prologue): Handle saves of the DWARF VG register.
+	Initialize any SVCR save slot.
+	(aarch64_expand_call): Allow the cookie to be PARALLEL that contains
+	both the UNSPEC_CALLEE_ABI value and a list of registers that need
+	to be preserved across a change to PSTATE.SM.  If the call does
+	involve such a change to PSTATE.SM, record the registers that
+	would be clobbered by this process.  Also emit an instruction
+	to mark the temporary change in VG.  Update call_switches_pstate_sm.
+	(aarch64_emit_call_insn): Return the emitted instruction.
+	(aarch64_frame_pointer_required): New function.
+	(aarch64_conditional_register_usage): Prevent VG_REGNUM from being
+	treated as a register operand.
+	(aarch64_switch_pstate_sm_for_call): New function.
+	(pass_data_switch_pstate_sm): New pass variable.
+	(pass_switch_pstate_sm): New pass class.
+	(make_pass_switch_pstate_sm): New function.
+	(TARGET_FRAME_POINTER_REQUIRED): Define.
+	* config/aarch64/t-aarch64 (s-check-sve-md): Add aarch64-sme.md.
+
+gcc/testsuite/
+	* gcc.target/aarch64/sme/call_sm_switch_1.c: New test.
+	* gcc.target/aarch64/sme/call_sm_switch_2.c: Likewise.
+	* gcc.target/aarch64/sme/call_sm_switch_3.c: Likewise.
+	* gcc.target/aarch64/sme/call_sm_switch_4.c: Likewise.
+	* gcc.target/aarch64/sme/call_sm_switch_5.c: Likewise.
+	* gcc.target/aarch64/sme/call_sm_switch_6.c: Likewise.
+	* gcc.target/aarch64/sme/call_sm_switch_7.c: Likewise.
+	* gcc.target/aarch64/sme/call_sm_switch_8.c: Likewise.
+	* gcc.target/aarch64/sme/call_sm_switch_9.c: Likewise.
+	* gcc.target/aarch64/sme/call_sm_switch_10.c: Likewise.
+---
+ gcc/config/aarch64/aarch64-passes.def         |   1 +
+ gcc/config/aarch64/aarch64-protos.h           |   3 +-
+ gcc/config/aarch64/aarch64-sme.md             | 171 ++++
+ gcc/config/aarch64/aarch64.cc                 | 883 +++++++++++++++++-
+ gcc/config/aarch64/aarch64.h                  |  25 +-
+ gcc/config/aarch64/aarch64.md                 |  13 +-
+ gcc/config/aarch64/t-aarch64                  |   5 +-
+ .../gcc.target/aarch64/sme/call_sm_switch_1.c | 233 +++++
+ .../aarch64/sme/call_sm_switch_10.c           |  37 +
+ .../gcc.target/aarch64/sme/call_sm_switch_2.c |  43 +
+ .../gcc.target/aarch64/sme/call_sm_switch_3.c | 166 ++++
+ .../gcc.target/aarch64/sme/call_sm_switch_4.c |  43 +
+ .../gcc.target/aarch64/sme/call_sm_switch_5.c | 318 +++++++
+ .../gcc.target/aarch64/sme/call_sm_switch_6.c |  45 +
+ .../gcc.target/aarch64/sme/call_sm_switch_7.c | 516 ++++++++++
+ .../gcc.target/aarch64/sme/call_sm_switch_8.c |  87 ++
+ .../gcc.target/aarch64/sme/call_sm_switch_9.c | 103 ++
+ 17 files changed, 2668 insertions(+), 24 deletions(-)
+ create mode 100644 gcc/config/aarch64/aarch64-sme.md
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_1.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_10.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_2.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_3.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_4.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_5.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_6.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_7.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_8.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_9.c
+
+diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def
+index a2babc112..c6cbbf2ef 100644
+--- a/gcc/config/aarch64/aarch64-passes.def
++++ b/gcc/config/aarch64/aarch64-passes.def
+@@ -20,6 +20,7 @@
+ 
+ INSERT_PASS_AFTER (pass_regrename, 1, pass_fma_steering);
+ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation);
++INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_switch_pstate_sm);
+ INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
+ INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
+ INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
+diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
+index 9b03410dc..737f47026 100644
+--- a/gcc/config/aarch64/aarch64-protos.h
++++ b/gcc/config/aarch64/aarch64-protos.h
+@@ -913,7 +913,7 @@ void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx,
+ 				   const_tree, unsigned, bool = false);
+ void aarch64_init_expanders (void);
+ void aarch64_init_simd_builtins (void);
+-void aarch64_emit_call_insn (rtx);
++rtx_call_insn *aarch64_emit_call_insn (rtx);
+ void aarch64_register_pragmas (void);
+ void aarch64_relayout_simd_types (void);
+ void aarch64_reset_previous_fndecl (void);
+@@ -1055,6 +1055,7 @@ rtl_opt_pass *make_pass_track_speculation (gcc::context *);
+ rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *);
+ rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt);
+ rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt);
++rtl_opt_pass *make_pass_switch_pstate_sm (gcc::context *ctxt);
+ 
+ poly_uint64 aarch64_regmode_natural_size (machine_mode);
+ 
+diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md
+new file mode 100644
+index 000000000..52427b4f1
+--- /dev/null
++++ b/gcc/config/aarch64/aarch64-sme.md
+@@ -0,0 +1,171 @@
++;; Machine description for AArch64 SME.
++;; Copyright (C) 2023 Free Software Foundation, Inc.
++;;
++;; This file is part of GCC.
++;;
++;; GCC is free software; you can redistribute it and/or modify it
++;; under the terms of the GNU General Public License as published by
++;; the Free Software Foundation; either version 3, or (at your option)
++;; any later version.
++;;
++;; GCC is distributed in the hope that it will be useful, but
++;; WITHOUT ANY WARRANTY; without even the implied warranty of
++;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++;; General Public License for more details.
++;;
++;; You should have received a copy of the GNU General Public License
++;; along with GCC; see the file COPYING3.  If not see
++;; <http://www.gnu.org/licenses/>.
++
++;; The file is organised into the following sections (search for the full
++;; line):
++;;
++;; == State management
++;; ---- Test current state
++;; ---- PSTATE.SM management
++
++;; =========================================================================
++;; == State management
++;; =========================================================================
++;;
++;; Many of the instructions in this section are only valid when SME is
++;; present.  However, they don't have a TARGET_SME condition since
++;; (a) they are only emitted under direct control of aarch64 code and
++;; (b) they are sometimes used conditionally, particularly in streaming-
++;; compatible code.
++;;
++;; =========================================================================
++
++;; -------------------------------------------------------------------------
++;; ---- Test current state
++;; -------------------------------------------------------------------------
++
++(define_c_enum "unspec" [
++  UNSPEC_OLD_VG_SAVED
++  UNSPEC_UPDATE_VG
++  UNSPEC_GET_SME_STATE
++  UNSPEC_READ_SVCR
++])
++
++;; A marker instruction to say that the old value of the DWARF VG register
++;; has been saved to the stack, for CFI purposes.  Operand 0 is the old
++;; value of the register and operand 1 is the save slot.
++(define_insn "aarch64_old_vg_saved"
++  [(set (reg:DI VG_REGNUM)
++	(unspec:DI [(match_operand 0)
++		    (match_operand 1)] UNSPEC_OLD_VG_SAVED))]
++  ""
++  ""
++  [(set_attr "type" "no_insn")]
++)
++
++;; A marker to indicate places where a call temporarily changes VG.
++(define_insn "aarch64_update_vg"
++  [(set (reg:DI VG_REGNUM)
++	(unspec:DI [(reg:DI VG_REGNUM)] UNSPEC_UPDATE_VG))]
++  ""
++  ""
++  [(set_attr "type" "no_insn")]
++)
++
++(define_insn "aarch64_get_sme_state"
++  [(set (reg:TI R0_REGNUM)
++	(unspec_volatile:TI [(const_int 0)] UNSPEC_GET_SME_STATE))
++   (clobber (reg:DI R16_REGNUM))
++   (clobber (reg:DI R17_REGNUM))
++   (clobber (reg:DI R18_REGNUM))
++   (clobber (reg:DI R30_REGNUM))
++   (clobber (reg:CC CC_REGNUM))]
++  ""
++  "bl\t__arm_sme_state"
++)
++
++(define_insn "aarch64_read_svcr"
++  [(set (match_operand:DI 0 "register_operand" "=r")
++	(unspec_volatile:DI [(const_int 0)] UNSPEC_READ_SVCR))]
++  ""
++  "mrs\t%0, svcr"
++)
++
++;; -------------------------------------------------------------------------
++;; ---- PSTATE.SM management
++;; -------------------------------------------------------------------------
++;; Includes:
++;; - SMSTART SM
++;; - SMSTOP SM
++;; -------------------------------------------------------------------------
++
++(define_c_enum "unspec" [
++  UNSPEC_SMSTART_SM
++  UNSPEC_SMSTOP_SM
++])
++
++;; Turn on streaming mode.  This clobbers all SVE state.
++;;
++;; Depend on VG_REGNUM to ensure that the VG save slot has already been
++;; initialized.
++(define_insn "aarch64_smstart_sm"
++  [(unspec_volatile [(const_int 0)] UNSPEC_SMSTART_SM)
++   (use (reg:DI VG_REGNUM))
++   (clobber (reg:V4x16QI V0_REGNUM))
++   (clobber (reg:V4x16QI V4_REGNUM))
++   (clobber (reg:V4x16QI V8_REGNUM))
++   (clobber (reg:V4x16QI V12_REGNUM))
++   (clobber (reg:V4x16QI V16_REGNUM))
++   (clobber (reg:V4x16QI V20_REGNUM))
++   (clobber (reg:V4x16QI V24_REGNUM))
++   (clobber (reg:V4x16QI V28_REGNUM))
++   (clobber (reg:VNx16BI P0_REGNUM))
++   (clobber (reg:VNx16BI P1_REGNUM))
++   (clobber (reg:VNx16BI P2_REGNUM))
++   (clobber (reg:VNx16BI P3_REGNUM))
++   (clobber (reg:VNx16BI P4_REGNUM))
++   (clobber (reg:VNx16BI P5_REGNUM))
++   (clobber (reg:VNx16BI P6_REGNUM))
++   (clobber (reg:VNx16BI P7_REGNUM))
++   (clobber (reg:VNx16BI P8_REGNUM))
++   (clobber (reg:VNx16BI P9_REGNUM))
++   (clobber (reg:VNx16BI P10_REGNUM))
++   (clobber (reg:VNx16BI P11_REGNUM))
++   (clobber (reg:VNx16BI P12_REGNUM))
++   (clobber (reg:VNx16BI P13_REGNUM))
++   (clobber (reg:VNx16BI P14_REGNUM))
++   (clobber (reg:VNx16BI P15_REGNUM))]
++  ""
++  "smstart\tsm"
++)
++
++;; Turn off streaming mode.  This clobbers all SVE state.
++;;
++;; Depend on VG_REGNUM to ensure that the VG save slot has already been
++;; initialized.
++(define_insn "aarch64_smstop_sm"
++  [(unspec_volatile [(const_int 0)] UNSPEC_SMSTOP_SM)
++   (use (reg:DI VG_REGNUM))
++   (clobber (reg:V4x16QI V0_REGNUM))
++   (clobber (reg:V4x16QI V4_REGNUM))
++   (clobber (reg:V4x16QI V8_REGNUM))
++   (clobber (reg:V4x16QI V12_REGNUM))
++   (clobber (reg:V4x16QI V16_REGNUM))
++   (clobber (reg:V4x16QI V20_REGNUM))
++   (clobber (reg:V4x16QI V24_REGNUM))
++   (clobber (reg:V4x16QI V28_REGNUM))
++   (clobber (reg:VNx16BI P0_REGNUM))
++   (clobber (reg:VNx16BI P1_REGNUM))
++   (clobber (reg:VNx16BI P2_REGNUM))
++   (clobber (reg:VNx16BI P3_REGNUM))
++   (clobber (reg:VNx16BI P4_REGNUM))
++   (clobber (reg:VNx16BI P5_REGNUM))
++   (clobber (reg:VNx16BI P6_REGNUM))
++   (clobber (reg:VNx16BI P7_REGNUM))
++   (clobber (reg:VNx16BI P8_REGNUM))
++   (clobber (reg:VNx16BI P9_REGNUM))
++   (clobber (reg:VNx16BI P10_REGNUM))
++   (clobber (reg:VNx16BI P11_REGNUM))
++   (clobber (reg:VNx16BI P12_REGNUM))
++   (clobber (reg:VNx16BI P13_REGNUM))
++   (clobber (reg:VNx16BI P14_REGNUM))
++   (clobber (reg:VNx16BI P15_REGNUM))]
++  ""
++  "smstop\tsm"
++)
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 1c127192d..82f8e574e 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -82,6 +82,8 @@
+ #include "tree-dfa.h"
+ #include "asan.h"
+ #include "aarch64-feature-deps.h"
++#include "tree-pass.h"
++#include "cfgbuild.h"
+ 
+ /* This file should be included last.  */
+ #include "target-def.h"
+@@ -4377,6 +4379,26 @@ aarch64_fndecl_isa_mode (const_tree fndecl)
+   return aarch64_fndecl_pstate_sm (fndecl);
+ }
+ 
++/* Return the state of PSTATE.SM on entry to the current function.
++   This might be different from the state of PSTATE.SM in the function
++   body.  */
++
++static aarch64_feature_flags
++aarch64_cfun_incoming_pstate_sm ()
++{
++  return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
++}
++
++/* Return true if a call from the current function to a function with
++   ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
++   the BL instruction.  */
++
++static bool
++aarch64_call_switches_pstate_sm (aarch64_feature_flags callee_mode)
++{
++  return (callee_mode & ~AARCH64_ISA_MODE & AARCH64_FL_SM_STATE) != 0;
++}
++
+ /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P.  */
+ 
+ static bool
+@@ -4400,7 +4422,7 @@ aarch64_emit_cfi_for_reg_p (unsigned int regno)
+ static machine_mode
+ aarch64_reg_save_mode (unsigned int regno)
+ {
+-  if (GP_REGNUM_P (regno))
++  if (GP_REGNUM_P (regno) || regno == VG_REGNUM)
+     return DImode;
+ 
+   if (FP_REGNUM_P (regno))
+@@ -4459,6 +4481,16 @@ aarch64_callee_abi (rtx cookie)
+   return function_abis[UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES];
+ }
+ 
++/* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx.  Return the
++   required ISA mode on entry to the callee, which is also the ISA
++   mode on return from the callee.  */
++
++static aarch64_feature_flags
++aarch64_callee_isa_mode (rtx cookie)
++{
++  return UINTVAL (cookie) & AARCH64_FL_ISA_MODES;
++}
++
+ /* INSN is a call instruction.  Return the CONST_INT stored in its
+    UNSPEC_CALLEE_ABI rtx.  */
+ 
+@@ -4481,6 +4513,15 @@ aarch64_insn_callee_abi (const rtx_insn *insn)
+   return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
+ }
+ 
++/* INSN is a call instruction.  Return the required ISA mode on entry to
++   the callee, which is also the ISA mode on return from the callee.  */
++
++static aarch64_feature_flags
++aarch64_insn_callee_isa_mode (const rtx_insn *insn)
++{
++  return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
++}
++
+ /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
+    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
+    clobbers the top 64 bits when restoring the bottom 64 bits.  */
+@@ -6645,6 +6686,437 @@ aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
+ 			  temp1, temp2, frame_related_p, emit_move_imm);
+ }
+ 
++/* A streaming-compatible function needs to switch temporarily to the known
++   PSTATE.SM mode described by LOCAL_MODE.  The low bit of OLD_SVCR contains
++   the runtime state of PSTATE.SM in the streaming-compatible code, before
++   the start of the switch to LOCAL_MODE.
++
++   Emit instructions to branch around the mode switch if PSTATE.SM already
++   matches LOCAL_MODE.  Return the label that the branch jumps to.  */
++
++static rtx_insn *
++aarch64_guard_switch_pstate_sm (rtx old_svcr, aarch64_feature_flags local_mode)
++{
++  local_mode &= AARCH64_FL_SM_STATE;
++  gcc_assert (local_mode != 0);
++  auto already_ok_cond = (local_mode & AARCH64_FL_SM_ON ? NE : EQ);
++  auto *label = gen_label_rtx ();
++  auto *jump = emit_jump_insn (gen_aarch64_tb (already_ok_cond, DImode, DImode,
++					       old_svcr, const0_rtx, label));
++  JUMP_LABEL (jump) = label;
++  return label;
++}
++
++/* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
++   state in NEW_MODE.  This is known to involve either an SMSTART SM or
++   an SMSTOP SM.  */
++
++static void
++aarch64_switch_pstate_sm (aarch64_feature_flags old_mode,
++			  aarch64_feature_flags new_mode)
++{
++  old_mode &= AARCH64_FL_SM_STATE;
++  new_mode &= AARCH64_FL_SM_STATE;
++  gcc_assert (old_mode != new_mode);
++
++  if ((new_mode & AARCH64_FL_SM_ON)
++      || (new_mode == 0 && (old_mode & AARCH64_FL_SM_OFF)))
++    emit_insn (gen_aarch64_smstart_sm ());
++  else
++    emit_insn (gen_aarch64_smstop_sm ());
++}
++
++/* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
++   FP and predicate registers.  This class emits code to preserve any
++   necessary registers around the mode switch.
++
++   The class uses four approaches to saving and restoring contents, enumerated
++   by group_type:
++
++   - GPR: save and restore the contents of FP registers using GPRs.
++     This is used if the FP register contains no more than 64 significant
++     bits.  The registers used are FIRST_GPR onwards.
++
++   - MEM_128: save and restore 128-bit SIMD registers using memory.
++
++   - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
++
++   - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
++
++   The save slots within each memory group are consecutive, with the
++   MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
++
++   There will only be two mode switches for each use of SME, so they should
++   not be particularly performance-sensitive.  It's also rare for SIMD, SVE
++   or predicate registers to be live across mode switches.  We therefore
++   don't preallocate the save slots but instead allocate them locally on
++   demand.  This makes the code emitted by the class self-contained.  */
++
++class aarch64_sme_mode_switch_regs
++{
++public:
++  static const unsigned int FIRST_GPR = R10_REGNUM;
++
++  void add_reg (machine_mode, unsigned int);
++  void add_call_args (rtx_call_insn *);
++  void add_call_result (rtx_call_insn *);
++
++  void emit_prologue ();
++  void emit_epilogue ();
++
++  /* The number of GPRs needed to save FP registers, starting from
++     FIRST_GPR.  */
++  unsigned int num_gprs () { return m_group_count[GPR]; }
++
++private:
++  enum sequence { PROLOGUE, EPILOGUE };
++  enum group_type { GPR, MEM_128, MEM_SVE_PRED, MEM_SVE_DATA, NUM_GROUPS };
++
++  /* Information about the save location for one FP, SIMD, SVE data, or
++     SVE predicate register.  */
++  struct save_location {
++    /* The register to be saved.  */
++    rtx reg;
++
++    /* Which group the save location belongs to.  */
++    group_type group;
++
++    /* A zero-based index of the register within the group.  */
++    unsigned int index;
++  };
++
++  unsigned int sve_data_headroom ();
++  rtx get_slot_mem (machine_mode, poly_int64);
++  void emit_stack_adjust (sequence, poly_int64);
++  void emit_mem_move (sequence, const save_location &, poly_int64);
++
++  void emit_gpr_moves (sequence);
++  void emit_mem_128_moves (sequence);
++  void emit_sve_sp_adjust (sequence);
++  void emit_sve_pred_moves (sequence);
++  void emit_sve_data_moves (sequence);
++
++  /* All save locations, in no particular order.  */
++  auto_vec<save_location, 12> m_save_locations;
++
++  /* The number of registers in each group.  */
++  unsigned int m_group_count[NUM_GROUPS] = {};
++};
++
++/* Record that (reg:MODE REGNO) needs to be preserved around the mode
++   switch.  */
++
++void
++aarch64_sme_mode_switch_regs::add_reg (machine_mode mode, unsigned int regno)
++{
++  if (!FP_REGNUM_P (regno) && !PR_REGNUM_P (regno))
++    return;
++
++  unsigned int end_regno = end_hard_regno (mode, regno);
++  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
++  gcc_assert ((vec_flags & VEC_STRUCT) || end_regno == regno + 1);
++  for (; regno < end_regno; regno++)
++    {
++      machine_mode submode = mode;
++      if (vec_flags & VEC_STRUCT)
++	{
++	  if (vec_flags & VEC_SVE_DATA)
++	    submode = SVE_BYTE_MODE;
++	  else if (vec_flags & VEC_PARTIAL)
++	    submode = V8QImode;
++	  else
++	    submode = V16QImode;
++	}
++      save_location loc;
++      loc.reg = gen_rtx_REG (submode, regno);
++      if (vec_flags == VEC_SVE_PRED)
++	{
++	  gcc_assert (PR_REGNUM_P (regno));
++	  loc.group = MEM_SVE_PRED;
++	}
++      else
++	{
++	  gcc_assert (FP_REGNUM_P (regno));
++	  if (known_le (GET_MODE_SIZE (submode), 8))
++	    loc.group = GPR;
++	  else if (known_eq (GET_MODE_SIZE (submode), 16))
++	    loc.group = MEM_128;
++	  else
++	    loc.group = MEM_SVE_DATA;
++	}
++      loc.index = m_group_count[loc.group]++;
++      m_save_locations.quick_push (loc);
++    }
++}
++
++/* Record that the arguments to CALL_INSN need to be preserved around
++   the mode switch.  */
++
++void
++aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn *call_insn)
++{
++  for (rtx node = CALL_INSN_FUNCTION_USAGE (call_insn);
++       node; node = XEXP (node, 1))
++    {
++      rtx item = XEXP (node, 0);
++      if (GET_CODE (item) != USE)
++	continue;
++      item = XEXP (item, 0);
++      if (!REG_P (item))
++	continue;
++      add_reg (GET_MODE (item), REGNO (item));
++    }
++}
++
++/* Record that the return value from CALL_INSN (if any) needs to be
++   preserved around the mode switch.  */
++
++void
++aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn *call_insn)
++{
++  rtx pat = PATTERN (call_insn);
++  gcc_assert (GET_CODE (pat) == PARALLEL);
++  pat = XVECEXP (pat, 0, 0);
++  if (GET_CODE (pat) == CALL)
++    return;
++  rtx dest = SET_DEST (pat);
++  if (GET_CODE (dest) == PARALLEL)
++    for (int i = 0; i < XVECLEN (dest, 0); ++i)
++      {
++	rtx x = XVECEXP (dest, 0, i);
++	gcc_assert (GET_CODE (x) == EXPR_LIST);
++	rtx reg = XEXP (x, 0);
++	add_reg (GET_MODE (reg), REGNO (reg));
++      }
++  else
++    add_reg (GET_MODE (dest), REGNO (dest));
++}
++
++/* Emit code to save registers before the mode switch.  */
++
++void
++aarch64_sme_mode_switch_regs::emit_prologue ()
++{
++  emit_sve_sp_adjust (PROLOGUE);
++  emit_sve_pred_moves (PROLOGUE);
++  emit_sve_data_moves (PROLOGUE);
++  emit_mem_128_moves (PROLOGUE);
++  emit_gpr_moves (PROLOGUE);
++}
++
++/* Emit code to restore registers after the mode switch.  */
++
++void
++aarch64_sme_mode_switch_regs::emit_epilogue ()
++{
++  emit_gpr_moves (EPILOGUE);
++  emit_mem_128_moves (EPILOGUE);
++  emit_sve_pred_moves (EPILOGUE);
++  emit_sve_data_moves (EPILOGUE);
++  emit_sve_sp_adjust (EPILOGUE);
++}
++
++/* The SVE predicate registers are stored below the SVE data registers,
++   with the predicate save area being padded to a data-register-sized
++   boundary.  Return the size of this padded area as a whole number
++   of data register slots.  */
++
++unsigned int
++aarch64_sme_mode_switch_regs::sve_data_headroom ()
++{
++  return CEIL (m_group_count[MEM_SVE_PRED], 8);
++}
++
++/* Return a memory reference of mode MODE to OFFSET bytes from the
++   stack pointer.  */
++
++rtx
++aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode,
++					    poly_int64 offset)
++{
++  rtx addr = plus_constant (Pmode, stack_pointer_rtx, offset);
++  return gen_rtx_MEM (mode, addr);
++}
++
++/* Allocate or deallocate SIZE bytes of stack space: SEQ decides which.  */
++
++void
++aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq,
++						 poly_int64 size)
++{
++  if (seq == PROLOGUE)
++    size = -size;
++  emit_insn (gen_rtx_SET (stack_pointer_rtx,
++			  plus_constant (Pmode, stack_pointer_rtx, size)));
++}
++
++/* Save or restore the register in LOC, whose slot is OFFSET bytes from
++   the stack pointer.  SEQ chooses between saving and restoring.  */
++
++void
++aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq,
++					     const save_location &loc,
++					     poly_int64 offset)
++{
++  rtx mem = get_slot_mem (GET_MODE (loc.reg), offset);
++  if (seq == PROLOGUE)
++    emit_move_insn (mem, loc.reg);
++  else
++    emit_move_insn (loc.reg, mem);
++}
++
++/* Emit instructions to save or restore the GPR group.  SEQ chooses between
++   saving and restoring.  */
++
++void
++aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq)
++{
++  for (auto &loc : m_save_locations)
++    if (loc.group == GPR)
++      {
++	gcc_assert (loc.index < 8);
++	rtx gpr = gen_rtx_REG (GET_MODE (loc.reg), FIRST_GPR + loc.index);
++	if (seq == PROLOGUE)
++	  emit_move_insn (gpr, loc.reg);
++	else
++	  emit_move_insn (loc.reg, gpr);
++      }
++}
++
++/* Emit instructions to save or restore the MEM_128 group.  SEQ chooses
++   between saving and restoring.  */
++
++void
++aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq)
++{
++  HOST_WIDE_INT count = m_group_count[MEM_128];
++  if (count == 0)
++    return;
++
++  auto sp = stack_pointer_rtx;
++  auto sp_adjust = (seq == PROLOGUE ? -count : count) * 16;
++
++  /* Pick a common mode that supports LDR & STR with pre/post-modification
++     and LDP & STP with pre/post-modification.  */
++  auto mode = TFmode;
++
++  /* An instruction pattern that should be emitted at the end.  */
++  rtx last_pat = NULL_RTX;
++
++  /* A previous MEM_128 location that hasn't been handled yet.  */
++  save_location *prev_loc = nullptr;
++
++  /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC.  */
++  for (auto &loc : m_save_locations)
++    if (loc.group == MEM_128)
++      {
++	if (!prev_loc)
++	  {
++	    prev_loc = &loc;
++	    continue;
++	  }
++	gcc_assert (loc.index == prev_loc->index + 1);
++
++	/* The offset of the base of the save area from the current
++	   stack pointer.  */
++	HOST_WIDE_INT bias = 0;
++	if (prev_loc->index == 0 && seq == PROLOGUE)
++	  bias = sp_adjust;
++
++	/* Get the two sets in the LDP/STP.  */
++	rtx ops[] = {
++	  gen_rtx_REG (mode, REGNO (prev_loc->reg)),
++	  get_slot_mem (mode, prev_loc->index * 16 + bias),
++	  gen_rtx_REG (mode, REGNO (loc.reg)),
++	  get_slot_mem (mode, loc.index * 16 + bias)
++	};
++	unsigned int lhs = (seq == PROLOGUE);
++	rtx set1 = gen_rtx_SET (ops[lhs], ops[1 - lhs]);
++	rtx set2 = gen_rtx_SET (ops[lhs + 2], ops[3 - lhs]);
++
++	/* Combine the sets with any stack allocation/deallocation.  */
++	rtvec vec;
++	if (prev_loc->index == 0)
++	  {
++	    rtx plus_sp = plus_constant (Pmode, sp, sp_adjust);
++	    vec = gen_rtvec (3, gen_rtx_SET (sp, plus_sp), set1, set2);
++	  }
++	else
++	  vec = gen_rtvec (2, set1, set2);
++	rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
++
++	/* Queue a deallocation to the end, otherwise emit the
++	   instruction now.  */
++	if (seq == EPILOGUE && prev_loc->index == 0)
++	  last_pat = pat;
++	else
++	  emit_insn (pat);
++	prev_loc = nullptr;
++      }
++
++  /* Handle any leftover LDR/STR.  */
++  if (prev_loc)
++    {
++      rtx reg = gen_rtx_REG (mode, REGNO (prev_loc->reg));
++      rtx addr;
++      if (prev_loc->index != 0)
++	addr = plus_constant (Pmode, sp, prev_loc->index * 16);
++      else if (seq == PROLOGUE)
++	{
++	  rtx allocate = plus_constant (Pmode, sp, -count * 16);
++	  addr = gen_rtx_PRE_MODIFY (Pmode, sp, allocate);
++	}
++      else
++	{
++	  rtx deallocate = plus_constant (Pmode, sp, count * 16);
++	  addr = gen_rtx_POST_MODIFY (Pmode, sp, deallocate);
++	}
++      rtx mem = gen_rtx_MEM (mode, addr);
++      if (seq == PROLOGUE)
++	emit_move_insn (mem, reg);
++      else
++	emit_move_insn (reg, mem);
++    }
++
++  if (last_pat)
++    emit_insn (last_pat);
++}
++
++/* Allocate or deallocate the stack space needed by the SVE groups.
++   SEQ chooses between allocating and deallocating.  */
++
++void
++aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq)
++{
++  if (unsigned int count = m_group_count[MEM_SVE_DATA] + sve_data_headroom ())
++    emit_stack_adjust (seq, count * BYTES_PER_SVE_VECTOR);
++}
++
++/* Save or restore the MEM_SVE_DATA group.  SEQ chooses between saving
++   and restoring.  */
++
++void
++aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq)
++{
++  for (auto &loc : m_save_locations)
++    if (loc.group == MEM_SVE_DATA)
++      {
++	auto index = loc.index + sve_data_headroom ();
++	emit_mem_move (seq, loc, index * BYTES_PER_SVE_VECTOR);
++      }
++}
++
++/* Save or restore the MEM_SVE_PRED group.  SEQ chooses between saving
++   and restoring.  */
++
++void
++aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq)
++{
++  for (auto &loc : m_save_locations)
++    if (loc.group == MEM_SVE_PRED)
++      emit_mem_move (seq, loc, loc.index * BYTES_PER_SVE_PRED);
++}
++
+ /* Set DEST to (vec_series BASE STEP).  */
+ 
+ static void
+@@ -8211,6 +8683,40 @@ on_stack:
+   return;
+ }
+ 
++/* Add the current argument register to the set of those that need
++   to be saved and restored around a change to PSTATE.SM.  */
++
++static void
++aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
++{
++  subrtx_var_iterator::array_type array;
++  FOR_EACH_SUBRTX_VAR (iter, array, pcum->aapcs_reg, NONCONST)
++    {
++      rtx x = *iter;
++      if (REG_P (x) && (FP_REGNUM_P (REGNO (x)) || PR_REGNUM_P (REGNO (x))))
++	{
++	  unsigned int i = pcum->num_sme_mode_switch_args++;
++	  gcc_assert (i < ARRAY_SIZE (pcum->sme_mode_switch_args));
++	  pcum->sme_mode_switch_args[i] = x;
++	}
++    }
++}
++
++/* Return a parallel that contains all the registers that need to be
++   saved around a change to PSTATE.SM.  Return const0_rtx if there is
++   no such mode switch, or if no registers need to be saved.  */
++
++static rtx
++aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
++{
++  if (!pcum->num_sme_mode_switch_args)
++    return const0_rtx;
++
++  auto argvec = gen_rtvec_v (pcum->num_sme_mode_switch_args,
++			     pcum->sme_mode_switch_args);
++  return gen_rtx_PARALLEL (VOIDmode, argvec);
++}
++
+ /* Implement TARGET_FUNCTION_ARG.  */
+ 
+ static rtx
+@@ -8222,7 +8728,13 @@ aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
+ 	      || pcum->pcs_variant == ARM_PCS_SVE);
+ 
+   if (arg.end_marker_p ())
+-    return aarch64_gen_callee_cookie (pcum->isa_mode, pcum->pcs_variant);
++    {
++      rtx abi_cookie = aarch64_gen_callee_cookie (pcum->isa_mode,
++						  pcum->pcs_variant);
++      rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
++      return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, abi_cookie,
++						    sme_mode_switch_args));
++    }
+ 
+   aarch64_layout_arg (pcum_v, arg);
+   return pcum->aapcs_reg;
+@@ -8257,6 +8769,7 @@ aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
+   pcum->aapcs_stack_words = 0;
+   pcum->aapcs_stack_size = 0;
+   pcum->silent_p = silent_p;
++  pcum->num_sme_mode_switch_args = 0;
+ 
+   if (!silent_p
+       && !TARGET_FLOAT
+@@ -8297,6 +8810,10 @@ aarch64_function_arg_advance (cumulative_args_t pcum_v,
+       aarch64_layout_arg (pcum_v, arg);
+       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
+ 		  != (pcum->aapcs_stack_words != 0));
++      if (pcum->aapcs_reg
++	  && aarch64_call_switches_pstate_sm (pcum->isa_mode))
++	aarch64_record_sme_mode_switch_args (pcum);
++
+       pcum->aapcs_arg_processed = false;
+       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
+       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
+@@ -8747,6 +9264,30 @@ aarch64_save_regs_above_locals_p ()
+   return crtl->stack_protect_guard;
+ }
+ 
++/* Return true if the current function needs to record the incoming
++   value of PSTATE.SM.  */
++static bool
++aarch64_need_old_pstate_sm ()
++{
++  /* Exit early if the incoming value of PSTATE.SM is known at
++     compile time.  */
++  if (aarch64_cfun_incoming_pstate_sm () != 0)
++    return false;
++
++  if (cfun->machine->call_switches_pstate_sm)
++    for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
++      if (auto *call = dyn_cast<rtx_call_insn *> (insn))
++	if (!SIBLING_CALL_P (call))
++	  {
++	    /* Return true if there is a call to a non-streaming-compatible
++	       function.  */
++	    auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
++	    if (aarch64_call_switches_pstate_sm (callee_isa_mode))
++	      return true;
++	  }
++  return false;
++}
++
+ /* Mark the registers that need to be saved by the callee and calculate
+    the size of the callee-saved registers area and frame record (both FP
+    and LR may be omitted).  */
+@@ -8780,6 +9321,7 @@ aarch64_layout_frame (void)
+   /* First mark all the registers that really need to be saved...  */
+   for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
+     frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
++  frame.old_svcr_offset = SLOT_NOT_REQUIRED;
+ 
+   /* ... that includes the eh data registers (if needed)...  */
+   if (crtl->calls_eh_return)
+@@ -8932,6 +9474,21 @@ aarch64_layout_frame (void)
+     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
+       allocate_gpr_slot (regno);
+ 
++  if (aarch64_need_old_pstate_sm ())
++    {
++      frame.old_svcr_offset = offset;
++      offset += UNITS_PER_WORD;
++    }
++
++  /* If the current function changes the SVE vector length, ensure that the
++     old value of the DWARF VG register is saved and available in the CFI,
++     so that outer frames with VL-sized offsets can be processed correctly.  */
++  if (cfun->machine->call_switches_pstate_sm)
++    {
++      frame.reg_offset[VG_REGNUM] = offset;
++      offset += UNITS_PER_WORD;
++    }
++
+   poly_int64 max_int_offset = offset;
+   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
+   bool has_align_gap = maybe_ne (offset, max_int_offset);
+@@ -8969,8 +9526,6 @@ aarch64_layout_frame (void)
+       if (push_regs.size () > 1)
+ 	frame.wb_push_candidate2 = push_regs[1];
+     }
+-  else
+-    gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size));
+ 
+   /* With stack-clash, a register must be saved in non-leaf functions.
+      The saving of the bottommost register counts as an implicit probe,
+@@ -9078,7 +9633,8 @@ aarch64_layout_frame (void)
+       frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
+       frame.final_adjust = frame.bytes_below_saved_regs;
+     }
+-  else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
++  else if (frame.wb_push_candidate1 != INVALID_REGNUM
++	   && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
+ 	   && const_above_fp < max_push_offset)
+     {
+       /* Frame with large area below the saved registers, or with SVE saves,
+@@ -9459,7 +10015,13 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
+ 
+       machine_mode mode = aarch64_reg_save_mode (regno);
+       rtx reg = gen_rtx_REG (mode, regno);
++      rtx move_src = reg;
+       offset = frame.reg_offset[regno] - bytes_below_sp;
++      if (regno == VG_REGNUM)
++	{
++	  move_src = gen_rtx_REG (DImode, IP0_REGNUM);
++	  emit_move_insn (move_src, gen_int_mode (aarch64_sve_vg, DImode));
++	}
+       rtx base_rtx = stack_pointer_rtx;
+       poly_int64 sp_offset = offset;
+ 
+@@ -9467,7 +10029,7 @@
+       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
+ 					     offset, ptrue);
+-      else if (GP_REGNUM_P (regno)
++      else if (GP_REGNUM_P (REGNO (reg))
+ 	       && (!offset.is_constant (&const_offset) || const_offset >= 512))
+ 	{
+ 	  poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
+@@ -9490,6 +10052,7 @@
+ 
+       unsigned int regno2;
+       if (!aarch64_sve_mode_p (mode)
++	  && reg == move_src
+ 	  && i + 1 < regs.size ()
+ 	  && (regno2 = regs[i + 1], !skip_save_p (regno2))
+ 	  && known_eq (GET_MODE_SIZE (mode),
+@@ -9521,17 +10084,24 @@
+ 	}
+       else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
+ 	{
+-	  insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
++	  insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, move_src));
+ 	  need_cfa_note_p = true;
+ 	}
+       else if (aarch64_sve_mode_p (mode))
+-	insn = emit_insn (gen_rtx_SET (mem, reg));
++	insn = emit_insn (gen_rtx_SET (mem, move_src));
+       else
+-	insn = emit_move_insn (mem, reg);
++	insn = emit_move_insn (mem, move_src);
+ 
+       RTX_FRAME_RELATED_P (insn) = frame_related_p;
+       if (frame_related_p && need_cfa_note_p)
+ 	aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
++      else if (frame_related_p && move_src != reg)
++	add_reg_note (insn, REG_FRAME_RELATED_EXPR, gen_rtx_SET (mem, reg));
++
++      /* Emit a fake instruction to indicate that the VG save slot has
++	 been initialized.  */
++      if (regno == VG_REGNUM)
++	emit_insn (gen_aarch64_old_vg_saved (move_src, mem));
+     }
+ }
+ 
+@@ -9754,6 +10324,10 @@ aarch64_get_separate_components (void)
+ 	bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
+     }
+ 
++  /* The VG save sequence needs a temporary GPR.  Punt for now on trying
++     to find one.  */
++  bitmap_clear_bit (components, VG_REGNUM);
++
+   return components;
+ }
+ 
+@@ -10249,6 +10823,47 @@ aarch64_epilogue_uses (int regno)
+   return 0;
+ }
+ 
++/* The current function's frame has a save slot for the incoming state
++   of SVCR.  Return a legitimate memory for the slot, based on the hard
++   frame pointer.  */
++
++static rtx
++aarch64_old_svcr_mem ()
++{
++  gcc_assert (frame_pointer_needed
++	      && known_ge (cfun->machine->frame.old_svcr_offset, 0));
++  rtx base = hard_frame_pointer_rtx;
++  poly_int64 offset = (0
++		       /* hard fp -> bottom of frame.  */
++		       - cfun->machine->frame.bytes_below_hard_fp
++		       /* bottom of frame -> save slot.  */
++		       + cfun->machine->frame.old_svcr_offset);
++  return gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
++}
++
++/* The current function's frame has a save slot for the incoming state
++   of SVCR.  Load the slot into register REGNO and return the register.  */
++
++static rtx
++aarch64_read_old_svcr (unsigned int regno)
++{
++  rtx svcr = gen_rtx_REG (DImode, regno);
++  emit_move_insn (svcr, aarch64_old_svcr_mem ());
++  return svcr;
++}
++
++/* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
++   load the incoming value of SVCR from its save slot into temporary
++   register REGNO.  */
++
++static rtx_insn *
++aarch64_guard_switch_pstate_sm (unsigned int regno,
++				aarch64_feature_flags local_mode)
++{
++  rtx old_svcr = aarch64_read_old_svcr (regno);
++  return aarch64_guard_switch_pstate_sm (old_svcr, local_mode);
++}
++
+ /* AArch64 stack frames generated by this compiler look like:
+ 
+ 	+-------------------------------+
+@@ -10463,6 +11078,12 @@ aarch64_expand_prologue (void)
+ 
+   aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs, true,
+ 			     emit_frame_chain);
++  if (maybe_ge (frame.reg_offset[VG_REGNUM], 0))
++    {
++      unsigned int saved_regs[] = { VG_REGNUM };
++      aarch64_save_callee_saves (bytes_below_sp, saved_regs, true,
++				 emit_frame_chain);
++    }
+   if (maybe_ne (sve_callee_adjust, 0))
+     {
+       gcc_assert (!flag_stack_clash_protection
+@@ -10484,6 +11105,40 @@ aarch64_expand_prologue (void)
+ 			  !frame_pointer_needed, true);
+   if (emit_frame_chain && maybe_ne (final_adjust, 0))
+     aarch64_emit_stack_tie (hard_frame_pointer_rtx);
++
++  /* Save the incoming value of PSTATE.SM, if required.  */
++  if (known_ge (frame.old_svcr_offset, 0))
++    {
++      rtx mem = aarch64_old_svcr_mem ();
++      MEM_VOLATILE_P (mem) = 1;
++      if (TARGET_SME)
++	{
++	  rtx reg = gen_rtx_REG (DImode, IP0_REGNUM);
++	  emit_insn (gen_aarch64_read_svcr (reg));
++	  emit_move_insn (mem, reg);
++	}
++      else
++	{
++	  rtx old_r0 = NULL_RTX, old_r1 = NULL_RTX;
++	  auto &args = crtl->args.info;
++	  if (args.aapcs_ncrn > 0)
++	    {
++	      old_r0 = gen_rtx_REG (DImode, PROBE_STACK_FIRST_REGNUM);
++	      emit_move_insn (old_r0, gen_rtx_REG (DImode, R0_REGNUM));
++	    }
++	  if (args.aapcs_ncrn > 1)
++	    {
++	      old_r1 = gen_rtx_REG (DImode, PROBE_STACK_SECOND_REGNUM);
++	      emit_move_insn (old_r1, gen_rtx_REG (DImode, R1_REGNUM));
++	    }
++	  emit_insn (gen_aarch64_get_sme_state ());
++	  emit_move_insn (mem, gen_rtx_REG (DImode, R0_REGNUM));
++	  if (old_r0)
++	    emit_move_insn (gen_rtx_REG (DImode, R0_REGNUM), old_r0);
++	  if (old_r1)
++	    emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
++	}
++    }
+ }
+ 
+ /* Return TRUE if we can use a simple_return insn.
+@@ -11730,17 +12385,33 @@ aarch64_start_call_args (cumulative_args_t ca_v)
+    RESULT is the register in which the result is returned.  It's NULL for
+    "call" and "sibcall".
+    MEM is the location of the function call.
+-   CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
++   COOKIE is either:
++   - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
++   - a PARALLEL that contains such a const_int as its first element.
++     The second element is a PARALLEL that lists all the argument
++     registers that need to be saved and restored around a change
++     in PSTATE.SM, or const0_rtx if no such switch is needed.
+    SIBCALL indicates whether this function call is normal call or sibling call.
+    It will generate different pattern accordingly.  */
+ 
+ void
+-aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
++aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
+ {
+   rtx call, callee, tmp;
+   rtvec vec;
+   machine_mode mode;
+ 
++  rtx callee_abi = cookie;
++  rtx sme_mode_switch_args = const0_rtx;
++  if (GET_CODE (cookie) == PARALLEL)
++    {
++      callee_abi = XVECEXP (cookie, 0, 0);
++      sme_mode_switch_args = XVECEXP (cookie, 0, 1);
++    }
++
++  gcc_assert (CONST_INT_P (callee_abi));
++  auto callee_isa_mode = aarch64_callee_isa_mode (callee_abi);
++
+   gcc_assert (MEM_P (mem));
+   callee = XEXP (mem, 0);
+   mode = GET_MODE (callee);
+@@ -11765,26 +12436,75 @@ aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
+   else
+     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
+ 
+-  gcc_assert (CONST_INT_P (callee_abi));
+   callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
+ 			       UNSPEC_CALLEE_ABI);
+ 
+   vec = gen_rtvec (3, call, callee_abi, tmp);
+   call = gen_rtx_PARALLEL (VOIDmode, vec);
+ 
+-  aarch64_emit_call_insn (call);
++  auto call_insn = aarch64_emit_call_insn (call);
++
++  /* Check whether the call requires a change to PSTATE.SM.  We can't
++     emit the instructions to change PSTATE.SM yet, since they involve
++     a change in vector length and a change in instruction set, which
++     cannot be represented in RTL.
++
++     For now, just record which registers will be clobbered and used
++     by the changes to PSTATE.SM.  */
++  if (!sibcall && aarch64_call_switches_pstate_sm (callee_isa_mode))
++    {
++      aarch64_sme_mode_switch_regs args_switch;
++      if (sme_mode_switch_args != const0_rtx)
++	{
++	  unsigned int num_args = XVECLEN (sme_mode_switch_args, 0);
++	  for (unsigned int i = 0; i < num_args; ++i)
++	    {
++	      rtx x = XVECEXP (sme_mode_switch_args, 0, i);
++	      args_switch.add_reg (GET_MODE (x), REGNO (x));
++	    }
++	}
++
++      aarch64_sme_mode_switch_regs result_switch;
++      if (result)
++	result_switch.add_call_result (call_insn);
++
++      unsigned int num_gprs = MAX (args_switch.num_gprs (),
++				   result_switch.num_gprs ());
++      for (unsigned int i = 0; i < num_gprs; ++i)
++	clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
++		     gen_rtx_REG (DImode, args_switch.FIRST_GPR + i));
++
++      for (int regno = V0_REGNUM; regno < V0_REGNUM + 32; regno += 4)
++	clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
++		     gen_rtx_REG (V4x16QImode, regno));
++
++      for (int regno = P0_REGNUM; regno < P0_REGNUM + 16; regno += 1)
++	clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
++		     gen_rtx_REG (VNx16BImode, regno));
++
++      /* Ensure that the VG save slot has been initialized.  Also emit
++	 an instruction to model the effect of the temporary clobber
++	 of VG, so that the prologue/epilogue pass sees the need to
++	 save the old value.  */
++      use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
++	       gen_rtx_REG (DImode, VG_REGNUM));
++      emit_insn_before (gen_aarch64_update_vg (), call_insn);
++
++      cfun->machine->call_switches_pstate_sm = true;
++    }
+ }
+ 
+ /* Emit call insn with PAT and do aarch64-specific handling.  */
+ 
+-void
++rtx_call_insn *
+ aarch64_emit_call_insn (rtx pat)
+ {
+-  rtx insn = emit_call_insn (pat);
++  auto insn = emit_call_insn (pat);
+ 
+   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
+   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
+   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
++  return as_a<rtx_call_insn *> (insn);
+ }
+ 
+ machine_mode
+@@ -13069,6 +13789,16 @@ aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
+   return false;
+ }
+ 
++/* Implement TARGET_FRAME_POINTER_REQUIRED.  */
++
++static bool
++aarch64_frame_pointer_required ()
++{
++  /* If the function needs to record the incoming value of PSTATE.SM,
++     make sure that the slot is accessible from the frame pointer.  */
++  return aarch64_need_old_pstate_sm ();
++}
++
+ static bool
+ aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
+ {
+@@ -20607,7 +21337,8 @@ aarch64_conditional_register_usage (void)
+ 	call_used_regs[i] = 1;
+     }
+ 
+-  /* Only allow the FFR and FFRT to be accessed via special patterns.  */
++  /* Only allow these registers to be accessed via special patterns.  */
++  CLEAR_HARD_REG_BIT (operand_reg_set, VG_REGNUM);
+   CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
+   CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
+ 
+@@ -27903,6 +28634,123 @@ aarch64_indirect_call_asm (rtx addr)
+   return "";
+ }
+ 
++/* If CALL involves a change in PSTATE.SM, emit the instructions needed
++   to switch to the new mode and the instructions needed to restore the
++   original mode.  Return true if something changed.  */
++static bool
++aarch64_switch_pstate_sm_for_call (rtx_call_insn *call)
++{
++  /* Mode switches for sibling calls are handled via the epilogue.  */
++  if (SIBLING_CALL_P (call))
++    return false;
++
++  auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
++  if (!aarch64_call_switches_pstate_sm (callee_isa_mode))
++    return false;
++
++  /* Switch mode before the call, preserving any argument registers
++     across the switch.  */
++  start_sequence ();
++  rtx_insn *args_guard_label = nullptr;
++  if (TARGET_STREAMING_COMPATIBLE)
++    args_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
++						       callee_isa_mode);
++  aarch64_sme_mode_switch_regs args_switch;
++  args_switch.add_call_args (call);
++  args_switch.emit_prologue ();
++  aarch64_switch_pstate_sm (AARCH64_ISA_MODE, callee_isa_mode);
++  args_switch.emit_epilogue ();
++  if (args_guard_label)
++    emit_label (args_guard_label);
++  auto args_seq = get_insns ();
++  end_sequence ();
++  emit_insn_before (args_seq, call);
++
++  if (find_reg_note (call, REG_NORETURN, NULL_RTX))
++    return true;
++
++  /* Switch mode after the call, preserving any return registers across
++     the switch.  */
++  start_sequence ();
++  rtx_insn *return_guard_label = nullptr;
++  if (TARGET_STREAMING_COMPATIBLE)
++    return_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
++							 callee_isa_mode);
++  aarch64_sme_mode_switch_regs return_switch;
++  return_switch.add_call_result (call);
++  return_switch.emit_prologue ();
++  aarch64_switch_pstate_sm (callee_isa_mode, AARCH64_ISA_MODE);
++  return_switch.emit_epilogue ();
++  if (return_guard_label)
++    emit_label (return_guard_label);
++  auto result_seq = get_insns ();
++  end_sequence ();
++  emit_insn_after (result_seq, call);
++  return true;
++}
++
++namespace {
++
++const pass_data pass_data_switch_pstate_sm =
++{
++  RTL_PASS, // type
++  "smstarts", // name
++  OPTGROUP_NONE, // optinfo_flags
++  TV_NONE, // tv_id
++  0, // properties_required
++  0, // properties_provided
++  0, // properties_destroyed
++  0, // todo_flags_start
++  TODO_df_finish, // todo_flags_finish
++};
++
++class pass_switch_pstate_sm : public rtl_opt_pass
++{
++public:
++  pass_switch_pstate_sm (gcc::context *ctxt)
++    : rtl_opt_pass (pass_data_switch_pstate_sm, ctxt)
++  {}
++
++  // opt_pass methods:
++  bool gate (function *) override final;
++  unsigned int execute (function *) override final;
++};
++
++bool
++pass_switch_pstate_sm::gate (function *)
++{
++  return cfun->machine->call_switches_pstate_sm;
++}
++
++/* Emit any instructions needed to switch PSTATE.SM.  */
++unsigned int
++pass_switch_pstate_sm::execute (function *fn)
++{
++  basic_block bb;
++
++  auto_sbitmap blocks (last_basic_block_for_fn (cfun));
++  bitmap_clear (blocks);
++  FOR_EACH_BB_FN (bb, fn)
++    {
++      rtx_insn *insn;
++      FOR_BB_INSNS (bb, insn)
++	if (auto *call = dyn_cast<rtx_call_insn *> (insn))
++	  if (aarch64_switch_pstate_sm_for_call (call))
++	    bitmap_set_bit (blocks, bb->index);
++    }
++  find_many_sub_basic_blocks (blocks);
++  clear_aux_for_blocks ();
++  return 0;
++}
++
++}
++
++rtl_opt_pass *
++make_pass_switch_pstate_sm (gcc::context *ctxt)
++{
++  return new pass_switch_pstate_sm (ctxt);
++}
++
+ /* Target-specific selftests.  */
+ 
+ #if CHECKING_P
+@@ -28176,6 +29024,9 @@ aarch64_get_v16qi_mode ()
+ #undef TARGET_CALLEE_COPIES
+ #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
+ 
++#undef TARGET_FRAME_POINTER_REQUIRED
++#define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
++
+ #undef TARGET_CAN_ELIMINATE
+ #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
+ 
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index 1591cde8b..6bfe55968 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -255,6 +255,10 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
+ /* The current function is a normal non-streaming function.  */
+ #define TARGET_NON_STREAMING (AARCH64_ISA_SM_OFF)
+ 
++/* The current function has a streaming-compatible body.  */
++#define TARGET_STREAMING_COMPATIBLE \
++  ((aarch64_isa_flags & AARCH64_FL_SM_STATE) == 0)
++
+ /* Crypto is an optional extension to AdvSIMD.  */
+ #define TARGET_CRYPTO (AARCH64_ISA_CRYPTO)
+ 
+@@ -461,7 +465,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
+     0, 0, 0, 0, 0, 0, 0, 0,	/* V8 - V15 */		\
+     1, 1, 1, 1, 1, 1, 1, 1,	/* V16 - V23 */		\
+     1, 1, 1, 1, 1, 1, 1, 1,	/* V24 - V31 */		\
+-    1, 1, 1, 1,			/* SFP, AP, CC, VG */	\
++    1, 1, 1, 0,			/* SFP, AP, CC, VG */	\
+     1, 1, 1, 1, 1, 1, 1, 1,	/* P0 - P7 */		\
+     1, 1, 1, 1, 1, 1, 1, 1,	/* P8 - P15 */		\
+     1, 1			/* FFR and FFRT */	\
+@@ -802,6 +806,13 @@ struct GTY (()) aarch64_frame
+   vec<unsigned, va_gc_atomic> *saved_fprs;
+   vec<unsigned, va_gc_atomic> *saved_prs;
+ 
++  /* The offset from the base of the frame of a 64-bit slot whose low
++     bit contains the incoming value of PSTATE.SM.  This slot must be
++     within reach of the hard frame pointer.
++
++     The offset is -1 if such a slot isn't needed.  */
++  poly_int64 old_svcr_offset;
++
+   /* The number of extra stack bytes taken up by register varargs.
+      This area is allocated by the callee at the very top of the
+      frame.  This value is rounded up to a multiple of
+@@ -910,6 +921,12 @@ typedef struct GTY (()) machine_function
+   /* One entry for each general purpose register.  */
+   rtx call_via[SP_REGNUM];
+   bool label_is_assembled;
++
++  /* True if we've expanded at least one call to a function that changes
++     PSTATE.SM.  This should only be used for saving compile time: false
++     guarantees that no such mode switch exists.  */
++  bool call_switches_pstate_sm;
++
+   /* A set of all decls that have been passed to a vld1 intrinsic in the
+      current function.  This is used to help guide the vector cost model.  */
+   hash_set<tree> *vector_load_decls;
+@@ -978,6 +995,12 @@ typedef struct
+ 				   stack arg area so far.  */
+   bool silent_p;		/* True if we should act silently, rather than
+ 				   raise an error for invalid calls.  */
++
++  /* A list of registers that need to be saved and restored around a
++     change to PSTATE.SM.  An auto_vec would be more convenient, but those
++     can't be copied.  */
++  unsigned int num_sme_mode_switch_args;
++  rtx sme_mode_switch_args[12];
+ } CUMULATIVE_ARGS;
+ #endif
+ 
+diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
+index 2ce123255..bb867de74 100644
+--- a/gcc/config/aarch64/aarch64.md
++++ b/gcc/config/aarch64/aarch64.md
+@@ -970,7 +970,7 @@
+ 				      operands[1]);
+ })
+ 
+-(define_insn "*tb<optab><ALLI:mode><GPI:mode>1"
++(define_insn "@aarch64_tb<optab><ALLI:mode><GPI:mode>"
+   [(set (pc) (if_then_else
+ 	      (EQL (zero_extract:GPI (match_operand:ALLI 0 "register_operand" "r")
+ 				     (const_int 1)
+@@ -1057,7 +1057,7 @@
+   [(parallel
+      [(call (match_operand 0 "memory_operand")
+ 	    (match_operand 1 "general_operand"))
+-      (unspec:DI [(match_operand 2 "const_int_operand")] UNSPEC_CALLEE_ABI)
++      (unspec:DI [(match_operand 2)] UNSPEC_CALLEE_ABI)
+       (clobber (reg:DI LR_REGNUM))])]
+   ""
+   "
+@@ -1083,7 +1083,7 @@
+   [(set (match_operand 0 "")
+ 	(call (match_operand 1 "memory_operand")
+ 	      (match_operand 2 "general_operand")))
+-   (unspec:DI [(match_operand 3 "const_int_operand")] UNSPEC_CALLEE_ABI)
++   (unspec:DI [(match_operand 3)] UNSPEC_CALLEE_ABI)
+    (clobber (reg:DI LR_REGNUM))])]
+   ""
+   "
+@@ -1110,7 +1110,7 @@
+   [(parallel
+      [(call (match_operand 0 "memory_operand")
+ 	    (match_operand 1 "general_operand"))
+-      (unspec:DI [(match_operand 2 "const_int_operand")] UNSPEC_CALLEE_ABI)
++      (unspec:DI [(match_operand 2)] UNSPEC_CALLEE_ABI)
+       (return)])]
+   ""
+   {
+@@ -1124,7 +1124,7 @@
+   [(set (match_operand 0 "")
+ 	(call (match_operand 1 "memory_operand")
+ 	      (match_operand 2 "general_operand")))
+-   (unspec:DI [(match_operand 3 "const_int_operand")] UNSPEC_CALLEE_ABI)
++   (unspec:DI [(match_operand 3)] UNSPEC_CALLEE_ABI)
+    (return)])]
+   ""
+   {
+@@ -7747,3 +7747,6 @@
+ 
+ ;; SVE2.
+ (include "aarch64-sve2.md")
++
++;; SME and extensions
++(include "aarch64-sme.md")
+diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64
+index 10cd8f093..49731ba92 100644
+--- a/gcc/config/aarch64/t-aarch64
++++ b/gcc/config/aarch64/t-aarch64
+@@ -186,9 +186,12 @@ MULTILIB_DIRNAMES = $(subst $(comma), ,$(TM_MULTILIB_CONFIG))
+ insn-conditions.md: s-check-sve-md
+ s-check-sve-md: $(srcdir)/config/aarch64/check-sve-md.awk \
+   $(srcdir)/config/aarch64/aarch64-sve.md \
+-  $(srcdir)/config/aarch64/aarch64-sve2.md
++  $(srcdir)/config/aarch64/aarch64-sve2.md \
++  $(srcdir)/config/aarch64/aarch64-sme.md
+ 	$(AWK) -f $(srcdir)/config/aarch64/check-sve-md.awk \
+ 	  $(srcdir)/config/aarch64/aarch64-sve.md
+ 	$(AWK) -f $(srcdir)/config/aarch64/check-sve-md.awk \
+ 	  $(srcdir)/config/aarch64/aarch64-sve2.md
++	$(AWK) -f $(srcdir)/config/aarch64/check-sve-md.awk \
++	  $(srcdir)/config/aarch64/aarch64-sme.md
+ 	$(STAMP) s-check-sve-md
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_1.c b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_1.c
+new file mode 100644
+index 000000000..a2de55773
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_1.c
+@@ -0,0 +1,233 @@
++// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
++// { dg-final { check-function-bodies "**" "" } }
++
++void ns_callee ();
++ void s_callee () [[arm::streaming]];
++ void sc_callee () [[arm::streaming_compatible]];
++
++void ns_callee_stack (int, int, int, int, int, int, int, int, int);
++
++struct callbacks {
++  void (*ns_ptr) ();
++  void (*s_ptr) () [[arm::streaming]];
++  void (*sc_ptr) () [[arm::streaming_compatible]];
++};
++
++/*
++** n_caller: { target lp64 }
++** stp x30, (x19|x2[0-8]), \[sp, #?-96\]!
++** cntd x16 ++** str x16, \[sp, #?16\] ++** stp d8, d9, \[sp, #?32\] ++** stp d10, d11, \[sp, #?48\] ++** stp d12, d13, \[sp, #?64\] ++** stp d14, d15, \[sp, #?80\] ++** mov \1, x0 ++** bl ns_callee ++** smstart sm ++** bl s_callee ++** smstop sm ++** bl sc_callee ++** ldr (x[0-9]+), \[\1\] ++** blr \2 ++** ldr (x[0-9]+), \[\1, #?8\] ++** smstart sm ++** blr \3 ++** smstop sm ++** ldr (x[0-9]+), \[\1, #?16\] ++** blr \4 ++** ldp d8, d9, \[sp, #?32\] ++** ldp d10, d11, \[sp, #?48\] ++** ldp d12, d13, \[sp, #?64\] ++** ldp d14, d15, \[sp, #?80\] ++** ldp x30, \1, \[sp\], #?96 ++** ret ++*/ ++void ++n_caller (struct callbacks *c) ++{ ++ ns_callee (); ++ s_callee (); ++ sc_callee (); ++ ++ c->ns_ptr (); ++ c->s_ptr (); ++ c->sc_ptr (); ++} ++ ++/* ++** s_caller: { target lp64 } ++** stp x30, (x19|x2[0-8]), \[sp, #?-96\]! ++** cntd x16 ++** str x16, \[sp, #?16\] ++** stp d8, d9, \[sp, #?32\] ++** stp d10, d11, \[sp, #?48\] ++** stp d12, d13, \[sp, #?64\] ++** stp d14, d15, \[sp, #?80\] ++** mov \1, x0 ++** smstop sm ++** bl ns_callee ++** smstart sm ++** bl s_callee ++** bl sc_callee ++** ldr (x[0-9]+), \[\1\] ++** smstop sm ++** blr \2 ++** smstart sm ++** ldr (x[0-9]+), \[\1, #?8\] ++** blr \3 ++** ldr (x[0-9]+), \[\1, #?16\] ++** blr \4 ++** ldp d8, d9, \[sp, #?32\] ++** ldp d10, d11, \[sp, #?48\] ++** ldp d12, d13, \[sp, #?64\] ++** ldp d14, d15, \[sp, #?80\] ++** ldp x30, \1, \[sp\], #?96 ++** ret ++*/ ++void ++s_caller (struct callbacks *c) [[arm::streaming]] ++{ ++ ns_callee (); ++ s_callee (); ++ sc_callee (); ++ ++ c->ns_ptr (); ++ c->s_ptr (); ++ c->sc_ptr (); ++} ++ ++/* ++** sc_caller_sme: ++** stp x29, x30, \[sp, #?-96\]! ++** mov x29, sp ++** cntd x16 ++** str x16, \[sp, #?24\] ++** stp d8, d9, \[sp, #?32\] ++** stp d10, d11, \[sp, #?48\] ++** stp d12, d13, \[sp, #?64\] ++** stp d14, d15, \[sp, #?80\] ++** mrs x16, svcr ++** str x16, \[x29, #?16\] ++** ldr x16, \[x29, #?16\] ++** tbz x16, 0, .* ++** smstop sm ++** bl ns_callee ++** ldr x16, \[x29, #?16\] ++** tbz x16, 0, .* ++** smstart sm ++** ldr x16, \[x29, #?16\] ++** tbnz x16, 0, .* ++** smstart sm ++** bl s_callee ++** ldr x16, \[x29, #?16\] ++** tbnz x16, 0, .* ++** smstop sm ++** bl sc_callee ++** ldp d8, d9, \[sp, #?32\] ++** ldp d10, d11, \[sp, #?48\] ++** ldp d12, d13, \[sp, #?64\] ++** ldp d14, d15, \[sp, #?80\] ++** ldp x29, x30, \[sp\], #?96 ++** ret ++*/ ++void ++sc_caller_sme () [[arm::streaming_compatible]] ++{ ++ ns_callee (); ++ s_callee (); ++ sc_callee (); ++} ++ ++#pragma GCC target "+nosme" ++ ++/* ++** sc_caller: ++** stp x29, x30, \[sp, #?-96\]! ++** mov x29, sp ++** cntd x16 ++** str x16, \[sp, #?24\] ++** stp d8, d9, \[sp, #?32\] ++** stp d10, d11, \[sp, #?48\] ++** stp d12, d13, \[sp, #?64\] ++** stp d14, d15, \[sp, #?80\] ++** bl __arm_sme_state ++** str x0, \[x29, #?16\] ++** ... ++** bl sc_callee ++** ldp d8, d9, \[sp, #?32\] ++** ldp d10, d11, \[sp, #?48\] ++** ldp d12, d13, \[sp, #?64\] ++** ldp d14, d15, \[sp, #?80\] ++** ldp x29, x30, \[sp\], #?96 ++** ret ++*/ ++void ++sc_caller () [[arm::streaming_compatible]] ++{ ++ ns_callee (); ++ sc_callee (); ++} ++ ++/* ++** sc_caller_x0: ++** ... ++** mov x10, x0 ++** bl __arm_sme_state ++** ... ++** str wzr, \[x10\] ++** ... ++*/ ++void ++sc_caller_x0 (int *ptr) [[arm::streaming_compatible]] ++{ ++ *ptr = 0; ++ ns_callee (); ++ sc_callee (); ++} ++ ++/* ++** sc_caller_x1: ++** ... ++** mov x10, x0 ++** mov x11, x1 ++** bl __arm_sme_state ++** ... ++** str w11, \[x10\] ++** ... 
++*/ ++void ++sc_caller_x1 (int *ptr, int a) [[arm::streaming_compatible]] ++{ ++ *ptr = a; ++ ns_callee (); ++ sc_callee (); ++} ++ ++/* ++** sc_caller_stack: ++** sub sp, sp, #112 ++** stp x29, x30, \[sp, #?16\] ++** add x29, sp, #?16 ++** ... ++** stp d8, d9, \[sp, #?48\] ++** ... ++** bl __arm_sme_state ++** str x0, \[x29, #?16\] ++** ... ++** bl ns_callee_stack ++** ldr x16, \[x29, #?16\] ++** tbz x16, 0, .* ++** smstart sm ++** ... ++*/ ++void ++sc_caller_stack () [[arm::streaming_compatible]] ++{ ++ ns_callee_stack (0, 0, 0, 0, 0, 0, 0, 0, 0); ++} ++ ++/* { dg-final { scan-assembler {n_caller:(?:(?!ret).)*\.cfi_offset 46, -80\n} } } */ ++/* { dg-final { scan-assembler {s_caller:(?:(?!ret).)*\.cfi_offset 46, -80\n} } } */ ++/* { dg-final { scan-assembler {sc_caller_sme:(?:(?!ret).)*\.cfi_offset 46, -72\n} } } */ ++/* { dg-final { scan-assembler {sc_caller:(?:(?!ret).)*\.cfi_offset 46, -72\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_10.c b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_10.c +new file mode 100644 +index 000000000..49c5e4a6a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_10.c +@@ -0,0 +1,37 @@ ++// { dg-options "" } ++ ++#pragma GCC target "+nosme" ++ ++void ns_callee (); ++ void s_callee () [[arm::streaming]]; ++ void sc_callee () [[arm::streaming_compatible]]; ++ ++struct callbacks { ++ void (*ns_ptr) (); ++ void (*s_ptr) () [[arm::streaming]]; ++ void (*sc_ptr) () [[arm::streaming_compatible]]; ++}; ++ ++void ++n_caller (struct callbacks *c) ++{ ++ ns_callee (); ++ s_callee (); // { dg-error "calling a streaming function requires the ISA extension 'sme'" } ++ sc_callee (); ++ ++ c->ns_ptr (); ++ c->s_ptr (); // { dg-error "calling a streaming function requires the ISA extension 'sme'" } ++ c->sc_ptr (); ++} ++ ++void ++sc_caller_sme (struct callbacks *c) [[arm::streaming_compatible]] ++{ ++ ns_callee (); ++ s_callee (); // { dg-error "calling a streaming function requires the ISA extension 'sme'" } ++ sc_callee (); ++ ++ c->ns_ptr (); ++ c->s_ptr (); // { dg-error "calling a streaming function requires the ISA extension 'sme'" } ++ c->sc_ptr (); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_2.c b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_2.c +new file mode 100644 +index 000000000..890fcbc5b +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_2.c +@@ -0,0 +1,43 @@ ++// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" } ++ ++void ns_callee (); ++ void s_callee () [[arm::streaming]]; ++ void sc_callee () [[arm::streaming_compatible]]; ++ ++struct callbacks { ++ void (*ns_ptr) (); ++ void (*s_ptr) () [[arm::streaming]]; ++ void (*sc_ptr) () [[arm::streaming_compatible]]; ++}; ++ ++void ++n_caller (struct callbacks *c) ++{ ++ ns_callee (); ++ sc_callee (); ++ ++ c->ns_ptr (); ++ c->sc_ptr (); ++} ++ ++void ++s_caller (struct callbacks *c) [[arm::streaming]] ++{ ++ s_callee (); ++ sc_callee (); ++ ++ c->s_ptr (); ++ c->sc_ptr (); ++} ++ ++void ++sc_caller (struct callbacks *c) [[arm::streaming_compatible]] ++{ ++ sc_callee (); ++ ++ c->sc_ptr (); ++} ++ ++// { dg-final { scan-assembler-not {[dpqz][0-9]+,} } } ++// { dg-final { scan-assembler-not {smstart\tsm} } } ++// { dg-final { scan-assembler-not {smstop\tsm} } } +diff --git a/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_3.c b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_3.c +new file mode 100644 +index 000000000..ed999d085 +--- /dev/null ++++ 
b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_3.c +@@ -0,0 +1,166 @@ ++// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" } ++// { dg-final { check-function-bodies "**" "" } } ++ ++__attribute__((aarch64_vector_pcs)) void ns_callee (); ++__attribute__((aarch64_vector_pcs)) void s_callee () [[arm::streaming]]; ++__attribute__((aarch64_vector_pcs)) void sc_callee () [[arm::streaming_compatible]]; ++ ++struct callbacks { ++ __attribute__((aarch64_vector_pcs)) void (*ns_ptr) (); ++ __attribute__((aarch64_vector_pcs)) void (*s_ptr) () [[arm::streaming]]; ++ __attribute__((aarch64_vector_pcs)) void (*sc_ptr) () [[arm::streaming_compatible]]; ++}; ++ ++/* ++** n_caller: { target lp64 } ++** stp x30, (x19|x2[0-8]), \[sp, #?-288\]! ++** cntd x16 ++** str x16, \[sp, #?16\] ++** stp q8, q9, \[sp, #?32\] ++** stp q10, q11, \[sp, #?64\] ++** stp q12, q13, \[sp, #?96\] ++** stp q14, q15, \[sp, #?128\] ++** stp q16, q17, \[sp, #?160\] ++** stp q18, q19, \[sp, #?192\] ++** stp q20, q21, \[sp, #?224\] ++** stp q22, q23, \[sp, #?256\] ++** mov \1, x0 ++** bl ns_callee ++** smstart sm ++** bl s_callee ++** smstop sm ++** bl sc_callee ++** ldr (x[0-9]+), \[\1\] ++** blr \2 ++** ldr (x[0-9]+), \[\1, #?8\] ++** smstart sm ++** blr \3 ++** smstop sm ++** ldr (x[0-9]+), \[\1, #?16\] ++** blr \4 ++** ldp q8, q9, \[sp, #?32\] ++** ldp q10, q11, \[sp, #?64\] ++** ldp q12, q13, \[sp, #?96\] ++** ldp q14, q15, \[sp, #?128\] ++** ldp q16, q17, \[sp, #?160\] ++** ldp q18, q19, \[sp, #?192\] ++** ldp q20, q21, \[sp, #?224\] ++** ldp q22, q23, \[sp, #?256\] ++** ldp x30, \1, \[sp\], #?288 ++** ret ++*/ ++void __attribute__((aarch64_vector_pcs)) ++n_caller (struct callbacks *c) ++{ ++ ns_callee (); ++ s_callee (); ++ sc_callee (); ++ ++ c->ns_ptr (); ++ c->s_ptr (); ++ c->sc_ptr (); ++} ++ ++/* ++** s_caller: { target lp64 } ++** stp x30, (x19|x2[0-8]), \[sp, #?-288\]! ++** cntd x16 ++** str x16, \[sp, #?16\] ++** stp q8, q9, \[sp, #?32\] ++** stp q10, q11, \[sp, #?64\] ++** stp q12, q13, \[sp, #?96\] ++** stp q14, q15, \[sp, #?128\] ++** stp q16, q17, \[sp, #?160\] ++** stp q18, q19, \[sp, #?192\] ++** stp q20, q21, \[sp, #?224\] ++** stp q22, q23, \[sp, #?256\] ++** mov \1, x0 ++** smstop sm ++** bl ns_callee ++** smstart sm ++** bl s_callee ++** bl sc_callee ++** ldr (x[0-9]+), \[\1\] ++** smstop sm ++** blr \2 ++** smstart sm ++** ldr (x[0-9]+), \[\1, #?8\] ++** blr \3 ++** ldr (x[0-9]+), \[\1, #?16\] ++** blr \4 ++** ldp q8, q9, \[sp, #?32\] ++** ldp q10, q11, \[sp, #?64\] ++** ldp q12, q13, \[sp, #?96\] ++** ldp q14, q15, \[sp, #?128\] ++** ldp q16, q17, \[sp, #?160\] ++** ldp q18, q19, \[sp, #?192\] ++** ldp q20, q21, \[sp, #?224\] ++** ldp q22, q23, \[sp, #?256\] ++** ldp x30, \1, \[sp\], #?288 ++** ret ++*/ ++void __attribute__((aarch64_vector_pcs)) ++s_caller (struct callbacks *c) [[arm::streaming]] ++{ ++ ns_callee (); ++ s_callee (); ++ sc_callee (); ++ ++ c->ns_ptr (); ++ c->s_ptr (); ++ c->sc_ptr (); ++} ++ ++/* ++** sc_caller: ++** stp x29, x30, \[sp, #?-288\]! 
++** mov x29, sp ++** cntd x16 ++** str x16, \[sp, #?24\] ++** stp q8, q9, \[sp, #?32\] ++** stp q10, q11, \[sp, #?64\] ++** stp q12, q13, \[sp, #?96\] ++** stp q14, q15, \[sp, #?128\] ++** stp q16, q17, \[sp, #?160\] ++** stp q18, q19, \[sp, #?192\] ++** stp q20, q21, \[sp, #?224\] ++** stp q22, q23, \[sp, #?256\] ++** mrs x16, svcr ++** str x16, \[x29, #?16\] ++** ldr x16, \[x29, #?16\] ++** tbz x16, 0, .* ++** smstop sm ++** bl ns_callee ++** ldr x16, \[x29, #?16\] ++** tbz x16, 0, .* ++** smstart sm ++** ldr x16, \[x29, #?16\] ++** tbnz x16, 0, .* ++** smstart sm ++** bl s_callee ++** ldr x16, \[x29, #?16\] ++** tbnz x16, 0, .* ++** smstop sm ++** bl sc_callee ++** ldp q8, q9, \[sp, #?32\] ++** ldp q10, q11, \[sp, #?64\] ++** ldp q12, q13, \[sp, #?96\] ++** ldp q14, q15, \[sp, #?128\] ++** ldp q16, q17, \[sp, #?160\] ++** ldp q18, q19, \[sp, #?192\] ++** ldp q20, q21, \[sp, #?224\] ++** ldp q22, q23, \[sp, #?256\] ++** ldp x29, x30, \[sp\], #?288 ++** ret ++*/ ++void __attribute__((aarch64_vector_pcs)) ++sc_caller () [[arm::streaming_compatible]] ++{ ++ ns_callee (); ++ s_callee (); ++ sc_callee (); ++} ++ ++/* { dg-final { scan-assembler {n_caller:(?:(?!ret).)*\.cfi_offset 46, -272\n} } } */ ++/* { dg-final { scan-assembler {s_caller:(?:(?!ret).)*\.cfi_offset 46, -272\n} } } */ ++/* { dg-final { scan-assembler {sc_caller:(?:(?!ret).)*\.cfi_offset 46, -264\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_4.c b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_4.c +new file mode 100644 +index 000000000..f93a67f97 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_4.c +@@ -0,0 +1,43 @@ ++// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" } ++ ++__attribute__((aarch64_vector_pcs)) void ns_callee (); ++__attribute__((aarch64_vector_pcs)) void s_callee () [[arm::streaming]]; ++__attribute__((aarch64_vector_pcs)) void sc_callee () [[arm::streaming_compatible]]; ++ ++struct callbacks { ++ __attribute__((aarch64_vector_pcs)) void (*ns_ptr) (); ++ __attribute__((aarch64_vector_pcs)) void (*s_ptr) () [[arm::streaming]]; ++ __attribute__((aarch64_vector_pcs)) void (*sc_ptr) () [[arm::streaming_compatible]]; ++}; ++ ++void __attribute__((aarch64_vector_pcs)) ++n_caller (struct callbacks *c) ++{ ++ ns_callee (); ++ sc_callee (); ++ ++ c->ns_ptr (); ++ c->sc_ptr (); ++} ++ ++void __attribute__((aarch64_vector_pcs)) ++s_caller (struct callbacks *c) [[arm::streaming]] ++{ ++ s_callee (); ++ sc_callee (); ++ ++ c->s_ptr (); ++ c->sc_ptr (); ++} ++ ++void __attribute__((aarch64_vector_pcs)) ++sc_caller (struct callbacks *c) [[arm::streaming_compatible]] ++{ ++ sc_callee (); ++ ++ c->sc_ptr (); ++} ++ ++// { dg-final { scan-assembler-not {[dpqz][0-9]+,} } } ++// { dg-final { scan-assembler-not {smstart\tsm} } } ++// { dg-final { scan-assembler-not {smstop\tsm} } } +diff --git a/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_5.c b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_5.c +new file mode 100644 +index 000000000..be9b5cc04 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_5.c +@@ -0,0 +1,318 @@ ++// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" } ++// { dg-final { check-function-bodies "**" "" } } ++ ++#include <arm_sve.h> ++ ++svbool_t ns_callee (); ++ svbool_t s_callee () [[arm::streaming]]; ++ svbool_t sc_callee () [[arm::streaming_compatible]]; ++ ++struct callbacks { ++ svbool_t (*ns_ptr) (); ++ svbool_t (*s_ptr) () [[arm::streaming]]; ++ svbool_t (*sc_ptr) () 
[[arm::streaming_compatible]]; ++}; ++ ++/* ++** n_caller: { target lp64 } ++** stp x30, (x19|x2[0-8]), \[sp, #?-32\]! ++** cntd x16 ++** str x16, \[sp, #?16\] ++** addvl sp, sp, #-18 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str p12, \[sp, #8, mul vl\] ++** str p13, \[sp, #9, mul vl\] ++** str p14, \[sp, #10, mul vl\] ++** str p15, \[sp, #11, mul vl\] ++** str z8, \[sp, #2, mul vl\] ++** str z9, \[sp, #3, mul vl\] ++** str z10, \[sp, #4, mul vl\] ++** str z11, \[sp, #5, mul vl\] ++** str z12, \[sp, #6, mul vl\] ++** str z13, \[sp, #7, mul vl\] ++** str z14, \[sp, #8, mul vl\] ++** str z15, \[sp, #9, mul vl\] ++** str z16, \[sp, #10, mul vl\] ++** str z17, \[sp, #11, mul vl\] ++** str z18, \[sp, #12, mul vl\] ++** str z19, \[sp, #13, mul vl\] ++** str z20, \[sp, #14, mul vl\] ++** str z21, \[sp, #15, mul vl\] ++** str z22, \[sp, #16, mul vl\] ++** str z23, \[sp, #17, mul vl\] ++** mov \1, x0 ++** bl ns_callee ++** smstart sm ++** bl s_callee ++** addvl sp, sp, #-1 ++** str p0, \[sp\] ++** smstop sm ++** ldr p0, \[sp\] ++** addvl sp, sp, #1 ++** bl sc_callee ++** ldr (x[0-9]+), \[\1\] ++** blr \2 ++** ldr (x[0-9]+), \[\1, #?8\] ++** smstart sm ++** blr \3 ++** addvl sp, sp, #-1 ++** str p0, \[sp\] ++** smstop sm ++** ldr p0, \[sp\] ++** addvl sp, sp, #1 ++** ldr (x[0-9]+), \[\1, #?16\] ++** blr \4 ++** ldr z8, \[sp, #2, mul vl\] ++** ldr z9, \[sp, #3, mul vl\] ++** ldr z10, \[sp, #4, mul vl\] ++** ldr z11, \[sp, #5, mul vl\] ++** ldr z12, \[sp, #6, mul vl\] ++** ldr z13, \[sp, #7, mul vl\] ++** ldr z14, \[sp, #8, mul vl\] ++** ldr z15, \[sp, #9, mul vl\] ++** ldr z16, \[sp, #10, mul vl\] ++** ldr z17, \[sp, #11, mul vl\] ++** ldr z18, \[sp, #12, mul vl\] ++** ldr z19, \[sp, #13, mul vl\] ++** ldr z20, \[sp, #14, mul vl\] ++** ldr z21, \[sp, #15, mul vl\] ++** ldr z22, \[sp, #16, mul vl\] ++** ldr z23, \[sp, #17, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** ldr p12, \[sp, #8, mul vl\] ++** ldr p13, \[sp, #9, mul vl\] ++** ldr p14, \[sp, #10, mul vl\] ++** ldr p15, \[sp, #11, mul vl\] ++** addvl sp, sp, #18 ++** ldp x30, \1, \[sp\], #?32 ++** ret ++*/ ++svbool_t ++n_caller (struct callbacks *c) ++{ ++ ns_callee (); ++ s_callee (); ++ sc_callee (); ++ ++ c->ns_ptr (); ++ c->s_ptr (); ++ return c->sc_ptr (); ++} ++ ++/* ++** s_caller: { target lp64 } ++** stp x30, (x19|x2[0-8]), \[sp, #?-32\]! 
++** cntd x16 ++** str x16, \[sp, #?16\] ++** addvl sp, sp, #-18 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str p12, \[sp, #8, mul vl\] ++** str p13, \[sp, #9, mul vl\] ++** str p14, \[sp, #10, mul vl\] ++** str p15, \[sp, #11, mul vl\] ++** str z8, \[sp, #2, mul vl\] ++** str z9, \[sp, #3, mul vl\] ++** str z10, \[sp, #4, mul vl\] ++** str z11, \[sp, #5, mul vl\] ++** str z12, \[sp, #6, mul vl\] ++** str z13, \[sp, #7, mul vl\] ++** str z14, \[sp, #8, mul vl\] ++** str z15, \[sp, #9, mul vl\] ++** str z16, \[sp, #10, mul vl\] ++** str z17, \[sp, #11, mul vl\] ++** str z18, \[sp, #12, mul vl\] ++** str z19, \[sp, #13, mul vl\] ++** str z20, \[sp, #14, mul vl\] ++** str z21, \[sp, #15, mul vl\] ++** str z22, \[sp, #16, mul vl\] ++** str z23, \[sp, #17, mul vl\] ++** mov \1, x0 ++** smstop sm ++** bl ns_callee ++** addvl sp, sp, #-1 ++** str p0, \[sp\] ++** smstart sm ++** ldr p0, \[sp\] ++** addvl sp, sp, #1 ++** bl s_callee ++** bl sc_callee ++** ldr (x[0-9]+), \[\1\] ++** smstop sm ++** blr \2 ++** addvl sp, sp, #-1 ++** str p0, \[sp\] ++** smstart sm ++** ldr p0, \[sp\] ++** addvl sp, sp, #1 ++** ldr (x[0-9]+), \[\1, #?8\] ++** blr \3 ++** ldr (x[0-9]+), \[\1, #?16\] ++** blr \4 ++** ldr z8, \[sp, #2, mul vl\] ++** ldr z9, \[sp, #3, mul vl\] ++** ldr z10, \[sp, #4, mul vl\] ++** ldr z11, \[sp, #5, mul vl\] ++** ldr z12, \[sp, #6, mul vl\] ++** ldr z13, \[sp, #7, mul vl\] ++** ldr z14, \[sp, #8, mul vl\] ++** ldr z15, \[sp, #9, mul vl\] ++** ldr z16, \[sp, #10, mul vl\] ++** ldr z17, \[sp, #11, mul vl\] ++** ldr z18, \[sp, #12, mul vl\] ++** ldr z19, \[sp, #13, mul vl\] ++** ldr z20, \[sp, #14, mul vl\] ++** ldr z21, \[sp, #15, mul vl\] ++** ldr z22, \[sp, #16, mul vl\] ++** ldr z23, \[sp, #17, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** ldr p12, \[sp, #8, mul vl\] ++** ldr p13, \[sp, #9, mul vl\] ++** ldr p14, \[sp, #10, mul vl\] ++** ldr p15, \[sp, #11, mul vl\] ++** addvl sp, sp, #18 ++** ldp x30, \1, \[sp\], #?32 ++** ret ++*/ ++svbool_t ++s_caller (struct callbacks *c) [[arm::streaming]] ++{ ++ ns_callee (); ++ s_callee (); ++ sc_callee (); ++ ++ c->ns_ptr (); ++ c->s_ptr (); ++ return c->sc_ptr (); ++} ++ ++/* ++** sc_caller: ++** stp x29, x30, \[sp, #?-32\]! 
++** mov x29, sp ++** cntd x16 ++** str x16, \[sp, #?24\] ++** addvl sp, sp, #-18 ++** str p4, \[sp\] ++** str p5, \[sp, #1, mul vl\] ++** str p6, \[sp, #2, mul vl\] ++** str p7, \[sp, #3, mul vl\] ++** str p8, \[sp, #4, mul vl\] ++** str p9, \[sp, #5, mul vl\] ++** str p10, \[sp, #6, mul vl\] ++** str p11, \[sp, #7, mul vl\] ++** str p12, \[sp, #8, mul vl\] ++** str p13, \[sp, #9, mul vl\] ++** str p14, \[sp, #10, mul vl\] ++** str p15, \[sp, #11, mul vl\] ++** str z8, \[sp, #2, mul vl\] ++** str z9, \[sp, #3, mul vl\] ++** str z10, \[sp, #4, mul vl\] ++** str z11, \[sp, #5, mul vl\] ++** str z12, \[sp, #6, mul vl\] ++** str z13, \[sp, #7, mul vl\] ++** str z14, \[sp, #8, mul vl\] ++** str z15, \[sp, #9, mul vl\] ++** str z16, \[sp, #10, mul vl\] ++** str z17, \[sp, #11, mul vl\] ++** str z18, \[sp, #12, mul vl\] ++** str z19, \[sp, #13, mul vl\] ++** str z20, \[sp, #14, mul vl\] ++** str z21, \[sp, #15, mul vl\] ++** str z22, \[sp, #16, mul vl\] ++** str z23, \[sp, #17, mul vl\] ++** mrs x16, svcr ++** str x16, \[x29, #?16\] ++** ldr x16, \[x29, #?16\] ++** tbz x16, 0, .* ++** smstop sm ++** bl ns_callee ++** ldr x16, \[x29, #?16\] ++** tbz x16, 0, .* ++** addvl sp, sp, #-1 ++** str p0, \[sp\] ++** smstart sm ++** ldr p0, \[sp\] ++** addvl sp, sp, #1 ++** ldr x16, \[x29, #?16\] ++** tbnz x16, 0, .* ++** smstart sm ++** bl s_callee ++** ldr x16, \[x29, #?16\] ++** tbnz x16, 0, .* ++** addvl sp, sp, #-1 ++** str p0, \[sp\] ++** smstop sm ++** ldr p0, \[sp\] ++** addvl sp, sp, #1 ++** bl sc_callee ++** ldr z8, \[sp, #2, mul vl\] ++** ldr z9, \[sp, #3, mul vl\] ++** ldr z10, \[sp, #4, mul vl\] ++** ldr z11, \[sp, #5, mul vl\] ++** ldr z12, \[sp, #6, mul vl\] ++** ldr z13, \[sp, #7, mul vl\] ++** ldr z14, \[sp, #8, mul vl\] ++** ldr z15, \[sp, #9, mul vl\] ++** ldr z16, \[sp, #10, mul vl\] ++** ldr z17, \[sp, #11, mul vl\] ++** ldr z18, \[sp, #12, mul vl\] ++** ldr z19, \[sp, #13, mul vl\] ++** ldr z20, \[sp, #14, mul vl\] ++** ldr z21, \[sp, #15, mul vl\] ++** ldr z22, \[sp, #16, mul vl\] ++** ldr z23, \[sp, #17, mul vl\] ++** ldr p4, \[sp\] ++** ldr p5, \[sp, #1, mul vl\] ++** ldr p6, \[sp, #2, mul vl\] ++** ldr p7, \[sp, #3, mul vl\] ++** ldr p8, \[sp, #4, mul vl\] ++** ldr p9, \[sp, #5, mul vl\] ++** ldr p10, \[sp, #6, mul vl\] ++** ldr p11, \[sp, #7, mul vl\] ++** ldr p12, \[sp, #8, mul vl\] ++** ldr p13, \[sp, #9, mul vl\] ++** ldr p14, \[sp, #10, mul vl\] ++** ldr p15, \[sp, #11, mul vl\] ++** addvl sp, sp, #18 ++** ldp x29, x30, \[sp\], #?32 ++** ret ++*/ ++svbool_t ++sc_caller () [[arm::streaming_compatible]] ++{ ++ ns_callee (); ++ s_callee (); ++ return sc_callee (); ++} ++ ++/* { dg-final { scan-assembler {n_caller:(?:(?!ret).)*\.cfi_offset 46, -16\n} } } */ ++/* { dg-final { scan-assembler {s_caller:(?:(?!ret).)*\.cfi_offset 46, -16\n} } } */ ++/* { dg-final { scan-assembler {sc_caller:(?:(?!ret).)*\.cfi_offset 46, -8\n} } } */ +diff --git a/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_6.c b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_6.c +new file mode 100644 +index 000000000..0f6bc4f6c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_6.c +@@ -0,0 +1,45 @@ ++// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" } ++ ++#include <arm_sve.h> ++ ++svbool_t ns_callee (); ++ svbool_t s_callee () [[arm::streaming]]; ++ svbool_t sc_callee () [[arm::streaming_compatible]]; ++ ++struct callbacks { ++ svbool_t (*ns_ptr) (); ++ svbool_t (*s_ptr) () [[arm::streaming]]; ++ svbool_t (*sc_ptr) () [[arm::streaming_compatible]]; ++}; ++ 
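++/* Every call below leaves PSTATE.SM alone: each caller only reaches
++   callees that either share its mode or are streaming-compatible.
++   With no mode switch there is also no need to save and restore Z
++   and P state, which is what the scan-assembler-not directives at
++   the end of the file check.  */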
++svbool_t ++n_caller (struct callbacks *c) ++{ ++ ns_callee (); ++ sc_callee (); ++ ++ c->ns_ptr (); ++ return c->sc_ptr (); ++} ++ ++svbool_t ++s_caller (struct callbacks *c) [[arm::streaming]] ++{ ++ s_callee (); ++ sc_callee (); ++ ++ c->s_ptr (); ++ return c->sc_ptr (); ++} ++ ++svbool_t ++sc_caller (struct callbacks *c) [[arm::streaming_compatible]] ++{ ++ sc_callee (); ++ ++ return c->sc_ptr (); ++} ++ ++// { dg-final { scan-assembler-not {[dpqz][0-9]+,} } } ++// { dg-final { scan-assembler-not {smstart\tsm} } } ++// { dg-final { scan-assembler-not {smstop\tsm} } } +diff --git a/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_7.c b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_7.c +new file mode 100644 +index 000000000..6482a489f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_7.c +@@ -0,0 +1,516 @@ ++// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" } ++// { dg-final { check-function-bodies "**" "" } } ++ ++#include <arm_neon.h> ++#include <arm_sve.h> ++ ++double produce_d0 (); ++void consume_d0 (double); ++ ++/* ++** test_d0: ++** ... ++** smstop sm ++** bl produce_d0 ++** fmov x10, d0 ++** smstart sm ++** fmov d0, x10 ++** fmov x10, d0 ++** smstop sm ++** fmov d0, x10 ++** bl consume_d0 ++** ... ++*/ ++void ++test_d0 () [[arm::streaming]] ++{ ++ double res = produce_d0 (); ++ asm volatile (""); ++ consume_d0 (res); ++} ++ ++int8x8_t produce_d0_vec (); ++void consume_d0_vec (int8x8_t); ++ ++/* ++** test_d0_vec: ++** ... ++** smstop sm ++** bl produce_d0_vec ++** ( ++** fmov x10, d0 ++** | ++** umov x10, v0.d\[0\] ++** ) ++** smstart sm ++** fmov d0, x10 ++** ( ++** fmov x10, d0 ++** | ++** umov x10, v0.d\[0\] ++** ) ++** smstop sm ++** fmov d0, x10 ++** bl consume_d0_vec ++** ... ++*/ ++void ++test_d0_vec () [[arm::streaming]] ++{ ++ int8x8_t res = produce_d0_vec (); ++ asm volatile (""); ++ consume_d0_vec (res); ++} ++ ++int8x16_t produce_q0 (); ++void consume_q0 (int8x16_t); ++ ++/* ++** test_q0: ++** ... ++** smstop sm ++** bl produce_q0 ++** str q0, \[sp, #?-16\]! ++** smstart sm ++** ldr q0, \[sp\], #?16 ++** str q0, \[sp, #?-16\]! ++** smstop sm ++** ldr q0, \[sp\], #?16 ++** bl consume_q0 ++** ... ++*/ ++void ++test_q0 () [[arm::streaming]] ++{ ++ int8x16_t res = produce_q0 (); ++ asm volatile (""); ++ consume_q0 (res); ++} ++ ++int8x16x2_t produce_q1 (); ++void consume_q1 (int8x16x2_t); ++ ++/* ++** test_q1: ++** ... ++** smstop sm ++** bl produce_q1 ++** stp q0, q1, \[sp, #?-32\]! ++** smstart sm ++** ldp q0, q1, \[sp\], #?32 ++** stp q0, q1, \[sp, #?-32\]! ++** smstop sm ++** ldp q0, q1, \[sp\], #?32 ++** bl consume_q1 ++** ... ++*/ ++void ++test_q1 () [[arm::streaming]] ++{ ++ int8x16x2_t res = produce_q1 (); ++ asm volatile (""); ++ consume_q1 (res); ++} ++ ++int8x16x3_t produce_q2 (); ++void consume_q2 (int8x16x3_t); ++ ++/* ++** test_q2: ++** ... ++** smstop sm ++** bl produce_q2 ++** stp q0, q1, \[sp, #?-48\]! ++** str q2, \[sp, #?32\] ++** smstart sm ++** ldr q2, \[sp, #?32\] ++** ldp q0, q1, \[sp\], #?48 ++** stp q0, q1, \[sp, #?-48\]! ++** str q2, \[sp, #?32\] ++** smstop sm ++** ldr q2, \[sp, #?32\] ++** ldp q0, q1, \[sp\], #?48 ++** bl consume_q2 ++** ... ++*/ ++void ++test_q2 () [[arm::streaming]] ++{ ++ int8x16x3_t res = produce_q2 (); ++ asm volatile (""); ++ consume_q2 (res); ++} ++ ++int8x16x4_t produce_q3 (); ++void consume_q3 (int8x16x4_t); ++ ++/* ++** test_q3: ++** ... ++** smstop sm ++** bl produce_q3 ++** stp q0, q1, \[sp, #?-64\]! 
++** stp q2, q3, \[sp, #?32\] ++** smstart sm ++** ldp q2, q3, \[sp, #?32\] ++** ldp q0, q1, \[sp\], #?64 ++** stp q0, q1, \[sp, #?-64\]! ++** stp q2, q3, \[sp, #?32\] ++** smstop sm ++** ldp q2, q3, \[sp, #?32\] ++** ldp q0, q1, \[sp\], #?64 ++** bl consume_q3 ++** ... ++*/ ++void ++test_q3 () [[arm::streaming]] ++{ ++ int8x16x4_t res = produce_q3 (); ++ asm volatile (""); ++ consume_q3 (res); ++} ++ ++svint8_t produce_z0 (); ++void consume_z0 (svint8_t); ++ ++/* ++** test_z0: ++** ... ++** smstop sm ++** bl produce_z0 ++** addvl sp, sp, #-1 ++** str z0, \[sp\] ++** smstart sm ++** ldr z0, \[sp\] ++** addvl sp, sp, #1 ++** addvl sp, sp, #-1 ++** str z0, \[sp\] ++** smstop sm ++** ldr z0, \[sp\] ++** addvl sp, sp, #1 ++** bl consume_z0 ++** ... ++*/ ++void ++test_z0 () [[arm::streaming]] ++{ ++ svint8_t res = produce_z0 (); ++ asm volatile (""); ++ consume_z0 (res); ++} ++ ++svint8x4_t produce_z3 (); ++void consume_z3 (svint8x4_t); ++ ++/* ++** test_z3: ++** ... ++** smstop sm ++** bl produce_z3 ++** addvl sp, sp, #-4 ++** str z0, \[sp\] ++** str z1, \[sp, #1, mul vl\] ++** str z2, \[sp, #2, mul vl\] ++** str z3, \[sp, #3, mul vl\] ++** smstart sm ++** ldr z0, \[sp\] ++** ldr z1, \[sp, #1, mul vl\] ++** ldr z2, \[sp, #2, mul vl\] ++** ldr z3, \[sp, #3, mul vl\] ++** addvl sp, sp, #4 ++** addvl sp, sp, #-4 ++** str z0, \[sp\] ++** str z1, \[sp, #1, mul vl\] ++** str z2, \[sp, #2, mul vl\] ++** str z3, \[sp, #3, mul vl\] ++** smstop sm ++** ldr z0, \[sp\] ++** ldr z1, \[sp, #1, mul vl\] ++** ldr z2, \[sp, #2, mul vl\] ++** ldr z3, \[sp, #3, mul vl\] ++** addvl sp, sp, #4 ++** bl consume_z3 ++** ... ++*/ ++void ++test_z3 () [[arm::streaming]] ++{ ++ svint8x4_t res = produce_z3 (); ++ asm volatile (""); ++ consume_z3 (res); ++} ++ ++svbool_t produce_p0 (); ++void consume_p0 (svbool_t); ++ ++/* ++** test_p0: ++** ... ++** smstop sm ++** bl produce_p0 ++** addvl sp, sp, #-1 ++** str p0, \[sp\] ++** smstart sm ++** ldr p0, \[sp\] ++** addvl sp, sp, #1 ++** addvl sp, sp, #-1 ++** str p0, \[sp\] ++** smstop sm ++** ldr p0, \[sp\] ++** addvl sp, sp, #1 ++** bl consume_p0 ++** ... ++*/ ++void ++test_p0 () [[arm::streaming]] ++{ ++ svbool_t res = produce_p0 (); ++ asm volatile (""); ++ consume_p0 (res); ++} ++ ++void consume_d7 (double, double, double, double, double, double, double, ++ double); ++ ++/* ++** test_d7: ++** ... ++** fmov x10, d0 ++** fmov x11, d1 ++** fmov x12, d2 ++** fmov x13, d3 ++** fmov x14, d4 ++** fmov x15, d5 ++** fmov x16, d6 ++** fmov x17, d7 ++** smstop sm ++** fmov d0, x10 ++** fmov d1, x11 ++** fmov d2, x12 ++** fmov d3, x13 ++** fmov d4, x14 ++** fmov d5, x15 ++** fmov d6, x16 ++** fmov d7, x17 ++** bl consume_d7 ++** ... ++*/ ++void ++test_d7 () [[arm::streaming]] ++{ ++ consume_d7 (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); ++} ++ ++void consume_d7_vec (int8x8_t, int8x8_t, int8x8_t, int8x8_t, int8x8_t, ++ int8x8_t, int8x8_t, int8x8_t); ++ ++/* ++** test_d7_vec: ++** ... ++** ( ++** fmov x10, d0 ++** fmov x11, d1 ++** fmov x12, d2 ++** fmov x13, d3 ++** fmov x14, d4 ++** fmov x15, d5 ++** fmov x16, d6 ++** fmov x17, d7 ++** | ++** umov x10, v0.d\[0\] ++** umov x11, v1.d\[0\] ++** umov x12, v2.d\[0\] ++** umov x13, v3.d\[0\] ++** umov x14, v4.d\[0\] ++** umov x15, v5.d\[0\] ++** umov x16, v6.d\[0\] ++** umov x17, v7.d\[0\] ++** ) ++** smstop sm ++** fmov d0, x10 ++** fmov d1, x11 ++** fmov d2, x12 ++** fmov d3, x13 ++** fmov d4, x14 ++** fmov d5, x15 ++** fmov d6, x16 ++** fmov d7, x17 ++** bl consume_d7_vec ++** ... 
++*/ ++void ++test_d7_vec (int8x8_t *ptr) [[arm::streaming]] ++{ ++ consume_d7_vec (*ptr, *ptr, *ptr, *ptr, *ptr, *ptr, *ptr, *ptr); ++} ++ ++void consume_q7 (int8x16_t, int8x16_t, int8x16_t, int8x16_t, int8x16_t, ++ int8x16_t, int8x16_t, int8x16_t); ++ ++/* ++** test_q7: ++** ... ++** stp q0, q1, \[sp, #?-128\]! ++** stp q2, q3, \[sp, #?32\] ++** stp q4, q5, \[sp, #?64\] ++** stp q6, q7, \[sp, #?96\] ++** smstop sm ++** ldp q2, q3, \[sp, #?32\] ++** ldp q4, q5, \[sp, #?64\] ++** ldp q6, q7, \[sp, #?96\] ++** ldp q0, q1, \[sp\], #?128 ++** bl consume_q7 ++** ... ++*/ ++void ++test_q7 (int8x16_t *ptr) [[arm::streaming]] ++{ ++ consume_q7 (*ptr, *ptr, *ptr, *ptr, *ptr, *ptr, *ptr, *ptr); ++} ++ ++void consume_z7 (svint8_t, svint8_t, svint8_t, svint8_t, svint8_t, ++ svint8_t, svint8_t, svint8_t); ++ ++/* ++** test_z7: ++** ... ++** addvl sp, sp, #-8 ++** str z0, \[sp\] ++** str z1, \[sp, #1, mul vl\] ++** str z2, \[sp, #2, mul vl\] ++** str z3, \[sp, #3, mul vl\] ++** str z4, \[sp, #4, mul vl\] ++** str z5, \[sp, #5, mul vl\] ++** str z6, \[sp, #6, mul vl\] ++** str z7, \[sp, #7, mul vl\] ++** smstop sm ++** ldr z0, \[sp\] ++** ldr z1, \[sp, #1, mul vl\] ++** ldr z2, \[sp, #2, mul vl\] ++** ldr z3, \[sp, #3, mul vl\] ++** ldr z4, \[sp, #4, mul vl\] ++** ldr z5, \[sp, #5, mul vl\] ++** ldr z6, \[sp, #6, mul vl\] ++** ldr z7, \[sp, #7, mul vl\] ++** addvl sp, sp, #8 ++** bl consume_z7 ++** ... ++*/ ++void ++test_z7 (svint8_t *ptr) [[arm::streaming]] ++{ ++ consume_z7 (*ptr, *ptr, *ptr, *ptr, *ptr, *ptr, *ptr, *ptr); ++} ++ ++void consume_p3 (svbool_t, svbool_t, svbool_t, svbool_t); ++ ++/* ++** test_p3: ++** ... ++** addvl sp, sp, #-1 ++** str p0, \[sp\] ++** str p1, \[sp, #1, mul vl\] ++** str p2, \[sp, #2, mul vl\] ++** str p3, \[sp, #3, mul vl\] ++** smstop sm ++** ldr p0, \[sp\] ++** ldr p1, \[sp, #1, mul vl\] ++** ldr p2, \[sp, #2, mul vl\] ++** ldr p3, \[sp, #3, mul vl\] ++** addvl sp, sp, #1 ++** bl consume_p3 ++** ... ++*/ ++void ++test_p3 (svbool_t *ptr) [[arm::streaming]] ++{ ++ consume_p3 (*ptr, *ptr, *ptr, *ptr); ++} ++ ++void consume_mixed (float, double, float32x4_t, svfloat32_t, ++ float, double, float64x2_t, svfloat64_t, ++ svbool_t, svbool_t, svbool_t, svbool_t); ++ ++/* ++** test_mixed: ++** ... ++** addvl sp, sp, #-3 ++** str p0, \[sp\] ++** str p1, \[sp, #1, mul vl\] ++** str p2, \[sp, #2, mul vl\] ++** str p3, \[sp, #3, mul vl\] ++** str z3, \[sp, #1, mul vl\] ++** str z7, \[sp, #2, mul vl\] ++** stp q2, q6, \[sp, #?-32\]! ++** fmov w10, s0 ++** fmov x11, d1 ++** fmov w12, s4 ++** fmov x13, d5 ++** smstop sm ++** fmov s0, w10 ++** fmov d1, x11 ++** fmov s4, w12 ++** fmov d5, x13 ++** ldp q2, q6, \[sp\], #?32 ++** ldr p0, \[sp\] ++** ldr p1, \[sp, #1, mul vl\] ++** ldr p2, \[sp, #2, mul vl\] ++** ldr p3, \[sp, #3, mul vl\] ++** ldr z3, \[sp, #1, mul vl\] ++** ldr z7, \[sp, #2, mul vl\] ++** addvl sp, sp, #3 ++** bl consume_mixed ++** ... ++*/ ++void ++test_mixed (float32x4_t *float32x4_ptr, ++ svfloat32_t *svfloat32_ptr, ++ float64x2_t *float64x2_ptr, ++ svfloat64_t *svfloat64_ptr, ++ svbool_t *svbool_ptr) [[arm::streaming]] ++{ ++ consume_mixed (1.0f, 2.0, *float32x4_ptr, *svfloat32_ptr, ++ 3.0f, 4.0, *float64x2_ptr, *svfloat64_ptr, ++ *svbool_ptr, *svbool_ptr, *svbool_ptr, *svbool_ptr); ++} ++ ++void consume_varargs (float, ...); ++ ++/* ++** test_varargs: ++** ... ++** stp q3, q7, \[sp, #?-32\]! 
++** fmov w10, s0 ++** fmov x11, d1 ++** ( ++** fmov x12, d2 ++** | ++** umov x12, v2.d\[0\] ++** ) ++** fmov x13, d4 ++** fmov x14, d5 ++** ( ++** fmov x15, d6 ++** | ++** umov x15, v6.d\[0\] ++** ) ++** smstop sm ++** fmov s0, w10 ++** fmov d1, x11 ++** fmov d2, x12 ++** fmov d4, x13 ++** fmov d5, x14 ++** fmov d6, x15 ++** ldp q3, q7, \[sp\], #?32 ++** bl consume_varargs ++** ... ++*/ ++void ++test_varargs (float32x2_t *float32x2_ptr, ++ float32x4_t *float32x4_ptr, ++ float64x1_t *float64x1_ptr, ++ float64x2_t *float64x2_ptr) [[arm::streaming]] ++{ ++ consume_varargs (1.0f, 2.0, *float32x2_ptr, *float32x4_ptr, ++ 3.0f, 4.0, *float64x1_ptr, *float64x2_ptr); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_8.c b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_8.c +new file mode 100644 +index 000000000..f44724df3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_8.c +@@ -0,0 +1,87 @@ ++// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls -msve-vector-bits=128" } ++// { dg-final { check-function-bodies "**" "" } } ++ ++#include <arm_sve.h> ++ ++svint8_t produce_z0 (); ++void consume_z0 (svint8_t); ++ ++/* ++** test_z0: ++** ... ++** smstop sm ++** bl produce_z0 ++** str q0, \[sp, #?-16\]! ++** smstart sm ++** ldr q0, \[sp\], #?16 ++** str q0, \[sp, #?-16\]! ++** smstop sm ++** ldr q0, \[sp\], #?16 ++** bl consume_z0 ++** ... ++*/ ++void ++test_z0 () [[arm::streaming]] ++{ ++ svint8_t res = produce_z0 (); ++ asm volatile (""); ++ consume_z0 (res); ++} ++ ++svint8x4_t produce_z3 (); ++void consume_z3 (svint8x4_t); ++ ++/* ++** test_z3: ++** ... ++** smstop sm ++** bl produce_z3 ++** stp q0, q1, \[sp, #?-64\]! ++** stp q2, q3, \[sp, #?32\] ++** smstart sm ++** ldp q2, q3, \[sp, #?32\] ++** ldp q0, q1, \[sp\], #?64 ++** stp q0, q1, \[sp, #?-64\]! ++** stp q2, q3, \[sp, #?32\] ++** smstop sm ++** ldp q2, q3, \[sp, #?32\] ++** ldp q0, q1, \[sp\], #?64 ++** bl consume_z3 ++** ... ++*/ ++void ++test_z3 () [[arm::streaming]] ++{ ++ svint8x4_t res = produce_z3 (); ++ asm volatile (""); ++ consume_z3 (res); ++} ++ ++svbool_t produce_p0 (); ++void consume_p0 (svbool_t); ++ ++/* ++** test_p0: ++** ... ++** smstop sm ++** bl produce_p0 ++** sub sp, sp, #?16 ++** str p0, \[sp\] ++** smstart sm ++** ldr p0, \[sp\] ++** add sp, sp, #?16 ++** sub sp, sp, #?16 ++** str p0, \[sp\] ++** smstop sm ++** ldr p0, \[sp\] ++** add sp, sp, #?16 ++** bl consume_p0 ++** ... ++*/ ++void ++test_p0 () [[arm::streaming]] ++{ ++ svbool_t res = produce_p0 (); ++ asm volatile (""); ++ consume_p0 (res); ++} +diff --git a/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_9.c b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_9.c +new file mode 100644 +index 000000000..83b4073ee +--- /dev/null ++++ b/gcc/testsuite/gcc.target/aarch64/sme/call_sm_switch_9.c +@@ -0,0 +1,103 @@ ++// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls -msve-vector-bits=256" } ++// { dg-final { check-function-bodies "**" "" } } ++ ++#include <arm_sve.h> ++ ++svint8_t produce_z0 (); ++void consume_z0 (svint8_t); ++ ++/* ++** test_z0: ++** ... ++** smstop sm ++** bl produce_z0 ++** sub sp, sp, #?32 ++** str z0, \[sp\] ++** smstart sm ++** ldr z0, \[sp\] ++** add sp, sp, #?32 ++** sub sp, sp, #?32 ++** str z0, \[sp\] ++** smstop sm ++** ldr z0, \[sp\] ++** add sp, sp, #?32 ++** bl consume_z0 ++** ... 
++*/
++void
++test_z0 () [[arm::streaming]]
++{
++  svint8_t res = produce_z0 ();
++  asm volatile ("");
++  consume_z0 (res);
++}
++
++svint8x4_t produce_z3 ();
++void consume_z3 (svint8x4_t);
++
++/*
++** test_z3:
++** ...
++** smstop sm
++** bl produce_z3
++** sub sp, sp, #?128
++** str z0, \[sp\]
++** str z1, \[sp, #1, mul vl\]
++** str z2, \[sp, #2, mul vl\]
++** str z3, \[sp, #3, mul vl\]
++** smstart sm
++** ldr z0, \[sp\]
++** ldr z1, \[sp, #1, mul vl\]
++** ldr z2, \[sp, #2, mul vl\]
++** ldr z3, \[sp, #3, mul vl\]
++** add sp, sp, #?128
++** sub sp, sp, #?128
++** str z0, \[sp\]
++** str z1, \[sp, #1, mul vl\]
++** str z2, \[sp, #2, mul vl\]
++** str z3, \[sp, #3, mul vl\]
++** smstop sm
++** ldr z0, \[sp\]
++** ldr z1, \[sp, #1, mul vl\]
++** ldr z2, \[sp, #2, mul vl\]
++** ldr z3, \[sp, #3, mul vl\]
++** add sp, sp, #?128
++** bl consume_z3
++** ...
++*/
++void
++test_z3 () [[arm::streaming]]
++{
++  svint8x4_t res = produce_z3 ();
++  asm volatile ("");
++  consume_z3 (res);
++}
++
++svbool_t produce_p0 ();
++void consume_p0 (svbool_t);
++
++/*
++** test_p0:
++** ...
++** smstop sm
++** bl produce_p0
++** sub sp, sp, #?32
++** str p0, \[sp\]
++** smstart sm
++** ldr p0, \[sp\]
++** add sp, sp, #?32
++** sub sp, sp, #?32
++** str p0, \[sp\]
++** smstop sm
++** ldr p0, \[sp\]
++** add sp, sp, #?32
++** bl consume_p0
++** ...
++*/
++void
++test_p0 () [[arm::streaming]]
++{
++  svbool_t res = produce_p0 ();
++  asm volatile ("");
++  consume_p0 (res);
++}
+-- 
+2.33.0
+
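The tests above pin down the exact SMSTART/SMSTOP placement and the register save/restore traffic around each mode switch. As a source-level summary of the convention they encode, here is a minimal sketch, assuming a toolchain with this series applied and SME enabled (for instance with -march=armv9-a+sme); the function names are illustrative and do not come from the patch:

/* Sketch: which calls force a PSTATE.SM switch.  */
void n_fn (void);                                /* non-streaming */
void s_fn (void) [[arm::streaming]];             /* always streaming */
void sc_fn (void) [[arm::streaming_compatible]]; /* runs in either mode */

void
streaming_caller (void) [[arm::streaming]]
{
  n_fn ();   /* call is wrapped in SMSTOP SM ... SMSTART SM */
  s_fn ();   /* both sides already streaming: no switch emitted */
  sc_fn ();  /* callee tolerates either mode: no switch emitted */
}

void
compatible_caller (void) [[arm::streaming_compatible]]
{
  /* The prologue records the incoming PSTATE.SM (MRS from SVCR, or a
     call to __arm_sme_state when compiled without SME) at a fixed
     offset from the hard frame pointer; each switch below first tests
     that saved bit, giving the TBZ/TBNZ sequences seen above.  */
  n_fn ();   /* conditional SMSTOP SM before, conditional SMSTART after */
  s_fn ();   /* conditional SMSTART SM before, conditional SMSTOP after */
}

The q/z/p spills that bracket each smstop/smstart in the expected bodies above (most visibly in call_sm_switch_5.c and call_sm_switch_7.c) exist because entering or leaving streaming mode does not preserve SVE register state, so any live vector or predicate values, including SIMD/FP argument and result registers, must be staged through memory or GPRs across the switch.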