diff --git a/0206-Backport-SME-aarch64-Add-support-for-SME-ZA-attribut.patch b/0206-Backport-SME-aarch64-Add-support-for-SME-ZA-attribut.patch
new file mode 100644
index 0000000..f15e7f6
--- /dev/null
+++ b/0206-Backport-SME-aarch64-Add-support-for-SME-ZA-attribut.patch
@@ -0,0 +1,4324 @@
+From 1efd433c779f66440facc8ba5cd23bdbdd6672ba Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 5 Dec 2023 10:11:26 +0000
+Subject: [PATCH 107/157] [Backport][SME] aarch64: Add support for SME ZA
+ attributes
+
+Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=3af9ceb631b741095d8eabd055ff7c23d4a69e6f
+
+SME has an array called ZA that can be enabled and disabled separately
+from streaming mode. A status bit called PSTATE.ZA indicates whether
+ZA is currently enabled or not.
+
+In C and C++, the state of PSTATE.ZA is controlled using function
+attributes. There are four attributes that can be attached to
+function types to indicate that the function shares ZA with its
+caller. These are:
+
+- arm::in("za")
+- arm::out("za")
+- arm::inout("za")
+- arm::preserves("za")
+
+If a function's type has one of these shared-ZA attributes,
+PSTATE.ZA is specified to be 1 on entry to the function and on return
+from the function. Otherwise, the caller and callee have separate
+ZA contexts; they do not use ZA to share data.
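+
+As an illustrative sketch (not part of this patch), the attributes
+attach to function types using the C23/C++11 [[...]] syntax; the
+function names here are invented:
+
+  /* Reads the caller's ZA contents; leaves them unchanged on return.  */
+  void read_za (void) [[arm::in("za")]];
+
+  /* Ignores the incoming contents and produces new ones.  */
+  void write_za (void) [[arm::out("za")]];
+
+  /* Both reads and updates the caller's ZA contents.  */
+  void update_za (void) [[arm::inout("za")]];
+
+  /* Neither reads nor changes the caller's ZA contents.  */
+  void keep_za (void) [[arm::preserves("za")]];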
+
+Although normal non-shared-ZA functions have a separate ZA context
+from their callers, nested uses of ZA are expected to be rare.
+The ABI therefore defines a cooperative lazy saving scheme that
+allows saves and restores of ZA to be kept to a minimum.
+(Callers still have the option of doing a full save and restore
+if they prefer.)
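+
+In outline, the caller side of the handshake looks like this
+(illustrative pseudocode only: setup_tpidr2_block, write_tpidr2_el0
+and read_tpidr2_el0 are invented helpers, while __arm_tpidr2_restore
+is the ABI routine called later in the patch):
+
+  setup_tpidr2_block (&block);    /* buffer address + number of slices */
+  write_tpidr2_el0 (&block);      /* advertise the lazy save */
+  private_za_callee ();
+  if (read_tpidr2_el0 () == 0)
+    /* The callee needed ZA and committed our save; restore from
+       the buffer (with ZA re-enabled first).  */
+    __arm_tpidr2_restore (&block);
+  else
+    /* ZA was left untouched; just cancel the lazy save.  */
+    write_tpidr2_el0 (0);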
+
+Functions that want to use ZA internally have an arm::new("za")
+attribute, which tells the compiler to enable PSTATE.ZA for
+the duration of the function body. It also tells the compiler
+to commit any lazy save initiated by a caller.
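+
+For example (again illustrative rather than taken from the patch):
+
+  [[arm::new("za")]] void totally_private (void)
+  {
+    /* The compiler enables PSTATE.ZA here and commits any lazy save
+       initiated by a caller; callers see an ordinary private-ZA
+       function.  */
+  }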
+
+The patch uses various abstract hard registers to track dataflow
+relating to ZA. See the comments in the patch for details.
+
+The lazy save scheme is intended to be transparent to most normal
+functions, so that they don't need to be recompiled for SME.
+This is reflected in the way that most normal functions ignore
+the new hard registers added in the patch.
+
+As with arm::streaming and arm::streaming_compatible, the attributes are
+also available as __arm_<attr>. This has two advantages: it triggers an
+error on compilers that don't understand the attributes, and it eases
+use in C, where [[...]] attributes were only added in C23.
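+
+For example, with the keyword macros added below, a declaration
+such as:
+
+  void f (void) __arm_inout("za");
+
+expands in C to:
+
+  void f (void) [[__extension__ arm::inout("za")]];
+
+and in C++ to the plain [[arm::inout("za")]] form.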
+
+gcc/
+ * config/aarch64/aarch64-isa-modes.def (ZA_ON): New ISA mode.
+ * config/aarch64/aarch64-protos.h (aarch64_rdsvl_immediate_p)
+ (aarch64_output_rdsvl, aarch64_optimize_mode_switching)
+ (aarch64_restore_za): Declare.
+ * config/aarch64/constraints.md (UsR): New constraint.
+ * config/aarch64/aarch64.md (LOWERING_REGNUM, TPIDR2_BLOCK_REGNUM)
+ (SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM, ZA_FREE_REGNUM)
+ (ZA_SAVED_REGNUM, ZA_REGNUM, FIRST_FAKE_REGNUM): New constants.
+ (LAST_FAKE_REGNUM): Likewise.
+ (UNSPEC_SAVE_NZCV, UNSPEC_RESTORE_NZCV, UNSPEC_SME_VQ): New unspecs.
+ (arches): Add sme.
+ (arch_enabled): Handle it.
+ (*cb<optab><mode>1): Rename to...
+ (aarch64_cb<optab><mode>1): ...this.
+ (*movsi_aarch64): Add an alternative for RDSVL.
+ (*movdi_aarch64): Likewise.
+ (aarch64_save_nzcv, aarch64_restore_nzcv): New insns.
+ * config/aarch64/aarch64-sme.md (UNSPEC_SMSTOP_ZA)
+ (UNSPEC_INITIAL_ZERO_ZA, UNSPEC_TPIDR2_SAVE, UNSPEC_TPIDR2_RESTORE)
+ (UNSPEC_READ_TPIDR2, UNSPEC_WRITE_TPIDR2, UNSPEC_SETUP_LOCAL_TPIDR2)
+ (UNSPEC_RESTORE_ZA, UNSPEC_START_PRIVATE_ZA_CALL): New unspecs.
+ (UNSPEC_END_PRIVATE_ZA_CALL, UNSPEC_COMMIT_LAZY_SAVE): Likewise.
+ (UNSPECV_ASM_UPDATE_ZA): New unspecv.
+ (aarch64_tpidr2_save, aarch64_smstart_za, aarch64_smstop_za)
+ (aarch64_initial_zero_za, aarch64_setup_local_tpidr2)
+ (aarch64_clear_tpidr2, aarch64_write_tpidr2, aarch64_read_tpidr2)
+ (aarch64_tpidr2_restore, aarch64_restore_za, aarch64_asm_update_za)
+ (aarch64_start_private_za_call, aarch64_end_private_za_call)
+ (aarch64_commit_lazy_save): New patterns.
+ * config/aarch64/aarch64.h (AARCH64_ISA_ZA_ON, TARGET_ZA): New macros.
+ (FIXED_REGISTERS, REGISTER_NAMES): Add the new fake ZA registers.
+ (CALL_USED_REGISTERS): Replace with...
+ (CALL_REALLY_USED_REGISTERS): ...this and add the fake ZA registers.
+ (FIRST_PSEUDO_REGISTER): Bump to include the fake ZA registers.
+ (FAKE_REGS): New register class.
+ (REG_CLASS_NAMES): Update accordingly.
+ (REG_CLASS_CONTENTS): Likewise.
+ (machine_function::tpidr2_block): New member variable.
+ (machine_function::tpidr2_block_ptr): Likewise.
+ (machine_function::za_save_buffer): Likewise.
+ (machine_function::next_asm_update_za_id): Likewise.
+ (CUMULATIVE_ARGS::shared_za_flags): Likewise.
+ (aarch64_mode_entity, aarch64_local_sme_state): New enums.
+ (aarch64_tristate_mode): Likewise.
+ (OPTIMIZE_MODE_SWITCHING, NUM_MODES_FOR_MODE_SWITCHING): Define.
+ * config/aarch64/aarch64.cc (AARCH64_STATE_SHARED, AARCH64_STATE_IN)
+ (AARCH64_STATE_OUT): New constants.
+ (aarch64_attribute_shared_state_flags): New function.
+ (aarch64_lookup_shared_state_flags, aarch64_fndecl_has_new_state)
+ (aarch64_check_state_string, cmp_string_csts): Likewise.
+ (aarch64_merge_string_arguments, aarch64_check_arm_new_against_type)
+ (handle_arm_new, handle_arm_shared): Likewise.
+ (handle_arm_new_za_attribute): New.
+ (aarch64_arm_attribute_table): Add new, preserves, in, out, and inout.
+ (aarch64_hard_regno_nregs): Handle FAKE_REGS.
+ (aarch64_hard_regno_mode_ok): Likewise.
+ (aarch64_fntype_shared_flags, aarch64_fntype_pstate_za): New functions.
+ (aarch64_fntype_isa_mode): Include aarch64_fntype_pstate_za.
+ (aarch64_fndecl_has_state, aarch64_fndecl_pstate_za): New functions.
+ (aarch64_fndecl_isa_mode): Include aarch64_fndecl_pstate_za.
+ (aarch64_cfun_incoming_pstate_za, aarch64_cfun_shared_flags)
+ (aarch64_cfun_has_new_state, aarch64_cfun_has_state): New functions.
+ (aarch64_sme_vq_immediate, aarch64_sme_vq_unspec_p): Likewise.
+ (aarch64_rdsvl_immediate_p, aarch64_output_rdsvl): Likewise.
+ (aarch64_expand_mov_immediate): Handle RDSVL immediates.
+ (aarch64_function_arg): Add the ZA sharing flags as a third limb
+ of the PARALLEL.
+ (aarch64_init_cumulative_args): Record the ZA sharing flags.
+ (aarch64_extra_live_on_entry): New function. Handle the new
+ ZA-related fake registers.
+ (aarch64_epilogue_uses): Handle the new ZA-related fake registers.
+ (aarch64_cannot_force_const_mem): Handle UNSPEC_SME_VQ constants.
+ (aarch64_get_tpidr2_block, aarch64_get_tpidr2_ptr): New functions.
+ (aarch64_init_tpidr2_block, aarch64_restore_za): Likewise.
+ (aarch64_layout_frame): Check whether the current function creates
+ new ZA state. Record that it clobbers LR if so.
+ (aarch64_expand_prologue): Handle functions that create new ZA state.
+ (aarch64_expand_epilogue): Likewise.
+ (aarch64_create_tpidr2_block): New function.
+ (aarch64_restore_za): Likewise.
+ (aarch64_start_call_args): Disallow calls to shared-ZA functions
+ from functions that have no ZA state. Emit a marker instruction
+ before calls to private-ZA functions from functions that have
+ SME state.
+ (aarch64_expand_call): Add return registers for state that is
+ managed via attributes. Record the use and clobber information
+ for the ZA registers.
+ (aarch64_end_call_args): New function.
+ (aarch64_regno_regclass): Handle FAKE_REGS.
+ (aarch64_class_max_nregs): Likewise.
+ (aarch64_override_options_internal): Require TARGET_SME for
+ functions that have ZA state.
+ (aarch64_conditional_register_usage): Handle FAKE_REGS.
+ (aarch64_mov_operand_p): Handle RDSVL immediates.
+ (aarch64_comp_type_attributes): Check that the ZA sharing flags
+ are equal.
+ (aarch64_merge_decl_attributes): New function.
+ (aarch64_optimize_mode_switching, aarch64_mode_emit_za_save_buffer)
+ (aarch64_mode_emit_local_sme_state, aarch64_mode_emit): Likewise.
+ (aarch64_insn_references_sme_state_p): Likewise.
+ (aarch64_mode_needed_local_sme_state): Likewise.
+ (aarch64_mode_needed_za_save_buffer, aarch64_mode_needed): Likewise.
+ (aarch64_mode_after_local_sme_state, aarch64_mode_after): Likewise.
+ (aarch64_local_sme_confluence, aarch64_mode_confluence): Likewise.
+ (aarch64_one_shot_backprop, aarch64_local_sme_backprop): Likewise.
+ (aarch64_mode_backprop, aarch64_mode_entry): Likewise.
+ (aarch64_mode_exit, aarch64_mode_eh_handler): Likewise.
+ (aarch64_mode_priority, aarch64_md_asm_adjust): Likewise.
+ (TARGET_END_CALL_ARGS, TARGET_MERGE_DECL_ATTRIBUTES): Define.
+ (TARGET_MODE_EMIT, TARGET_MODE_NEEDED, TARGET_MODE_AFTER): Likewise.
+ (TARGET_MODE_CONFLUENCE, TARGET_MODE_BACKPROP): Likewise.
+ (TARGET_MODE_ENTRY, TARGET_MODE_EXIT): Likewise.
+ (TARGET_MODE_EH_HANDLER, TARGET_MODE_PRIORITY): Likewise.
+ (TARGET_EXTRA_LIVE_ON_ENTRY): Likewise.
+ (TARGET_MD_ASM_ADJUST): Use aarch64_md_asm_adjust.
+ * config/aarch64/aarch64-c.cc (aarch64_define_unconditional_macros):
+ Define __arm_new, __arm_preserves,__arm_in, __arm_out, and __arm_inout.
+
+gcc/testsuite/
+ * gcc.target/aarch64/sme/za_state_1.c: New test.
+ * gcc.target/aarch64/sme/za_state_2.c: Likewise.
+ * gcc.target/aarch64/sme/za_state_3.c: Likewise.
+ * gcc.target/aarch64/sme/za_state_4.c: Likewise.
+ * gcc.target/aarch64/sme/za_state_5.c: Likewise.
+ * gcc.target/aarch64/sme/za_state_6.c: Likewise.
+ * g++.target/aarch64/sme/exceptions_1.C: Likewise.
+ * gcc.target/aarch64/sme/keyword_macros_1.c: Add ZA macros.
+ * g++.target/aarch64/sme/keyword_macros_1.C: Likewise.
+---
+ gcc/config/aarch64/aarch64-c.cc | 32 +
+ gcc/config/aarch64/aarch64-isa-modes.def | 5 +
+ gcc/config/aarch64/aarch64-protos.h | 5 +
+ gcc/config/aarch64/aarch64-sme.md | 287 ++++
+ gcc/config/aarch64/aarch64.cc | 1371 ++++++++++++++++-
+ gcc/config/aarch64/aarch64.h | 98 +-
+ gcc/config/aarch64/aarch64.md | 81 +-
+ gcc/config/aarch64/constraints.md | 6 +
+ .../g++.target/aarch64/sme/exceptions_1.C | 189 +++
+ .../g++.target/aarch64/sme/keyword_macros_1.C | 5 +
+ .../gcc.target/aarch64/sme/keyword_macros_1.c | 5 +
+ .../gcc.target/aarch64/sme/za_state_1.c | 154 ++
+ .../gcc.target/aarch64/sme/za_state_2.c | 73 +
+ .../gcc.target/aarch64/sme/za_state_3.c | 31 +
+ .../gcc.target/aarch64/sme/za_state_4.c | 585 +++++++
+ .../gcc.target/aarch64/sme/za_state_5.c | 595 +++++++
+ .../gcc.target/aarch64/sme/za_state_6.c | 23 +
+ 17 files changed, 3523 insertions(+), 22 deletions(-)
+ create mode 100644 gcc/testsuite/g++.target/aarch64/sme/exceptions_1.C
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/za_state_1.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/za_state_2.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/za_state_3.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/za_state_4.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/za_state_5.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/za_state_6.c
+
+diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
+index 397745fbd..76c20848f 100644
+--- a/gcc/config/aarch64/aarch64-c.cc
++++ b/gcc/config/aarch64/aarch64-c.cc
+@@ -73,6 +73,8 @@ aarch64_define_unconditional_macros (cpp_reader *pfile)
+
+ builtin_define ("__GCC_ASM_FLAG_OUTPUTS__");
+
++ builtin_define ("__ARM_STATE_ZA");
++
+ /* Define keyword attributes like __arm_streaming as macros that expand
+ to the associated [[...]] attribute. Use __extension__ in the attribute
+ for C, since the [[...]] syntax was only added in C23. */
+@@ -86,6 +88,36 @@ aarch64_define_unconditional_macros (cpp_reader *pfile)
+ DEFINE_ARM_KEYWORD_MACRO ("streaming_compatible");
+
+ #undef DEFINE_ARM_KEYWORD_MACRO
++
++ /* Same for the keyword attributes that take arguments. The snag here
++ is that some old modes warn about or reject variadic arguments. */
++ auto *cpp_opts = cpp_get_options (parse_in);
++ if (!cpp_opts->traditional)
++ {
++ auto old_warn_variadic_macros = cpp_opts->warn_variadic_macros;
++ auto old_cpp_warn_c90_c99_compat = cpp_opts->cpp_warn_c90_c99_compat;
++
++ cpp_opts->warn_variadic_macros = false;
++ cpp_opts->cpp_warn_c90_c99_compat = 0;
++
++#define DEFINE_ARM_KEYWORD_MACRO_ARGS(NAME) \
++ builtin_define_with_value ("__arm_" NAME "(...)", \
++ lang_GNU_CXX () \
++ ? "[[arm::" NAME "(__VA_ARGS__)]]" \
++ : "[[__extension__ arm::" NAME \
++ "(__VA_ARGS__)]]", 0);
++
++ DEFINE_ARM_KEYWORD_MACRO_ARGS ("new");
++ DEFINE_ARM_KEYWORD_MACRO_ARGS ("preserves");
++ DEFINE_ARM_KEYWORD_MACRO_ARGS ("in");
++ DEFINE_ARM_KEYWORD_MACRO_ARGS ("out");
++ DEFINE_ARM_KEYWORD_MACRO_ARGS ("inout");
++
++#undef DEFINE_ARM_KEYWORD_MACRO_ARGS
++
++ cpp_opts->warn_variadic_macros = old_warn_variadic_macros;
++ cpp_opts->cpp_warn_c90_c99_compat = old_cpp_warn_c90_c99_compat;
++ }
+ }
+
+ /* Undefine/redefine macros that depend on the current backend state and may
+diff --git a/gcc/config/aarch64/aarch64-isa-modes.def b/gcc/config/aarch64/aarch64-isa-modes.def
+index 5915c98a8..c0ada35bd 100644
+--- a/gcc/config/aarch64/aarch64-isa-modes.def
++++ b/gcc/config/aarch64/aarch64-isa-modes.def
+@@ -32,4 +32,9 @@
+ DEF_AARCH64_ISA_MODE(SM_ON)
+ DEF_AARCH64_ISA_MODE(SM_OFF)
+
++/* Indicates that PSTATE.ZA is known to be 1. The converse is that
++ PSTATE.ZA might be 0 or 1, depending on whether there is an uncommitted
++ lazy save. */
++DEF_AARCH64_ISA_MODE(ZA_ON)
++
+ #undef DEF_AARCH64_ISA_MODE
+diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
+index 737f47026..0883ddd1a 100644
+--- a/gcc/config/aarch64/aarch64-protos.h
++++ b/gcc/config/aarch64/aarch64-protos.h
+@@ -808,6 +808,8 @@ bool aarch64_sve_addvl_addpl_immediate_p (rtx);
+ bool aarch64_sve_vector_inc_dec_immediate_p (rtx);
+ int aarch64_add_offset_temporaries (rtx);
+ void aarch64_split_add_offset (scalar_int_mode, rtx, rtx, rtx, rtx, rtx);
++bool aarch64_rdsvl_immediate_p (const_rtx);
++char *aarch64_output_rdsvl (const_rtx);
+ bool aarch64_mov_operand_p (rtx, machine_mode);
+ rtx aarch64_reverse_mask (machine_mode, unsigned int);
+ bool aarch64_offset_7bit_signed_scaled_p (machine_mode, poly_int64);
+@@ -1083,4 +1085,7 @@ extern bool aarch64_harden_sls_blr_p (void);
+
+ extern void aarch64_output_patchable_area (unsigned int, bool);
+
++bool aarch64_optimize_mode_switching (aarch64_mode_entity);
++void aarch64_restore_za (rtx);
++
+ #endif /* GCC_AARCH64_PROTOS_H */
+diff --git a/gcc/config/aarch64/aarch64-sme.md b/gcc/config/aarch64/aarch64-sme.md
+index 52427b4f1..d4973098e 100644
+--- a/gcc/config/aarch64/aarch64-sme.md
++++ b/gcc/config/aarch64/aarch64-sme.md
+@@ -23,6 +23,7 @@
+ ;; == State management
+ ;; ---- Test current state
+ ;; ---- PSTATE.SM management
++;; ---- PSTATE.ZA management
+
+ ;; =========================================================================
+ ;; == State management
+@@ -169,3 +170,289 @@
+ ""
+ "smstop\tsm"
+ )
++
++;; -------------------------------------------------------------------------
++;; ---- PSTATE.ZA management
++;; -------------------------------------------------------------------------
++;; Includes:
++;; - SMSTART ZA
++;; - SMSTOP ZA
++;; plus calls to support routines.
++;; -------------------------------------------------------------------------
++
++(define_c_enum "unspec" [
++ UNSPEC_SMSTOP_ZA
++ UNSPEC_INITIAL_ZERO_ZA
++ UNSPEC_TPIDR2_SAVE
++ UNSPEC_TPIDR2_RESTORE
++ UNSPEC_READ_TPIDR2
++ UNSPEC_WRITE_TPIDR2
++ UNSPEC_SETUP_LOCAL_TPIDR2
++ UNSPEC_RESTORE_ZA
++ UNSPEC_START_PRIVATE_ZA_CALL
++ UNSPEC_END_PRIVATE_ZA_CALL
++ UNSPEC_COMMIT_LAZY_SAVE
++])
++
++(define_c_enum "unspecv" [
++ UNSPECV_ASM_UPDATE_ZA
++])
++
++;; Use the ABI-defined routine to commit an uncommitted lazy save.
++;; This relies on the current PSTATE.ZA, so depends on SME_STATE_REGNUM.
++;; The fake TPIDR2_SETUP_REGNUM register initially holds the incoming
++;; value of the architected TPIDR2_EL0.
++(define_insn "aarch64_tpidr2_save"
++ [(set (reg:DI ZA_FREE_REGNUM)
++ (unspec:DI [(reg:DI SME_STATE_REGNUM)
++ (reg:DI TPIDR2_SETUP_REGNUM)] UNSPEC_TPIDR2_SAVE))
++ (clobber (reg:DI R14_REGNUM))
++ (clobber (reg:DI R15_REGNUM))
++ (clobber (reg:DI R16_REGNUM))
++ (clobber (reg:DI R17_REGNUM))
++ (clobber (reg:DI R18_REGNUM))
++ (clobber (reg:DI R30_REGNUM))
++ (clobber (reg:CC CC_REGNUM))]
++ ""
++ "bl\t__arm_tpidr2_save"
++)
++
++;; Set PSTATE.ZA to 1. If ZA was previously dormant or active,
++;; it remains in the same state afterwards, with the same contents.
++;; Otherwise, it goes from off to on with zeroed contents.
++;;
++;; Later writes of TPIDR2_EL0 to a nonzero value must not be moved
++;; up past this instruction, since that could create an invalid
++;; combination of having an active lazy save while ZA is off.
++;; Create an anti-dependence by reading the current contents
++;; of TPIDR2_SETUP_REGNUM.
++;;
++;; Making this depend on ZA_FREE_REGNUM ensures that contents belonging
++;; to the caller have already been saved. That isn't necessary for this
++;; instruction itself, since PSTATE.ZA is already 1 if it contains data.
++;; But doing this here means that other uses of ZA can just depend on
++;; SME_STATE_REGNUM, rather than both SME_STATE_REGNUM and ZA_FREE_REGNUM.
++(define_insn "aarch64_smstart_za"
++ [(set (reg:DI SME_STATE_REGNUM)
++ (const_int 1))
++ (use (reg:DI TPIDR2_SETUP_REGNUM))
++ (use (reg:DI ZA_FREE_REGNUM))]
++ ""
++ "smstart\tza"
++)
++
++;; Disable ZA and discard its current contents.
++;;
++;; The ABI says that the ZA save buffer must be null whenever PSTATE.ZA
++;; is zero, so earlier writes to TPIDR2_EL0 must not be moved down past
++;; this instruction. Depend on TPIDR2_SETUP_REGNUM to ensure this.
++;;
++;; We can only turn ZA off once we know that it is free (i.e. doesn't
++;; contain data belonging to the caller). Depend on ZA_FREE_REGNUM
++;; to ensure this.
++;;
++;; We only turn ZA off when the current function's ZA state is dead,
++;; or perhaps if we're sure that the contents are saved. Either way,
++;; we know whether ZA is saved or not.
++(define_insn "aarch64_smstop_za"
++ [(set (reg:DI SME_STATE_REGNUM)
++ (const_int 0))
++ (set (reg:DI ZA_SAVED_REGNUM)
++ (unspec:DI [(reg:DI TPIDR2_SETUP_REGNUM)
++ (reg:DI ZA_FREE_REGNUM)] UNSPEC_SMSTOP_ZA))]
++ ""
++ "smstop\tza"
++)
++
++;; Zero ZA after committing a lazy save. The sequencing is enforced
++;; by reading ZA_FREE_REGNUM.
++(define_insn "aarch64_initial_zero_za"
++ [(set (reg:DI ZA_REGNUM)
++ (unspec:DI [(reg:DI SME_STATE_REGNUM)
++ (reg:DI ZA_FREE_REGNUM)] UNSPEC_INITIAL_ZERO_ZA))]
++ ""
++ "zero\t{ za }"
++)
++
++;; Initialize the abstract TPIDR2_BLOCK_REGNUM from the contents of
++;; the current function's TPIDR2 block. Other instructions can then
++;; depend on TPIDR2_BLOCK_REGNUM rather than on the memory block.
++(define_insn "aarch64_setup_local_tpidr2"
++ [(set (reg:DI TPIDR2_BLOCK_REGNUM)
++ (unspec:DI [(match_operand:V16QI 0 "memory_operand" "m")]
++ UNSPEC_SETUP_LOCAL_TPIDR2))]
++ ""
++ ""
++ [(set_attr "type" "no_insn")]
++)
++
++;; Clear TPIDR2_EL0, cancelling any uncommitted lazy save.
++(define_insn "aarch64_clear_tpidr2"
++ [(set (reg:DI TPIDR2_SETUP_REGNUM)
++ (const_int 0))]
++ ""
++ "msr\ttpidr2_el0, xzr"
++)
++
++;; Point TPIDR2_EL0 to the current function's TPIDR2 block, whose address
++;; is given by operand 0. TPIDR2_BLOCK_REGNUM represents the contents of the
++;; pointed-to block.
++(define_insn "aarch64_write_tpidr2"
++ [(set (reg:DI TPIDR2_SETUP_REGNUM)
++ (unspec:DI [(match_operand 0 "pmode_register_operand" "r")
++ (reg:DI TPIDR2_BLOCK_REGNUM)] UNSPEC_WRITE_TPIDR2))]
++ ""
++ "msr\ttpidr2_el0, %0"
++)
++
++;; Check whether ZA has been saved. The system depends on the value that
+;; we wrote to TPIDR2_EL0 previously, so it depends on TPIDR2_SETUP_REGNUM.
++(define_insn "aarch64_read_tpidr2"
++ [(set (match_operand:DI 0 "register_operand" "=r")
++ (unspec:DI [(reg:DI TPIDR2_SETUP_REGNUM)
++ (reg:DI ZA_SAVED_REGNUM)] UNSPEC_READ_TPIDR2))]
++ ""
++ "mrs\t%0, tpidr2_el0"
++)
++
++;; Use the ABI-defined routine to restore lazy-saved ZA contents
++;; from the TPIDR2 block pointed to by X0. ZA must already be active.
++(define_insn "aarch64_tpidr2_restore"
++ [(set (reg:DI ZA_SAVED_REGNUM)
++ (unspec:DI [(reg:DI R0_REGNUM)] UNSPEC_TPIDR2_RESTORE))
++ (set (reg:DI SME_STATE_REGNUM)
++ (unspec:DI [(reg:DI SME_STATE_REGNUM)] UNSPEC_TPIDR2_RESTORE))
++ (clobber (reg:DI R14_REGNUM))
++ (clobber (reg:DI R15_REGNUM))
++ (clobber (reg:DI R16_REGNUM))
++ (clobber (reg:DI R17_REGNUM))
++ (clobber (reg:DI R18_REGNUM))
++ (clobber (reg:DI R30_REGNUM))
++ (clobber (reg:CC CC_REGNUM))]
++ ""
++ "bl\t__arm_tpidr2_restore"
++)
++
++;; Check whether a lazy save set up by aarch64_save_za was committed
++;; and restore the saved contents if so.
++;;
++;; Operand 0 is the address of the current function's TPIDR2 block.
++(define_insn_and_split "aarch64_restore_za"
++ [(set (reg:DI ZA_SAVED_REGNUM)
++ (unspec:DI [(match_operand 0 "pmode_register_operand" "r")
++ (reg:DI SME_STATE_REGNUM)
++ (reg:DI TPIDR2_SETUP_REGNUM)
++ (reg:DI ZA_SAVED_REGNUM)] UNSPEC_RESTORE_ZA))
++ (clobber (reg:DI R0_REGNUM))
++ (clobber (reg:DI R14_REGNUM))
++ (clobber (reg:DI R15_REGNUM))
++ (clobber (reg:DI R16_REGNUM))
++ (clobber (reg:DI R17_REGNUM))
++ (clobber (reg:DI R18_REGNUM))
++ (clobber (reg:DI R30_REGNUM))
++ (clobber (reg:CC CC_REGNUM))]
++ ""
++ "#"
++ "&& epilogue_completed"
++ [(const_int 0)]
++ {
++ auto label = gen_label_rtx ();
++ auto tpidr2 = gen_rtx_REG (DImode, R16_REGNUM);
++ emit_insn (gen_aarch64_read_tpidr2 (tpidr2));
++ auto jump = emit_likely_jump_insn (gen_aarch64_cbnedi1 (tpidr2, label));
++ JUMP_LABEL (jump) = label;
++
++ aarch64_restore_za (operands[0]);
++ emit_label (label);
++ DONE;
++ }
++)
++
++;; This instruction is emitted after asms that alter ZA, in order to model
++;; the effect on dataflow. The asm itself can't have ZA as an input or
++;; an output, since there is no associated data type. Instead it retains
++;; the original "za" clobber, which on its own would indicate that ZA
++;; is dead.
++;;
++;; The operand is a unique identifier.
++(define_insn "aarch64_asm_update_za"
++ [(set (reg:VNx16QI ZA_REGNUM)
++ (unspec_volatile:VNx16QI
++ [(reg:VNx16QI ZA_REGNUM)
++ (reg:DI SME_STATE_REGNUM)
++ (match_operand 0 "const_int_operand")]
++ UNSPECV_ASM_UPDATE_ZA))]
++ ""
++ ""
++ [(set_attr "type" "no_insn")]
++)
++
++;; This pseudo-instruction is emitted as part of a call to a private-ZA
++;; function from a function with ZA state. It marks a natural place to set
++;; up a lazy save, if that turns out to be necessary. The save itself
++;; is managed by the mode-switching pass.
++(define_insn "aarch64_start_private_za_call"
++ [(set (reg:DI LOWERING_REGNUM)
++ (unspec:DI [(reg:DI LOWERING_REGNUM)] UNSPEC_START_PRIVATE_ZA_CALL))]
++ ""
++ ""
++ [(set_attr "type" "no_insn")]
++)
++
++;; This pseudo-instruction is emitted as part of a call to a private-ZA
++;; function from a function with ZA state. It marks a natural place to restore
++;; the current function's ZA contents from the lazy save buffer, if that
++;; turns out to be necessary. The save itself is managed by the
++;; mode-switching pass.
++(define_insn "aarch64_end_private_za_call"
++ [(set (reg:DI LOWERING_REGNUM)
++ (unspec:DI [(reg:DI LOWERING_REGNUM)] UNSPEC_END_PRIVATE_ZA_CALL))]
++ ""
++ ""
++ [(set_attr "type" "no_insn")]
++)
++
++;; This pseudo-instruction is emitted before a private-ZA function uses
++;; PSTATE.ZA state for the first time. The instruction checks whether
++;; ZA currently contains data belonging to a caller and commits the
++;; lazy save if so.
++;;
++;; Operand 0 is the incoming value of TPIDR2_EL0. Operand 1 is nonzero
++;; if ZA is live, and should therefore be zeroed after committing a save.
++;;
++;; The instruction is generated by the mode-switching pass. It is a
++;; define_insn_and_split rather than a define_expand because of the
++;; internal control flow.
++(define_insn_and_split "aarch64_commit_lazy_save"
++ [(set (reg:DI ZA_FREE_REGNUM)
++ (unspec:DI [(match_operand 0 "pmode_register_operand" "r")
++ (match_operand 1 "const_int_operand")
++ (reg:DI SME_STATE_REGNUM)
++ (reg:DI TPIDR2_SETUP_REGNUM)
++ (reg:VNx16QI ZA_REGNUM)] UNSPEC_COMMIT_LAZY_SAVE))
++ (set (reg:DI ZA_REGNUM)
++ (unspec:DI [(reg:DI SME_STATE_REGNUM)
++ (reg:DI ZA_FREE_REGNUM)] UNSPEC_INITIAL_ZERO_ZA))
++ (clobber (reg:DI R14_REGNUM))
++ (clobber (reg:DI R15_REGNUM))
++ (clobber (reg:DI R16_REGNUM))
++ (clobber (reg:DI R17_REGNUM))
++ (clobber (reg:DI R18_REGNUM))
++ (clobber (reg:DI R30_REGNUM))
++ (clobber (reg:CC CC_REGNUM))]
++ ""
++ "#"
++ "true"
++ [(const_int 0)]
++ {
++ auto label = gen_label_rtx ();
++ auto jump = emit_jump_insn (gen_aarch64_cbeqdi1 (operands[0], label));
++ JUMP_LABEL (jump) = label;
++ emit_insn (gen_aarch64_tpidr2_save ());
++ emit_insn (gen_aarch64_clear_tpidr2 ());
++ if (INTVAL (operands[1]) != 0)
++ emit_insn (gen_aarch64_initial_zero_za ());
++ emit_label (label);
++ DONE;
++ }
++)
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 82f8e574e..a6e996c5b 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -91,6 +91,26 @@
+ /* Defined for convenience. */
+ #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
+
++/* Flags that describe how a function shares certain architectural state
++ with its callers.
++
++ - AARCH64_STATE_SHARED indicates that the function does share the state
++ with callers.
++
++ - AARCH64_STATE_IN indicates that the function reads (or might read) the
++ incoming state. The converse is that the function ignores the incoming
++ state.
++
++ - AARCH64_STATE_OUT indicates that the function returns new state.
++ The converse is that the state on return is the same as it was on entry.
++
++ A function that partially modifies the state treats it as both IN
++ and OUT (because the value on return depends to some extent on the
++ value on input). */
++constexpr auto AARCH64_STATE_SHARED = 1U << 0;
++constexpr auto AARCH64_STATE_IN = 1U << 1;
++constexpr auto AARCH64_STATE_OUT = 1U << 2;
++
+ /* Information about a legitimate vector immediate operand. */
+ struct simd_immediate_info
+ {
+@@ -2959,6 +2979,151 @@ static const struct processor all_cores[] =
+ /* The current tuning set. */
+ struct tune_params aarch64_tune_params = generic_tunings;
+
++/* If NAME is the name of an arm:: attribute that describes shared state,
++ return its associated AARCH64_STATE_* flags, otherwise return 0. */
++static unsigned int
++aarch64_attribute_shared_state_flags (const char *name)
++{
++ if (strcmp (name, "in") == 0)
++ return AARCH64_STATE_SHARED | AARCH64_STATE_IN;
++ if (strcmp (name, "inout") == 0)
++ return AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT;
++ if (strcmp (name, "out") == 0)
++ return AARCH64_STATE_SHARED | AARCH64_STATE_OUT;
++ if (strcmp (name, "preserves") == 0)
++ return AARCH64_STATE_SHARED;
++ return 0;
++}
++
++/* See whether attribute list ATTRS has any sharing information
++ for state STATE_NAME. Return the associated state flags if so,
++ otherwise return 0. */
++static unsigned int
++aarch64_lookup_shared_state_flags (tree attrs, const char *state_name)
++{
++ for (tree attr = attrs; attr; attr = TREE_CHAIN (attr))
++ {
++ if (!cxx11_attribute_p (attr))
++ continue;
++
++ auto ns = IDENTIFIER_POINTER (TREE_PURPOSE (TREE_PURPOSE (attr)));
++ if (strcmp (ns, "arm") != 0)
++ continue;
++
++ auto attr_name = IDENTIFIER_POINTER (TREE_VALUE (TREE_PURPOSE (attr)));
++ auto flags = aarch64_attribute_shared_state_flags (attr_name);
++ if (!flags)
++ continue;
++
++ for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
++ {
++ tree value = TREE_VALUE (arg);
++ if (TREE_CODE (value) == STRING_CST
++ && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
++ return flags;
++ }
++ }
++ return 0;
++}
++
++/* Return true if DECL creates a new scope for state STATE_NAME. */
++static bool
++aarch64_fndecl_has_new_state (const_tree decl, const char *state_name)
++{
++ if (tree attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl)))
++ for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
++ {
++ tree value = TREE_VALUE (arg);
++ if (TREE_CODE (value) == STRING_CST
++ && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
++ return true;
++ }
++ return false;
++}
++
++/* Return true if attribute argument VALUE is a recognized state string,
++ otherwise report an error. NAME is the name of the attribute to which
++ VALUE is being passed. */
++static bool
++aarch64_check_state_string (tree name, tree value)
++{
++ if (TREE_CODE (value) != STRING_CST)
++ {
++ error ("the arguments to %qE must be constant strings", name);
++ return false;
++ }
++
++ const char *state_name = TREE_STRING_POINTER (value);
++ if (strcmp (state_name, "za") != 0)
++ {
++ error ("unrecognized state string %qs", state_name);
++ return false;
++ }
++
++ return true;
++}
++
++/* qsort callback to compare two STRING_CSTs. */
++static int
++cmp_string_csts (const void *a, const void *b)
++{
++ return strcmp (TREE_STRING_POINTER (*(const_tree const *) a),
++ TREE_STRING_POINTER (*(const_tree const *) b));
++}
++
++/* Canonicalize a list of state strings. ARGS contains the arguments to
++ a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
++ of the same type. If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
++ arguments and drop the new attribute. Otherwise, the new attribute must
++ be kept and ARGS must include the information in OLD_ATTR.
++
++ In both cases, the new arguments must be a sorted list of state strings
++ with duplicates removed.
++
++ Return true if new attribute should be kept, false if it should be
++ dropped. */
++static bool
++aarch64_merge_string_arguments (tree args, tree old_attr,
++ bool can_merge_in_place)
++{
++ /* Get a sorted list of all state strings (including duplicates). */
++ auto add_args = [](vec<tree> &strings, const_tree args)
++ {
++ for (const_tree arg = args; arg; arg = TREE_CHAIN (arg))
++ if (TREE_CODE (TREE_VALUE (arg)) == STRING_CST)
++ strings.safe_push (TREE_VALUE (arg));
++ };
++ auto_vec<tree, 16> strings;
++ add_args (strings, args);
++ if (old_attr)
++ add_args (strings, TREE_VALUE (old_attr));
++ strings.qsort (cmp_string_csts);
++
++ /* The list can be empty if there was no previous attribute and if all
++ the new arguments are erroneous. Drop the attribute in that case. */
++ if (strings.is_empty ())
++ return false;
++
++ /* Destructively modify one of the argument lists, removing duplicates
++ on the fly. */
++ bool use_old_attr = old_attr && can_merge_in_place;
++ tree *end = use_old_attr ? &TREE_VALUE (old_attr) : &args;
++ tree prev = NULL_TREE;
++ for (tree arg : strings)
++ {
++ if (prev && simple_cst_equal (arg, prev))
++ continue;
++ prev = arg;
++ if (!*end)
++ *end = tree_cons (NULL_TREE, arg, NULL_TREE);
++ else
++ TREE_VALUE (*end) = arg;
++ end = &TREE_CHAIN (*end);
++ }
++ *end = NULL_TREE;
++ return !use_old_attr;
++}
++
+ /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
+
+ static tree
+@@ -2987,6 +3152,101 @@ handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
+ gcc_unreachable ();
+ }
+
++/* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
++ otherwise report an error. */
++static bool
++aarch64_check_arm_new_against_type (tree args, tree decl)
++{
++ tree type_attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
++ for (tree arg = args; arg; arg = TREE_CHAIN (arg))
++ {
++ tree value = TREE_VALUE (arg);
++ if (TREE_CODE (value) == STRING_CST)
++ {
++ const char *state_name = TREE_STRING_POINTER (value);
++ if (aarch64_lookup_shared_state_flags (type_attrs, state_name))
++ {
++ error_at (DECL_SOURCE_LOCATION (decl),
++ "cannot create a new %qs scope since %qs is shared"
++ " with callers", state_name, state_name);
++ return false;
++ }
++ }
++ }
++ return true;
++}
++
++/* Callback for arm::new attributes. */
++static tree
++handle_arm_new (tree *node, tree name, tree args, int, bool *no_add_attrs)
++{
++ tree decl = *node;
++ if (TREE_CODE (decl) != FUNCTION_DECL)
++ {
++ error ("%qE attribute applies only to function definitions", name);
++ *no_add_attrs = true;
++ return NULL_TREE;
++ }
++ if (TREE_TYPE (decl) == error_mark_node)
++ {
++ *no_add_attrs = true;
++ return NULL_TREE;
++ }
++
++ for (tree arg = args; arg; arg = TREE_CHAIN (arg))
++ aarch64_check_state_string (name, TREE_VALUE (arg));
++
++ if (!aarch64_check_arm_new_against_type (args, decl))
++ {
++ *no_add_attrs = true;
++ return NULL_TREE;
++ }
++
++ /* If there is an old attribute, we should try to update it in-place,
++ so that there is only one (definitive) arm::new attribute on the decl. */
++ tree old_attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl));
++ if (!aarch64_merge_string_arguments (args, old_attr, true))
++ *no_add_attrs = true;
++
++ return NULL_TREE;
++}
++
++/* Callback for arm::{in,out,inout,preserves} attributes. */
++static tree
++handle_arm_shared (tree *node, tree name, tree args,
++ int, bool *no_add_attrs)
++{
++ tree type = *node;
++ tree old_attrs = TYPE_ATTRIBUTES (type);
++ auto flags = aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name));
++ for (tree arg = args; arg; arg = TREE_CHAIN (arg))
++ {
++ tree value = TREE_VALUE (arg);
++ if (aarch64_check_state_string (name, value))
++ {
++ const char *state_name = TREE_STRING_POINTER (value);
++ auto old_flags = aarch64_lookup_shared_state_flags (old_attrs,
++ state_name);
++ if (old_flags && old_flags != flags)
++ {
++ error ("inconsistent attributes for state %qs", state_name);
++ *no_add_attrs = true;
++ return NULL_TREE;
++ }
++ }
++ }
++
++ /* We can't update an old attribute in-place, since types are shared.
++ Instead make sure that this new attribute contains all the
++ information, so that the old attribute becomes redundant. */
++ tree old_attr = lookup_attribute ("arm", IDENTIFIER_POINTER (name),
++ old_attrs);
++ if (!aarch64_merge_string_arguments (args, old_attr, false))
++ *no_add_attrs = true;
++
++ return NULL_TREE;
++}
++
+ /* Mutually-exclusive function type attributes for controlling PSTATE.SM. */
+ static const struct attribute_spec::exclusions attr_streaming_exclusions[] =
+ {
+@@ -3023,6 +3283,16 @@ static const attribute_spec aarch64_arm_attributes[] =
+ NULL, attr_streaming_exclusions },
+ { "streaming_compatible", 0, 0, false, true, true, true,
+ NULL, attr_streaming_exclusions },
++ { "new", 1, -1, true, false, false, false,
++ handle_arm_new, NULL },
++ { "preserves", 1, -1, false, true, true, true,
++ handle_arm_shared, NULL },
++ { "in", 1, -1, false, true, true, true,
++ handle_arm_shared, NULL },
++ { "out", 1, -1, false, true, true, true,
++ handle_arm_shared, NULL },
++ { "inout", 1, -1, false, true, true, true,
++ handle_arm_shared, NULL }
+ };
+
+ static const scoped_attribute_specs aarch64_arm_attribute_table =
+@@ -4202,6 +4472,7 @@ aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
+ case PR_HI_REGS:
+ case FFR_REGS:
+ case PR_AND_FFR_REGS:
++ case FAKE_REGS:
+ return 1;
+ default:
+ return CEIL (lowest_size, UNITS_PER_WORD);
+@@ -4232,6 +4503,10 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
+ if (pr_or_ffr_regnum_p (regno))
+ return false;
+
++ /* These registers are abstract; their modes don't matter. */
++ if (FAKE_REGNUM_P (regno))
++ return true;
++
+ if (regno == SP_REGNUM)
+ /* The purpose of comparing with ptr_mode is to support the
+ global register variable associated with the stack pointer
+@@ -4352,12 +4627,34 @@ aarch64_fntype_pstate_sm (const_tree fntype)
+ return AARCH64_FL_SM_OFF;
+ }
+
++/* Return state flags that describe whether and how functions of type
++ FNTYPE share state STATE_NAME with their callers. */
++
++static unsigned int
++aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
++{
++ return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype),
++ state_name);
++}
++
++/* Return the state of PSTATE.ZA on entry to functions of type FNTYPE. */
++
++static aarch64_feature_flags
++aarch64_fntype_pstate_za (const_tree fntype)
++{
++ if (aarch64_fntype_shared_flags (fntype, "za"))
++ return AARCH64_FL_ZA_ON;
++
++ return 0;
++}
++
+ /* Return the ISA mode on entry to functions of type FNTYPE. */
+
+ static aarch64_feature_flags
+ aarch64_fntype_isa_mode (const_tree fntype)
+ {
+- return aarch64_fntype_pstate_sm (fntype);
++ return (aarch64_fntype_pstate_sm (fntype)
++ | aarch64_fntype_pstate_za (fntype));
+ }
+
+ /* Return the state of PSTATE.SM when compiling the body of
+@@ -4370,13 +4667,37 @@ aarch64_fndecl_pstate_sm (const_tree fndecl)
+ return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
+ }
+
++/* Return true if function FNDECL has state STATE_NAME, either by creating
++ new state itself or by sharing state with callers. */
++
++static bool
++aarch64_fndecl_has_state (tree fndecl, const char *state_name)
++{
++ return (aarch64_fndecl_has_new_state (fndecl, state_name)
++ || aarch64_fntype_shared_flags (TREE_TYPE (fndecl),
++ state_name) != 0);
++}
++
++/* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
++ This might be different from the state of PSTATE.ZA on entry. */
++
++static aarch64_feature_flags
++aarch64_fndecl_pstate_za (const_tree fndecl)
++{
++ if (aarch64_fndecl_has_new_state (fndecl, "za"))
++ return AARCH64_FL_ZA_ON;
++
++ return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
++}
++
+ /* Return the ISA mode that should be used to compile the body of
+ function FNDECL. */
+
+ static aarch64_feature_flags
+ aarch64_fndecl_isa_mode (const_tree fndecl)
+ {
+- return aarch64_fndecl_pstate_sm (fndecl);
++ return (aarch64_fndecl_pstate_sm (fndecl)
++ | aarch64_fndecl_pstate_za (fndecl));
+ }
+
+ /* Return the state of PSTATE.SM on entry to the current function.
+@@ -4389,6 +4710,44 @@ aarch64_cfun_incoming_pstate_sm ()
+ return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
+ }
+
++/* Return the state of PSTATE.ZA on entry to the current function.
++ This might be different from the state of PSTATE.ZA in the function
++ body. */
++
++static aarch64_feature_flags
++aarch64_cfun_incoming_pstate_za ()
++{
++ return aarch64_fntype_pstate_za (TREE_TYPE (cfun->decl));
++}
++
++/* Return state flags that describe whether and how the current function shares
++ state STATE_NAME with callers. */
++
++static unsigned int
++aarch64_cfun_shared_flags (const char *state_name)
++{
++ return aarch64_fntype_shared_flags (TREE_TYPE (cfun->decl), state_name);
++}
++
++/* Return true if the current function creates new state of type STATE_NAME
++ (as opposed to sharing the state with its callers or ignoring the state
++ altogether). */
++
++static bool
++aarch64_cfun_has_new_state (const char *state_name)
++{
++ return aarch64_fndecl_has_new_state (cfun->decl, state_name);
++}
++
++/* Return true if the current function has state STATE_NAME, either by
++ creating new state itself or by sharing state with callers. */
++
++static bool
++aarch64_cfun_has_state (const char *state_name)
++{
++ return aarch64_fndecl_has_state (cfun->decl, state_name);
++}
++
+ /* Return true if a call from the current function to a function with
+ ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
+ the BL instruction. */
+@@ -5952,6 +6311,74 @@ aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
+ factor, nelts_per_vq);
+ }
+
++/* Return a constant that represents FACTOR multiplied by the
++ number of 128-bit quadwords in an SME vector. ISA_MODE is the
++ ISA mode in which the calculation is being performed. */
++
++static rtx
++aarch64_sme_vq_immediate (machine_mode mode, HOST_WIDE_INT factor,
++ aarch64_feature_flags isa_mode)
++{
++ gcc_assert (aarch64_sve_rdvl_factor_p (factor));
++ if (isa_mode & AARCH64_FL_SM_ON)
++ /* We're in streaming mode, so we can use normal poly-int values. */
++ return gen_int_mode ({ factor, factor }, mode);
++
++ rtvec vec = gen_rtvec (1, gen_int_mode (factor, SImode));
++ rtx unspec = gen_rtx_UNSPEC (mode, vec, UNSPEC_SME_VQ);
++ return gen_rtx_CONST (mode, unspec);
++}
++
++/* Return true if X is a constant that represents some number Y
++ multiplied by the number of quadwords in an SME vector. Store this Y
++ in *FACTOR if so. */
++
++static bool
++aarch64_sme_vq_unspec_p (const_rtx x, HOST_WIDE_INT *factor)
++{
++ if (!TARGET_SME || GET_CODE (x) != CONST)
++ return false;
++
++ x = XEXP (x, 0);
++ if (GET_CODE (x) != UNSPEC
++ || XINT (x, 1) != UNSPEC_SME_VQ
++ || XVECLEN (x, 0) != 1)
++ return false;
++
++ x = XVECEXP (x, 0, 0);
++ if (!CONST_INT_P (x))
++ return false;
++
++ *factor = INTVAL (x);
++ return true;
++}
++
++/* Return true if X is a constant that represents some number Y
++ multiplied by the number of quadwords in an SME vector, and if
++ that Y is in the range of RDSVL. */
++
++bool
++aarch64_rdsvl_immediate_p (const_rtx x)
++{
++ HOST_WIDE_INT factor;
++ return (aarch64_sme_vq_unspec_p (x, &factor)
++ && aarch64_sve_rdvl_factor_p (factor));
++}
++
++/* Return the asm string for an RDSVL instruction that calculates X,
++ which is a constant that satisfies aarch64_rdsvl_immediate_p. */
++
++char *
++aarch64_output_rdsvl (const_rtx x)
++{
++ gcc_assert (aarch64_rdsvl_immediate_p (x));
++ static char buffer[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
++ x = XVECEXP (XEXP (x, 0), 0, 0);
++ snprintf (buffer, sizeof (buffer), "rdsvl\t%%x0, #%d",
++ (int) INTVAL (x) / 16);
++ return buffer;
++}
++
+ /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
+
+ static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
+@@ -7717,6 +8144,15 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+ return;
+ }
+
++ if (aarch64_rdsvl_immediate_p (base))
++ {
++ /* We could handle non-constant offsets if they are ever
++ generated. */
++ gcc_assert (const_offset == 0);
++ emit_insn (gen_rtx_SET (dest, imm));
++ return;
++ }
++
+ sty = aarch64_classify_symbol (base, const_offset);
+ switch (sty)
+ {
+@@ -8732,8 +9168,10 @@ aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
+ rtx abi_cookie = aarch64_gen_callee_cookie (pcum->isa_mode,
+ pcum->pcs_variant);
+ rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
+- return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, abi_cookie,
+- sme_mode_switch_args));
++ rtx shared_za_flags = gen_int_mode (pcum->shared_za_flags, SImode);
++ return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (3, abi_cookie,
++ sme_mode_switch_args,
++ shared_za_flags));
+ }
+
+ aarch64_layout_arg (pcum_v, arg);
+@@ -8744,7 +9182,7 @@ void
+ aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
+ const_tree fntype,
+ rtx libname ATTRIBUTE_UNUSED,
+- const_tree fndecl ATTRIBUTE_UNUSED,
++ const_tree fndecl,
+ unsigned n_named ATTRIBUTE_UNUSED,
+ bool silent_p)
+ {
+@@ -8769,6 +9207,8 @@ aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
+ pcum->aapcs_stack_words = 0;
+ pcum->aapcs_stack_size = 0;
+ pcum->silent_p = silent_p;
++ pcum->shared_za_flags
++ = (fntype ? aarch64_fntype_shared_flags (fntype, "za") : 0U);
+ pcum->num_sme_mode_switch_args = 0;
+
+ if (!silent_p
+@@ -10803,14 +11243,31 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+ }
+ }
+
++/* Implement TARGET_EXTRA_LIVE_ON_ENTRY. */
++
++void
++aarch64_extra_live_on_entry (bitmap regs)
++{
++ if (TARGET_ZA)
++ {
++ bitmap_set_bit (regs, LOWERING_REGNUM);
++ bitmap_set_bit (regs, SME_STATE_REGNUM);
++ bitmap_set_bit (regs, TPIDR2_SETUP_REGNUM);
++ bitmap_set_bit (regs, ZA_FREE_REGNUM);
++ bitmap_set_bit (regs, ZA_SAVED_REGNUM);
++
++ /* The only time ZA can't have live contents on entry is when
++ the function explicitly treats it as a pure output. */
++ auto za_flags = aarch64_cfun_shared_flags ("za");
++ if (za_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
++ bitmap_set_bit (regs, ZA_REGNUM);
++ }
++}
++
+ /* Return 1 if the register is used by the epilogue. We need to say the
+ return register is used, but only after epilogue generation is complete.
+ Note that in the case of sibcalls, the values "used by the epilogue" are
+- considered live at the start of the called function.
+-
+- For SIMD functions we need to return 1 for FP registers that are saved and
+- restored by a function but are not zero in call_used_regs. If we do not do
+- this optimizations may remove the restore of the register. */
++ considered live at the start of the called function. */
+
+ int
+ aarch64_epilogue_uses (int regno)
+@@ -10820,6 +11277,18 @@ aarch64_epilogue_uses (int regno)
+ if (regno == LR_REGNUM)
+ return 1;
+ }
++ if (regno == LOWERING_REGNUM && TARGET_ZA)
++ return 1;
++ if (regno == SME_STATE_REGNUM && TARGET_ZA)
++ return 1;
++ if (regno == TPIDR2_SETUP_REGNUM && TARGET_ZA)
++ return 1;
++ /* If the function shares SME state with its caller, ensure that that
++ data is not in the lazy save buffer on exit. */
++ if (regno == ZA_SAVED_REGNUM && aarch64_cfun_incoming_pstate_za () != 0)
++ return 1;
++ if (regno == ZA_REGNUM && aarch64_cfun_shared_flags ("za") != 0)
++ return 1;
+ return 0;
+ }
+
+@@ -11501,8 +11970,10 @@ aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
+
+ /* There's no way to calculate VL-based values using relocations. */
+ subrtx_iterator::array_type array;
++ HOST_WIDE_INT factor;
+ FOR_EACH_SUBRTX (iter, array, x, ALL)
+- if (GET_CODE (*iter) == CONST_POLY_INT)
++ if (GET_CODE (*iter) == CONST_POLY_INT
++ || aarch64_sme_vq_unspec_p (*iter, &factor))
+ return true;
+
+ poly_int64 offset;
+@@ -12364,6 +12835,72 @@ aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
+ return true;
+ }
+
++/* Return a fresh memory reference to the current function's TPIDR2 block,
++ creating a block if necessary. */
++
++static rtx
++aarch64_get_tpidr2_block ()
++{
++ if (!cfun->machine->tpidr2_block)
++ /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
++ boundary. */
++ cfun->machine->tpidr2_block = assign_stack_local (V16QImode, 16, 128);
++ return copy_rtx (cfun->machine->tpidr2_block);
++}
++
++/* Return a fresh register that points to the current function's
++ TPIDR2 block, creating a block if necessary. */
++
++static rtx
++aarch64_get_tpidr2_ptr ()
++{
++ rtx block = aarch64_get_tpidr2_block ();
++ return force_reg (Pmode, XEXP (block, 0));
++}
++
++/* Emit instructions to allocate a ZA lazy save buffer and initialize the
++ current function's TPIDR2 block. */
++
++static void
++aarch64_init_tpidr2_block ()
++{
++ rtx block = aarch64_get_tpidr2_block ();
++
++ /* The ZA save buffer is SVL.B*SVL.B bytes in size. */
++ rtx svl_bytes = aarch64_sme_vq_immediate (Pmode, 16, AARCH64_ISA_MODE);
++ rtx svl_bytes_reg = force_reg (DImode, svl_bytes);
++ rtx za_size = expand_simple_binop (Pmode, MULT, svl_bytes_reg,
++ svl_bytes_reg, NULL, 0, OPTAB_LIB_WIDEN);
++ rtx za_save_buffer = allocate_dynamic_stack_space (za_size, 128,
++ BITS_PER_UNIT, -1, true);
++ za_save_buffer = force_reg (Pmode, za_save_buffer);
++ cfun->machine->za_save_buffer = za_save_buffer;
++
++ /* The first word of the block points to the save buffer and the second
++ word is the number of ZA slices to save. */
++ rtx block_0 = adjust_address (block, DImode, 0);
++ rtx block_8 = adjust_address (block, DImode, 8);
++ emit_insn (gen_store_pair_dw_didi (block_0, za_save_buffer,
++ block_8, svl_bytes_reg));
++
++ if (!memory_operand (block, V16QImode))
++ block = replace_equiv_address (block, force_reg (Pmode, XEXP (block, 0)));
++ emit_insn (gen_aarch64_setup_local_tpidr2 (block));
++}
++
++/* Restore the contents of ZA from the lazy save buffer, given that
++ register TPIDR2_BLOCK points to the current function's TPIDR2 block.
++ PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null. */
++
++void
++aarch64_restore_za (rtx tpidr2_block)
++{
++ emit_insn (gen_aarch64_smstart_za ());
++ if (REGNO (tpidr2_block) != R0_REGNUM)
++ emit_move_insn (gen_rtx_REG (Pmode, R0_REGNUM), tpidr2_block);
++ emit_insn (gen_aarch64_tpidr2_restore ());
++}
++
+ /* Implement TARGET_START_CALL_ARGS. */
+
+ static void
+@@ -12379,6 +12916,20 @@ aarch64_start_call_args (cumulative_args_t ca_v)
+ " option %<-march%>, or by using the %<target%>"
+ " attribute or pragma", "sme");
+ }
++
++ if ((ca->shared_za_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
++ && !aarch64_cfun_has_state ("za"))
++ error ("call to a function that shares %qs state from a function"
++ " that has no %qs state", "za", "za");
++ else if (!TARGET_ZA && (ca->isa_mode & AARCH64_FL_ZA_ON))
++ error ("call to a function that shares SME state from a function"
++ " that has no SME state");
++
++ /* If this is a call to a private ZA function, emit a marker to
++ indicate where any necessary set-up code could be inserted.
++ The code itself is inserted by the mode-switching pass. */
++ if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
++ emit_insn (gen_aarch64_start_private_za_call ());
+ }
+
+ /* This function is used by the call expanders of the machine description.
+@@ -12391,6 +12942,8 @@ aarch64_start_call_args (cumulative_args_t ca_v)
+ The second element is a PARALLEL that lists all the argument
+ registers that need to be saved and restored around a change
+ in PSTATE.SM, or const0_rtx if no such switch is needed.
++ The third element is a const_int that contains the sharing flags
++ for ZA.
+ SIBCALL indicates whether this function call is normal call or sibling call.
+ It will generate different pattern accordingly. */
+
+@@ -12403,10 +12956,12 @@ aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
+
+ rtx callee_abi = cookie;
+ rtx sme_mode_switch_args = const0_rtx;
++ unsigned int shared_za_flags = 0;
+ if (GET_CODE (cookie) == PARALLEL)
+ {
+ callee_abi = XVECEXP (cookie, 0, 0);
+ sme_mode_switch_args = XVECEXP (cookie, 0, 1);
++ shared_za_flags = INTVAL (XVECEXP (cookie, 0, 2));
+ }
+
+ gcc_assert (CONST_INT_P (callee_abi));
+@@ -12426,6 +12981,41 @@ aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
+ : !REG_P (callee))
+ XEXP (mem, 0) = force_reg (mode, callee);
+
++ /* Accumulate the return values, including state that is shared via
++ attributes. */
++ auto_vec<rtx, 8> return_values;
++ if (result)
++ {
++ if (GET_CODE (result) == PARALLEL)
++ for (int i = 0; i < XVECLEN (result, 0); ++i)
++ return_values.safe_push (XVECEXP (result, 0, i));
++ else
++ return_values.safe_push (result);
++ }
++ unsigned int orig_num_return_values = return_values.length ();
++ if (shared_za_flags & AARCH64_STATE_OUT)
++ return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_REGNUM));
++ /* When calling private-ZA functions from functions with ZA state,
++ we want to know whether the call committed a lazy save. */
++ if (TARGET_ZA && !shared_za_flags)
++ return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
++
++ /* Create the new return value, if necessary. */
++ if (orig_num_return_values != return_values.length ())
++ {
++ if (return_values.length () == 1)
++ result = return_values[0];
++ else
++ {
++ for (rtx &x : return_values)
++ if (GET_CODE (x) != EXPR_LIST)
++ x = gen_rtx_EXPR_LIST (VOIDmode, x, const0_rtx);
++ rtvec v = gen_rtvec_v (return_values.length (),
++ return_values.address ());
++ result = gen_rtx_PARALLEL (VOIDmode, v);
++ }
++ }
++
+ call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
+
+ if (result != NULL_RTX)
+@@ -12492,6 +13082,50 @@ aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
+
+ cfun->machine->call_switches_pstate_sm = true;
+ }
++
++ /* Add any ZA-related information.
++ ZA_REGNUM represents the current function's ZA state, rather than
++ the contents of the ZA register itself. We ensure that the function's
++ ZA state is preserved by private-ZA call sequences, so the call itself
++ does not use or clobber ZA_REGNUM. */
++ if (TARGET_ZA)
++ {
++ /* The callee requires ZA to be active if the callee is shared-ZA,
++ otherwise it requires ZA to be dormant or off. The state of ZA is
++ captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
++ and ZA_SAVED_REGNUM. */
++ use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
++ gen_rtx_REG (DImode, SME_STATE_REGNUM));
++ use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
++ gen_rtx_REG (DImode, TPIDR2_SETUP_REGNUM));
++ use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
++ gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
++
++ /* Keep the aarch64_start/end_private_za_call markers live. */
++ if (!(callee_isa_mode & AARCH64_FL_ZA_ON))
++ use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
++ gen_rtx_REG (VNx16BImode, LOWERING_REGNUM));
++
++ /* If the callee is a shared-ZA function, record whether it uses the
++ current value of ZA. */
++ if (shared_za_flags & AARCH64_STATE_IN)
++ use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
++ gen_rtx_REG (VNx16BImode, ZA_REGNUM));
++ }
++}
++
++/* Implement TARGET_END_CALL_ARGS. */
++
++static void
++aarch64_end_call_args (cumulative_args_t ca_v)
++{
++ CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
++
++ /* If this is a call to a private ZA function, emit a marker to
++ indicate where any necessary restoration code could be inserted.
++ The code itself is inserted by the mode-switching pass. */
++ if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
++ emit_insn (gen_aarch64_end_private_za_call ());
+ }
+
+ /* Emit call insn with PAT and do aarch64-specific handling. */
+@@ -13602,6 +14236,9 @@ aarch64_regno_regclass (unsigned regno)
+ if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
+ return FFR_REGS;
+
++ if (FAKE_REGNUM_P (regno))
++ return FAKE_REGS;
++
+ return NO_REGS;
+ }
+
+@@ -13957,12 +14594,14 @@ aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
+ return (vec_flags & VEC_ADVSIMD
+ ? CEIL (lowest_size, UNITS_PER_VREG)
+ : CEIL (lowest_size, UNITS_PER_WORD));
++
+ case STACK_REG:
+ case PR_REGS:
+ case PR_LO_REGS:
+ case PR_HI_REGS:
+ case FFR_REGS:
+ case PR_AND_FFR_REGS:
++ case FAKE_REGS:
+ return 1;
+
+ case NO_REGS:
+@@ -19002,10 +19641,14 @@ aarch64_override_options_internal (struct gcc_options *opts)
+ && !fixed_regs[R18_REGNUM])
+ error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
+
+- if ((opts->x_aarch64_isa_flags & AARCH64_FL_SM_ON)
++ if ((opts->x_aarch64_isa_flags & (AARCH64_FL_SM_ON | AARCH64_FL_ZA_ON))
+ && !(opts->x_aarch64_isa_flags & AARCH64_FL_SME))
+ {
+- error ("streaming functions require the ISA extension %qs", "sme");
++ if (opts->x_aarch64_isa_flags & AARCH64_FL_SM_ON)
++ error ("streaming functions require the ISA extension %qs", "sme");
++ else
++ error ("functions with SME state require the ISA extension %qs",
++ "sme");
+ inform (input_location, "you can enable %qs using the command-line"
+ " option %<-march%>, or by using the %<target%>"
+ " attribute or pragma", "sme");
+@@ -21341,6 +21984,8 @@ aarch64_conditional_register_usage (void)
+ CLEAR_HARD_REG_BIT (operand_reg_set, VG_REGNUM);
+ CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
+ CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
++ for (int i = FIRST_FAKE_REGNUM; i <= LAST_FAKE_REGNUM; ++i)
++ CLEAR_HARD_REG_BIT (operand_reg_set, i);
+
+ /* When tracking speculation, we need a couple of call-clobbered registers
+ to track the speculation state. It would be nice to just use
+@@ -22795,6 +23440,9 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
+ || aarch64_sve_rdvl_immediate_p (x)))
+ return true;
+
++ if (aarch64_rdsvl_immediate_p (x))
++ return true;
++
+ return aarch64_classify_symbolic_expression (x)
+ == SYMBOL_TINY_ABSOLUTE;
+ }
+@@ -28266,9 +28914,45 @@ aarch64_comp_type_attributes (const_tree type1, const_tree type2)
+ return 0;
+ if (!check_attr ("arm", "streaming_compatible"))
+ return 0;
++ if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "za")
++ != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "za"))
++ return 0;
+ return 1;
+ }
+
++/* Implement TARGET_MERGE_DECL_ATTRIBUTES. */
++
++static tree
++aarch64_merge_decl_attributes (tree olddecl, tree newdecl)
++{
++ tree old_attrs = DECL_ATTRIBUTES (olddecl);
++ tree old_new = lookup_attribute ("arm", "new", old_attrs);
++
++ tree new_attrs = DECL_ATTRIBUTES (newdecl);
++ tree new_new = lookup_attribute ("arm", "new", new_attrs);
++
++ if (DECL_INITIAL (olddecl) && new_new)
++ {
++ error ("cannot apply attribute %qs to %q+D after the function"
++ " has been defined", "new", newdecl);
++ inform (DECL_SOURCE_LOCATION (olddecl), "%q+D defined here",
++ newdecl);
++ }
++ else
++ {
++ if (old_new && new_new)
++ {
++ old_attrs = remove_attribute ("arm", "new", old_attrs);
++ TREE_VALUE (new_new) = chainon (TREE_VALUE (new_new),
++ TREE_VALUE (old_new));
++ }
++ if (new_new)
++ aarch64_check_arm_new_against_type (TREE_VALUE (new_new), newdecl);
++ }
++
++ return merge_attributes (old_attrs, new_attrs);
++}
++
+ /* Implement TARGET_GET_MULTILIB_ABI_NAME */
+
+ static const char *
+@@ -28634,6 +29318,629 @@ aarch64_indirect_call_asm (rtx addr)
+ return "";
+ }
+
++/* Implement OPTIMIZE_MODE_SWITCHING. */
++
++bool
++aarch64_optimize_mode_switching (aarch64_mode_entity entity)
++{
++ bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
++ || (aarch64_cfun_has_new_state ("za")
++ && df_regs_ever_live_p (ZA_REGNUM)));
++
++ if (have_sme_state && nonlocal_goto_handler_labels)
++ {
++ static bool reported;
++ if (!reported)
++ {
++ sorry ("non-local gotos in functions with SME state");
++ reported = true;
++ }
++ }
++
++ switch (entity)
++ {
++ case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
++ case aarch64_mode_entity::LOCAL_SME_STATE:
++ return have_sme_state && !nonlocal_goto_handler_labels;
++ }
++ gcc_unreachable ();
++}
++
++/* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER. */
++
++static void
++aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode,
++ aarch64_tristate_mode prev_mode)
++{
++ if (mode == aarch64_tristate_mode::YES)
++ {
++ gcc_assert (prev_mode == aarch64_tristate_mode::NO);
++ aarch64_init_tpidr2_block ();
++ }
++ else
++ gcc_unreachable ();
++}
++
++/* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
++
++static void
++aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
++ aarch64_local_sme_state prev_mode)
++{
++ /* Back-propagation should ensure that we're always starting from
++ a known mode. */
++ gcc_assert (prev_mode != aarch64_local_sme_state::ANY);
++
++ if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
++ {
++ /* Commit any uncommitted lazy save. This leaves ZA either active
++ and zero (lazy save case) or off (normal case).
++
++ The sequence is:
++
++ mrs <temp>, tpidr2_el0
++ cbz <temp>, no_save
++ bl __arm_tpidr2_save
++ msr tpidr2_el0, xzr
++ zero { za } // Only if ZA is live
++ no_save: */
++ bool is_active = (mode == aarch64_local_sme_state::ACTIVE_LIVE
++ || mode == aarch64_local_sme_state::ACTIVE_DEAD);
++ auto tmp_reg = gen_reg_rtx (DImode);
++ auto active_flag = gen_int_mode (is_active, DImode);
++ emit_insn (gen_aarch64_read_tpidr2 (tmp_reg));
++ emit_insn (gen_aarch64_commit_lazy_save (tmp_reg, active_flag));
++ }
++
++ if (mode == aarch64_local_sme_state::ACTIVE_LIVE
++ || mode == aarch64_local_sme_state::ACTIVE_DEAD)
++ {
++ if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
++ {
++ /* Make ZA active after being inactive.
++
++ First handle the case in which the lazy save we set up was
++ committed by a callee. If the function's source-level ZA state
++ is live then we must conditionally restore it from the lazy
++ save buffer. Otherwise we can just force PSTATE.ZA to 1. */
++ if (mode == aarch64_local_sme_state::ACTIVE_LIVE)
++ emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
++ else
++ emit_insn (gen_aarch64_smstart_za ());
++
++ /* Now handle the case in which the lazy save was not committed.
++ In that case, ZA still contains the current function's ZA state,
++ and we just need to cancel the lazy save. */
++ emit_insn (gen_aarch64_clear_tpidr2 ());
++ return;
++ }
++
++ if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL)
++ {
++ /* Retrieve the current function's ZA state from the lazy save
++ buffer. */
++ aarch64_restore_za (aarch64_get_tpidr2_ptr ());
++ return;
++ }
++
++ if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER
++ || prev_mode == aarch64_local_sme_state::OFF)
++ {
++ /* INACTIVE_CALLER means that we are enabling ZA for the first
++ time in this function. The code above means that ZA is either
++ active and zero (if we committed a lazy save) or off. Handle
++ the latter case by forcing ZA on.
++
++ OFF means that PSTATE.ZA is guaranteed to be 0. We just need
++ to force it to 1.
++
++ Both cases leave ZA zeroed. */
++ emit_insn (gen_aarch64_smstart_za ());
++ return;
++ }
++
++ if (prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
++ || prev_mode == aarch64_local_sme_state::ACTIVE_LIVE)
++ /* A simple change in liveness, such as in a CFG structure where
++ ZA is only conditionally defined. No code is needed. */
++ return;
++
++ gcc_unreachable ();
++ }
++
++ if (mode == aarch64_local_sme_state::INACTIVE_LOCAL)
++ {
++ if (prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
++ || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
++ || prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
++ {
++ /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
++ case of setting up a lazy save buffer before a call.
++ A transition from INACTIVE_CALLER is similar, except that
++ the contents of ZA are known to be zero.
++
++ A transition from ACTIVE_DEAD means that ZA is live at the
++ point of the transition, but is dead on at least one incoming
++ edge. (That is, ZA is only conditionally initialized.)
++ For efficiency, we want to set up a lazy save even for
++ dead contents, since forcing ZA off would make later code
++ restore ZA from the lazy save buffer. */
++ emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
++ return;
++ }
++
++ if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL
++ || prev_mode == aarch64_local_sme_state::OFF)
++ /* We're simply discarding the information about which inactive
++ state applies. */
++ return;
++
++ gcc_unreachable ();
++ }
++
++ if (mode == aarch64_local_sme_state::INACTIVE_CALLER
++ || mode == aarch64_local_sme_state::OFF)
++ {
++ /* The transition to INACTIVE_CALLER is used before returning from
++ new("za") functions. Any state in ZA belongs to the current
++ function rather than a caller, but that state is no longer
++ needed. Clear any pending lazy save and turn ZA off.
++
++ The transition to OFF is used before calling a private-ZA function.
++ We committed any incoming lazy save above, so at this point any
++ contents in ZA belong to the current function. */
++ if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
++ emit_insn (gen_aarch64_clear_tpidr2 ());
++
++ if (prev_mode != aarch64_local_sme_state::OFF
++ && prev_mode != aarch64_local_sme_state::SAVED_LOCAL)
++ emit_insn (gen_aarch64_smstop_za ());
++
++ return;
++ }
++
++ if (mode == aarch64_local_sme_state::SAVED_LOCAL)
++ {
++ /* This is a transition to an exception handler. */
++ gcc_assert (prev_mode == aarch64_local_sme_state::OFF
++ || prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL);
++ return;
++ }
++
++ gcc_unreachable ();
++}
++
++/* Implement TARGET_MODE_EMIT. */
++
++static void
++aarch64_mode_emit (int entity, int mode, int prev_mode, HARD_REG_SET live)
++{
++ if (mode == prev_mode)
++ return;
++
++ start_sequence ();
++ switch (aarch64_mode_entity (entity))
++ {
++ case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
++ aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode),
++ aarch64_tristate_mode (prev_mode));
++ break;
++
++ case aarch64_mode_entity::LOCAL_SME_STATE:
++ aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode),
++ aarch64_local_sme_state (prev_mode));
++ break;
++ }
++ rtx_insn *seq = get_insns ();
++ end_sequence ();
++
++ /* Get the set of clobbered registers that are currently live. */
++ HARD_REG_SET clobbers = {};
++ for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
++ {
++ vec_rtx_properties properties;
++ properties.add_insn (insn, false);
++ for (rtx_obj_reference ref : properties.refs ())
++ if (ref.is_write () && HARD_REGISTER_NUM_P (ref.regno))
++ SET_HARD_REG_BIT (clobbers, ref.regno);
++ }
++ clobbers &= live;
++
++ /* Emit instructions to save clobbered registers to pseudos. Queue
++ instructions to restore the registers afterwards.
++
++ This should only be needed in rare situations. */
++ auto_vec<rtx, 33> after;
++ for (unsigned int regno = R0_REGNUM; regno < R30_REGNUM; ++regno)
++ if (TEST_HARD_REG_BIT (clobbers, regno))
++ {
++ rtx hard_reg = gen_rtx_REG (DImode, regno);
++ rtx pseudo_reg = gen_reg_rtx (DImode);
++ emit_move_insn (pseudo_reg, hard_reg);
++ after.quick_push (gen_move_insn (hard_reg, pseudo_reg));
++ }
++ if (TEST_HARD_REG_BIT (clobbers, CC_REGNUM))
++ {
++ rtx pseudo_reg = gen_reg_rtx (DImode);
++ emit_insn (gen_aarch64_save_nzcv (pseudo_reg));
++ after.quick_push (gen_aarch64_restore_nzcv (pseudo_reg));
++ }
++
++ /* Emit the transition instructions themselves. */
++ emit_insn (seq);
++
++ /* Restore the clobbered registers. */
++ for (auto *insn : after)
++ emit_insn (insn);
++}
++
++/* Return true if INSN references the SME state represented by hard register
++ REGNO. */
++
++static bool
++aarch64_insn_references_sme_state_p (rtx_insn *insn, unsigned int regno)
++{
++ df_ref ref;
++ FOR_EACH_INSN_DEF (ref, insn)
++ if (!DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
++ && DF_REF_REGNO (ref) == regno)
++ return true;
++ FOR_EACH_INSN_USE (ref, insn)
++ if (DF_REF_REGNO (ref) == regno)
++ return true;
++ return false;
++}
++
++/* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
++
++static aarch64_local_sme_state
++aarch64_mode_needed_local_sme_state (rtx_insn *insn, HARD_REG_SET live)
++{
++ if (!CALL_P (insn)
++ && find_reg_note (insn, REG_EH_REGION, NULL_RTX))
++ {
++ static bool reported;
++ if (!reported)
++ {
++ sorry ("catching non-call exceptions in functions with SME state");
++ reported = true;
++ }
++ /* Aim for graceful error recovery by picking the value that is
++ least likely to generate an ICE. */
++ return aarch64_local_sme_state::INACTIVE_LOCAL;
++ }
++
++ /* A non-local goto is equivalent to a return. We disallow non-local
++ receivers in functions with SME state, so we know that the target
++ expects ZA to be dormant or off. */
++ if (JUMP_P (insn)
++ && find_reg_note (insn, REG_NON_LOCAL_GOTO, NULL_RTX))
++ return aarch64_local_sme_state::INACTIVE_CALLER;
++
++ /* start_private_za_call and end_private_za_call bracket a sequence
++ that calls a private-ZA function. Force ZA to be turned off if the
++ function doesn't have any live ZA state, otherwise require ZA to be
++ inactive. */
++ auto icode = recog_memoized (insn);
++ if (icode == CODE_FOR_aarch64_start_private_za_call
++ || icode == CODE_FOR_aarch64_end_private_za_call)
++ return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
++ ? aarch64_local_sme_state::INACTIVE_LOCAL
++ : aarch64_local_sme_state::OFF);
++
++ /* Force ZA to contain the current function's ZA state if INSN wants
++ to access it. */
++ if (aarch64_insn_references_sme_state_p (insn, ZA_REGNUM))
++ return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
++ ? aarch64_local_sme_state::ACTIVE_LIVE
++ : aarch64_local_sme_state::ACTIVE_DEAD);
++
++ return aarch64_local_sme_state::ANY;
++}
++
++/* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER. */
++
++static aarch64_tristate_mode
++aarch64_mode_needed_za_save_buffer (rtx_insn *insn, HARD_REG_SET live)
++{
++ /* We need to set up a lazy save buffer no later than the first
++ transition to INACTIVE_LOCAL (which involves setting up a lazy save). */
++ if (aarch64_mode_needed_local_sme_state (insn, live)
++ == aarch64_local_sme_state::INACTIVE_LOCAL)
++ return aarch64_tristate_mode::YES;
++
++ /* Also make sure that the lazy save buffer is set up before the first
++ insn that throws internally. The exception handler will sometimes
++ load from it. */
++ if (find_reg_note (insn, REG_EH_REGION, NULL_RTX))
++ return aarch64_tristate_mode::YES;
++
++ return aarch64_tristate_mode::MAYBE;
++}
++
++/* Implement TARGET_MODE_NEEDED. */
++
++static int
++aarch64_mode_needed (int entity, rtx_insn *insn, HARD_REG_SET live)
++{
++ switch (aarch64_mode_entity (entity))
++ {
++ case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
++ return int (aarch64_mode_needed_za_save_buffer (insn, live));
++
++ case aarch64_mode_entity::LOCAL_SME_STATE:
++ return int (aarch64_mode_needed_local_sme_state (insn, live));
++ }
++ gcc_unreachable ();
++}
++
++/* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE. */
++
++static aarch64_local_sme_state
++aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode,
++ HARD_REG_SET live)
++{
++ /* Note places where ZA dies, so that we can try to avoid saving and
++ restoring state that isn't needed. */
++ if (mode == aarch64_local_sme_state::ACTIVE_LIVE
++ && !TEST_HARD_REG_BIT (live, ZA_REGNUM))
++ return aarch64_local_sme_state::ACTIVE_DEAD;
++
++ /* Note where ZA is born, e.g. when moving past an __arm_out("za")
++ function. */
++ if (mode == aarch64_local_sme_state::ACTIVE_DEAD
++ && TEST_HARD_REG_BIT (live, ZA_REGNUM))
++ return aarch64_local_sme_state::ACTIVE_LIVE;
++
++ return mode;
++}
++
++/* Implement TARGET_MODE_AFTER. */
++
++static int
++aarch64_mode_after (int entity, int mode, rtx_insn *, HARD_REG_SET live)
++{
++ switch (aarch64_mode_entity (entity))
++ {
++ case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
++ return mode;
++
++ case aarch64_mode_entity::LOCAL_SME_STATE:
++ return int (aarch64_mode_after_local_sme_state
++ (aarch64_local_sme_state (mode), live));
++ }
++ gcc_unreachable ();
++}
++
++/* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE. */
++
++static aarch64_local_sme_state
++aarch64_local_sme_confluence (aarch64_local_sme_state mode1,
++ aarch64_local_sme_state mode2)
++{
++ /* Perform a symmetrical check for two values. */
++ auto is_pair = [&](aarch64_local_sme_state val1,
++ aarch64_local_sme_state val2)
++ {
++ return ((mode1 == val1 && mode2 == val2)
++ || (mode1 == val2 && mode2 == val1));
++ };
++
++ /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
++ to a caller. OFF is one of the options. */
++ if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER,
++ aarch64_local_sme_state::OFF))
++ return aarch64_local_sme_state::INACTIVE_CALLER;
++
++ /* Similarly for dormant contents belonging to the current function. */
++ if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL,
++ aarch64_local_sme_state::OFF))
++ return aarch64_local_sme_state::INACTIVE_LOCAL;
++
++ /* Treat a conditionally-initialized value as a fully-initialized value. */
++ if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE,
++ aarch64_local_sme_state::ACTIVE_DEAD))
++ return aarch64_local_sme_state::ACTIVE_LIVE;
++
++ return aarch64_local_sme_state::ANY;
++}
++
++/* Implement TARGET_MODE_CONFLUENCE. */
++
++static int
++aarch64_mode_confluence (int entity, int mode1, int mode2)
++{
++ gcc_assert (mode1 != mode2);
++ switch (aarch64_mode_entity (entity))
++ {
++ case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
++ return int (aarch64_tristate_mode::MAYBE);
++
++ case aarch64_mode_entity::LOCAL_SME_STATE:
++ return int (aarch64_local_sme_confluence
++ (aarch64_local_sme_state (mode1),
++ aarch64_local_sme_state (mode2)));
++ }
++ gcc_unreachable ();
++}
++
++/* Implement TARGET_MODE_BACKPROP for an entity that either stays
++ NO throughout, or makes one transition from NO to YES. */
++
++static aarch64_tristate_mode
++aarch64_one_shot_backprop (aarch64_tristate_mode mode1,
++ aarch64_tristate_mode mode2)
++{
++ /* Keep bringing the transition forward until it starts from NO. */
++ if (mode1 == aarch64_tristate_mode::MAYBE
++ && mode2 == aarch64_tristate_mode::YES)
++ return mode2;
++
++ return aarch64_tristate_mode::MAYBE;
++}
++
++/* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE. */
++
++static aarch64_local_sme_state
++aarch64_local_sme_backprop (aarch64_local_sme_state mode1,
++ aarch64_local_sme_state mode2)
++{
++ /* We always need to know what the current state is when transitioning
++ to a new state. Force any location with indeterminate starting state
++ to be active. */
++ if (mode1 == aarch64_local_sme_state::ANY)
++ switch (mode2)
++ {
++ case aarch64_local_sme_state::INACTIVE_CALLER:
++ case aarch64_local_sme_state::OFF:
++ case aarch64_local_sme_state::ACTIVE_DEAD:
++ /* The current function's ZA state is not live. */
++ return aarch64_local_sme_state::ACTIVE_DEAD;
++
++ case aarch64_local_sme_state::INACTIVE_LOCAL:
++ case aarch64_local_sme_state::ACTIVE_LIVE:
++ /* The current function's ZA state is live. */
++ return aarch64_local_sme_state::ACTIVE_LIVE;
++
++ case aarch64_local_sme_state::SAVED_LOCAL:
++ /* This is a transition to an exception handler. Since we don't
++ support non-call exceptions for SME functions, the source of
++ the transition must be known. We'll assert later if that's
++ not the case. */
++ return aarch64_local_sme_state::ANY;
++
++ case aarch64_local_sme_state::ANY:
++ return aarch64_local_sme_state::ANY;
++ }
++
++ return aarch64_local_sme_state::ANY;
++}
++
++/* Implement TARGET_MODE_BACKPROP. */
++
++static int
++aarch64_mode_backprop (int entity, int mode1, int mode2)
++{
++ switch (aarch64_mode_entity (entity))
++ {
++ case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
++ return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1),
++ aarch64_tristate_mode (mode2)));
++
++ case aarch64_mode_entity::LOCAL_SME_STATE:
++ return int (aarch64_local_sme_backprop
++ (aarch64_local_sme_state (mode1),
++ aarch64_local_sme_state (mode2)));
++ }
++ gcc_unreachable ();
++}
++
++/* Implement TARGET_MODE_ENTRY. */
++
++static int
++aarch64_mode_entry (int entity)
++{
++ switch (aarch64_mode_entity (entity))
++ {
++ case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
++ return int (aarch64_tristate_mode::NO);
++
++ case aarch64_mode_entity::LOCAL_SME_STATE:
++ return int (aarch64_cfun_shared_flags ("za") != 0
++ ? aarch64_local_sme_state::ACTIVE_LIVE
++ : aarch64_local_sme_state::INACTIVE_CALLER);
++ }
++ gcc_unreachable ();
++}
++
++/* Implement TARGET_MODE_EXIT. */
++
++static int
++aarch64_mode_exit (int entity)
++{
++ switch (aarch64_mode_entity (entity))
++ {
++ case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
++ return int (aarch64_tristate_mode::MAYBE);
++
++ case aarch64_mode_entity::LOCAL_SME_STATE:
++ return int (aarch64_cfun_shared_flags ("za") != 0
++ ? aarch64_local_sme_state::ACTIVE_LIVE
++ : aarch64_local_sme_state::INACTIVE_CALLER);
++ }
++ gcc_unreachable ();
++}
++
++/* Implement TARGET_MODE_EH_HANDLER. */
++
++static int
++aarch64_mode_eh_handler (int entity)
++{
++ switch (aarch64_mode_entity (entity))
++ {
++ case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
++ /* Require a lazy save buffer to be allocated before the first
++ insn that can throw. */
++ return int (aarch64_tristate_mode::YES);
++
++ case aarch64_mode_entity::LOCAL_SME_STATE:
++ return int (aarch64_local_sme_state::SAVED_LOCAL);
++ }
++ gcc_unreachable ();
++}
++
++/* Implement TARGET_MODE_PRIORITY. */
++
++static int
++aarch64_mode_priority (int, int n)
++{
++ return n;
++}
++
++/* Implement TARGET_MD_ASM_ADJUST. */
++
++static rtx_insn *
++aarch64_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
++ vec<machine_mode> &input_modes,
++ vec<const char *> &constraints,
++ vec<rtx> &uses, vec<rtx> &clobbers,
++ HARD_REG_SET &clobbered_regs, location_t loc)
++{
++ rtx_insn *seq = arm_md_asm_adjust (outputs, inputs, input_modes, constraints,
++ uses, clobbers, clobbered_regs, loc);
++
++ /* "za" in the clobber list of a function with ZA state is defined to
++ mean that the asm can read from and write to ZA. We can model the
++ read using a USE, but unfortunately, it's not possible to model the
++ write directly. Use a separate insn to model the effect.
++
++ We must ensure that ZA is active on entry, which is enforced by using
++ SME_STATE_REGNUM. The asm must ensure that ZA is active on return. */
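++ /* For example (illustrative only), in a function with ZA state,
++
++ asm volatile ("..." ::: "za");
++
++ has its "za" clobber replaced by USEs of ZA_REGNUM and
++ SME_STATE_REGNUM, plus a separate aarch64_asm_update_za insn
++ that represents the write to ZA. */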
++ if (TARGET_ZA)
++ for (unsigned int i = clobbers.length (); i-- > 0; )
++ {
++ rtx x = clobbers[i];
++ if (REG_P (x) && REGNO (x) == ZA_REGNUM)
++ {
++ auto id = cfun->machine->next_asm_update_za_id++;
++
++ start_sequence ();
++ if (seq)
++ emit_insn (seq);
++ emit_insn (gen_aarch64_asm_update_za (gen_int_mode (id, SImode)));
++ seq = get_insns ();
++ end_sequence ();
++
++ uses.safe_push (gen_rtx_REG (VNx16QImode, ZA_REGNUM));
++ uses.safe_push (gen_rtx_REG (DImode, SME_STATE_REGNUM));
++
++ clobbers.ordered_remove (i);
++ CLEAR_HARD_REG_BIT (clobbered_regs, ZA_REGNUM);
++ }
++ }
++ return seq;
++}
++
+ /* If CALL involves a change in PSTATE.SM, emit the instructions needed
+ to switch to the new mode and the instructions needed to restore the
+ original mode. Return true if something changed. */
+@@ -29108,6 +30415,9 @@ aarch64_get_v16qi_mode ()
+ #undef TARGET_START_CALL_ARGS
+ #define TARGET_START_CALL_ARGS aarch64_start_call_args
+
++#undef TARGET_END_CALL_ARGS
++#define TARGET_END_CALL_ARGS aarch64_end_call_args
++
+ #undef TARGET_GIMPLE_FOLD_BUILTIN
+ #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
+
+@@ -29473,6 +30783,9 @@ aarch64_libgcc_floating_mode_supported_p
+ #undef TARGET_COMP_TYPE_ATTRIBUTES
+ #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
+
++#undef TARGET_MERGE_DECL_ATTRIBUTES
++#define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes
++
+ #undef TARGET_GET_MULTILIB_ABI_NAME
+ #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
+
+@@ -29493,8 +30806,35 @@ aarch64_libgcc_floating_mode_supported_p
+ #undef TARGET_STRICT_ARGUMENT_NAMING
+ #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
+
++#undef TARGET_MODE_EMIT
++#define TARGET_MODE_EMIT aarch64_mode_emit
++
++#undef TARGET_MODE_NEEDED
++#define TARGET_MODE_NEEDED aarch64_mode_needed
++
++#undef TARGET_MODE_AFTER
++#define TARGET_MODE_AFTER aarch64_mode_after
++
++#undef TARGET_MODE_CONFLUENCE
++#define TARGET_MODE_CONFLUENCE aarch64_mode_confluence
++
++#undef TARGET_MODE_BACKPROP
++#define TARGET_MODE_BACKPROP aarch64_mode_backprop
++
++#undef TARGET_MODE_ENTRY
++#define TARGET_MODE_ENTRY aarch64_mode_entry
++
++#undef TARGET_MODE_EXIT
++#define TARGET_MODE_EXIT aarch64_mode_exit
++
++#undef TARGET_MODE_EH_HANDLER
++#define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler
++
++#undef TARGET_MODE_PRIORITY
++#define TARGET_MODE_PRIORITY aarch64_mode_priority
++
+ #undef TARGET_MD_ASM_ADJUST
+-#define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
++#define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust
+
+ #undef TARGET_ASM_FILE_END
+ #define TARGET_ASM_FILE_END aarch64_asm_file_end
+@@ -29505,6 +30845,9 @@ aarch64_libgcc_floating_mode_supported_p
+ #undef TARGET_HAVE_SHADOW_CALL_STACK
+ #define TARGET_HAVE_SHADOW_CALL_STACK true
+
++#undef TARGET_EXTRA_LIVE_ON_ENTRY
++#define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
++
+ #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
+ #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
+
+diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
+index 6bfe55968..89d30b9bf 100644
+--- a/gcc/config/aarch64/aarch64.h
++++ b/gcc/config/aarch64/aarch64.h
+@@ -207,6 +207,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
+ /* Macros to test ISA flags. */
+
+ #define AARCH64_ISA_SM_OFF (aarch64_isa_flags & AARCH64_FL_SM_OFF)
++#define AARCH64_ISA_ZA_ON (aarch64_isa_flags & AARCH64_FL_ZA_ON)
+ #define AARCH64_ISA_MODE (aarch64_isa_flags & AARCH64_FL_ISA_MODES)
+ #define AARCH64_ISA_CRC (aarch64_isa_flags & AARCH64_FL_CRC)
+ #define AARCH64_ISA_CRYPTO (aarch64_isa_flags & AARCH64_FL_CRYPTO)
+@@ -259,6 +260,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
+ #define TARGET_STREAMING_COMPATIBLE \
+ ((aarch64_isa_flags & AARCH64_FL_SM_STATE) == 0)
+
++/* PSTATE.ZA is enabled in the current function body. */
++#define TARGET_ZA (AARCH64_ISA_ZA_ON)
++
+ /* Crypto is an optional extension to AdvSIMD. */
+ #define TARGET_CRYPTO (AARCH64_ISA_CRYPTO)
+
+@@ -445,7 +449,8 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
+ 1, 1, 1, 1, /* SFP, AP, CC, VG */ \
+ 0, 0, 0, 0, 0, 0, 0, 0, /* P0 - P7 */ \
+ 0, 0, 0, 0, 0, 0, 0, 0, /* P8 - P15 */ \
+- 1, 1 /* FFR and FFRT */ \
++ 1, 1, /* FFR and FFRT */ \
++ 1, 1, 1, 1, 1, 1, 1 /* Fake registers */ \
+ }
+
+ /* X30 is marked as caller-saved which is in line with regular function call
+@@ -455,7 +460,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
+ true but not until function epilogues have been generated. This ensures
+ that X30 is available for use in leaf functions if needed. */
+
+-#define CALL_USED_REGISTERS \
++#define CALL_REALLY_USED_REGISTERS \
+ { \
+ 1, 1, 1, 1, 1, 1, 1, 1, /* R0 - R7 */ \
+ 1, 1, 1, 1, 1, 1, 1, 1, /* R8 - R15 */ \
+@@ -468,7 +473,8 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
+ 1, 1, 1, 0, /* SFP, AP, CC, VG */ \
+ 1, 1, 1, 1, 1, 1, 1, 1, /* P0 - P7 */ \
+ 1, 1, 1, 1, 1, 1, 1, 1, /* P8 - P15 */ \
+- 1, 1 /* FFR and FFRT */ \
++ 1, 1, /* FFR and FFRT */ \
++ 0, 0, 0, 0, 0, 0, 0 /* Fake registers */ \
+ }
+
+ #define REGISTER_NAMES \
+@@ -484,7 +490,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
+ "sfp", "ap", "cc", "vg", \
+ "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", \
+ "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", \
+- "ffr", "ffrt" \
++ "ffr", "ffrt", \
++ "lowering", "tpidr2_block", "sme_state", "tpidr2_setup", \
++ "za_free", "za_saved", "za" \
+ }
+
+ /* Generate the register aliases for core register N */
+@@ -533,7 +541,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
+ #define FRAME_POINTER_REGNUM SFP_REGNUM
+ #define STACK_POINTER_REGNUM SP_REGNUM
+ #define ARG_POINTER_REGNUM AP_REGNUM
+-#define FIRST_PSEUDO_REGISTER (FFRT_REGNUM + 1)
++#define FIRST_PSEUDO_REGISTER (LAST_FAKE_REGNUM + 1)
+
+ /* The number of argument registers available for each class. */
+ #define NUM_ARG_REGS 8
+@@ -657,6 +665,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
+
+ #define FP_SIMD_SAVED_REGNUM_P(REGNO) \
+ (((unsigned) (REGNO - V8_REGNUM)) <= (V23_REGNUM - V8_REGNUM))
++
++#define FAKE_REGNUM_P(REGNO) \
++ IN_RANGE (REGNO, FIRST_FAKE_REGNUM, LAST_FAKE_REGNUM)
+
+ /* Register and constant classes. */
+
+@@ -677,6 +688,7 @@ enum reg_class
+ PR_REGS,
+ FFR_REGS,
+ PR_AND_FFR_REGS,
++ FAKE_REGS,
+ ALL_REGS,
+ LIM_REG_CLASSES /* Last */
+ };
+@@ -700,6 +712,7 @@ enum reg_class
+ "PR_REGS", \
+ "FFR_REGS", \
+ "PR_AND_FFR_REGS", \
++ "FAKE_REGS", \
+ "ALL_REGS" \
+ }
+
+@@ -720,6 +733,7 @@ enum reg_class
+ { 0x00000000, 0x00000000, 0x000ffff0 }, /* PR_REGS */ \
+ { 0x00000000, 0x00000000, 0x00300000 }, /* FFR_REGS */ \
+ { 0x00000000, 0x00000000, 0x003ffff0 }, /* PR_AND_FFR_REGS */ \
++ { 0x00000000, 0x00000000, 0x1fc00000 }, /* FAKE_REGS */ \
+ { 0xffffffff, 0xffffffff, 0x000fffff } /* ALL_REGS */ \
+ }
+
+@@ -920,6 +934,15 @@ typedef struct GTY (()) machine_function
+ bool reg_is_wrapped_separately[LAST_SAVED_REGNUM];
+ /* One entry for each general purpose register. */
+ rtx call_via[SP_REGNUM];
++
++ /* A pseudo register that points to the function's TPIDR2 block, or null
++ if the function doesn't have a TPIDR2 block. */
++ rtx tpidr2_block;
++
++ /* A pseudo register that points to the function's ZA save buffer,
++ or null if none. */
++ rtx za_save_buffer;
++
+ bool label_is_assembled;
+
+ /* True if we've expanded at least one call to a function that changes
+@@ -927,6 +950,10 @@ typedef struct GTY (()) machine_function
+ guarantees that no such mode switch exists. */
+ bool call_switches_pstate_sm;
+
++ /* Used to generate unique identifiers for each update to ZA by an
++ asm statement. */
++ unsigned int next_asm_update_za_id;
++
+ /* A set of all decls that have been passed to a vld1 intrinsic in the
+ current function. This is used to help guide the vector cost model. */
+ hash_set<tree> *vector_load_decls;
+@@ -996,6 +1023,10 @@ typedef struct
+ bool silent_p; /* True if we should act silently, rather than
+ raise an error for invalid calls. */
+
++ /* AARCH64_STATE_* flags that describe whether the function shares ZA
++ with its callers. */
++ unsigned int shared_za_flags;
++
+ /* A list of registers that need to be saved and restored around a
+ change to PSTATE.SM. An auto_vec would be more convenient, but those
+ can't be copied. */
+@@ -1344,4 +1375,61 @@ extern poly_uint16 aarch64_sve_vg;
+ STACK_BOUNDARY / BITS_PER_UNIT) \
+ : (crtl->outgoing_args_size + STACK_POINTER_OFFSET))
+
++#ifndef USED_FOR_TARGET
++
++/* Enumerates the mode-switching "entities" for AArch64. */
++enum class aarch64_mode_entity : int
++{
++ /* An aarch64_tristate_mode that says whether we have created a local
++ save buffer for the current function's ZA state. The only transition
++ is from NO to YES. */
++ HAVE_ZA_SAVE_BUFFER,
++
++ /* An aarch64_local_sme_state that reflects the state of all data
++ controlled by PSTATE.ZA. */
++ LOCAL_SME_STATE
++};
++
++/* Describes the state of all data controlled by PSTATE.ZA. */
++enum class aarch64_local_sme_state : int
++{
++ /* ZA is in the off or dormant state. If it is dormant, the contents
++ of ZA belong to a caller. */
++ INACTIVE_CALLER,
++
++ /* ZA is in the off state: PSTATE.ZA is 0 and TPIDR2_EL0 is null. */
++ OFF,
++
++ /* ZA is in the off or dormant state. If it is dormant, the contents
++ of ZA belong to the current function. */
++ INACTIVE_LOCAL,
++
++ /* ZA is in the off state and the current function's ZA contents are
++ stored in the lazy save buffer. This is the state on entry to
++ exception handlers. */
++ SAVED_LOCAL,
++
++ /* ZA is in the active state: PSTATE.ZA is 1 and TPIDR2_EL0 is null.
++ The contents of ZA are live. */
++ ACTIVE_LIVE,
++
++ /* ZA is in the active state: PSTATE.ZA is 1 and TPIDR2_EL0 is null.
++ The contents of ZA are dead. */
++ ACTIVE_DEAD,
++
++ /* ZA could be in multiple states. */
++ ANY
++};
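++
++/* Typical flow for an arm::new("za") function: INACTIVE_CALLER on entry
++ (any caller's lazy save is committed), ACTIVE_LIVE or ACTIVE_DEAD while
++ ZA is in use, INACTIVE_LOCAL or OFF around calls to private-ZA
++ functions, and INACTIVE_CALLER or OFF again before returning. */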
++
++enum class aarch64_tristate_mode : int { NO, YES, MAYBE };
++
++#define OPTIMIZE_MODE_SWITCHING(ENTITY) \
++ aarch64_optimize_mode_switching (aarch64_mode_entity (ENTITY))
++
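++/* MAYBE and ANY are deliberately the last enumerators of their enums:
++ converting them to int gives the number of concrete modes for each
++ entity, and the same value serves as the mode-switching pass's
++ "no particular mode" marker. */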
++#define NUM_MODES_FOR_MODE_SWITCHING \
++ { int (aarch64_tristate_mode::MAYBE), \
++ int (aarch64_local_sme_state::ANY) }
++
++#endif
++
+ #endif /* GCC_AARCH64_H */
+diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
+index bb867de74..05a7c6675 100644
+--- a/gcc/config/aarch64/aarch64.md
++++ b/gcc/config/aarch64/aarch64.md
+@@ -111,6 +111,56 @@
+ ;; "FFR token": a fake register used for representing the scheduling
+ ;; restrictions on FFR-related operations.
+ (FFRT_REGNUM 85)
++
++ ;; ----------------------------------------------------------------
++ ;; Fake registers
++ ;; ----------------------------------------------------------------
++ ;; These registers represent abstract things, rather than real
++ ;; architected registers.
++
++ ;; Sometimes we use placeholder instructions to mark where later
++ ;; ABI-related lowering is needed. These placeholders read and
++ ;; write this register. Instructions that depend on the lowering
++ ;; read the register.
++ (LOWERING_REGNUM 86)
++
++ ;; Represents the contents of the current function's TPIDR2 block,
++ ;; in abstract form.
++ (TPIDR2_BLOCK_REGNUM 87)
++
++ ;; Holds the value that the current function wants PSTATE.ZA to be.
++ ;; The actual value can sometimes vary, because it does not track
++ ;; changes to PSTATE.ZA that happen during a lazy save and restore.
++ ;; Those effects are instead tracked by ZA_SAVED_REGNUM.
++ (SME_STATE_REGNUM 88)
++
++ ;; Instructions write to this register if they set TPIDR2_EL0 to a
++ ;; well-defined value. Instructions read from the register if they
++ ;; depend on the result of such writes.
++ ;;
++ ;; The register does not model the architected TPIDR2_EL0, just the
++ ;; current function's management of it.
++ (TPIDR2_SETUP_REGNUM 89)
++
++ ;; Represents the property "has an incoming lazy save been committed?".
++ (ZA_FREE_REGNUM 90)
++
++ ;; Represents the property "are the current function's ZA contents
++ ;; stored in the lazy save buffer, rather than in ZA itself?".
++ (ZA_SAVED_REGNUM 91)
++
++ ;; Represents the contents of the current function's ZA state in
++ ;; abstract form. At various times in the function, these contents
++ ;; might be stored in ZA itself, or in the function's lazy save buffer.
++ ;;
++ ;; The contents persist even when the architected ZA is off. Private-ZA
++ ;; functions have no effect on its contents.
++ (ZA_REGNUM 92)
++ ;; ----------------------------------------------------------------
++ (FIRST_FAKE_REGNUM LOWERING_REGNUM)
++ (LAST_FAKE_REGNUM ZA_REGNUM)
++ ;; ----------------------------------------------------------------
++
+ ;; The pair of scratch registers used for stack probing with -fstack-check.
+ ;; Leave R9 alone as a possible choice for the static chain.
+ ;; Note that the use of these registers is mutually exclusive with the use
+@@ -303,7 +353,12 @@
+ UNSPEC_TAG_SPACE ; Translate address to MTE tag address space.
+ UNSPEC_LD1RO
+ UNSPEC_SALT_ADDR
++ UNSPEC_SAVE_NZCV
++ UNSPEC_RESTORE_NZCV
+ UNSPECV_PATCHABLE_AREA
++ ;; Wraps a constant integer that should be multiplied by the number
++ ;; of quadwords in an SME vector.
++ UNSPEC_SME_VQ
+ ])
+
+ (define_c_enum "unspecv" [
+@@ -379,7 +434,7 @@
+ ;; Q registers and is equivalent to "simd".
+
+ (define_enum "arches" [any rcpc8_4 fp fp_q base_simd nobase_simd
+- simd nosimd sve fp16])
++ simd nosimd sve fp16 sme])
+
+ (define_enum_attr "arch" "arches" (const_string "any"))
+
+@@ -423,7 +478,10 @@
+ (match_test "TARGET_FP_F16INST"))
+
+ (and (eq_attr "arch" "sve")
+- (match_test "TARGET_SVE")))
++ (match_test "TARGET_SVE"))
++
++ (and (eq_attr "arch" "sme")
++ (match_test "TARGET_SME")))
+ (const_string "yes")
+ (const_string "no")))
+
+@@ -928,7 +986,7 @@
+ (set_attr "sls_length" "retbr")]
+ )
+
+-(define_insn "*cb<optab><mode>1"
++(define_insn "aarch64_cb<optab><mode>1"
+ [(set (pc) (if_then_else (EQL (match_operand:GPI 0 "register_operand" "r")
+ (const_int 0))
+ (label_ref (match_operand 1 "" ""))
+@@ -1291,6 +1349,7 @@
+ /* The "mov_imm" type for CNT is just a placeholder. */
+ [r , Usv; mov_imm , sve , 4] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
+ [r , Usr; mov_imm , sve, 4] << aarch64_output_sve_rdvl (operands[1]);
++ [r , UsR; mov_imm , sme, 4] << aarch64_output_rdsvl (operands[1]);
+ [r , m ; load_4 , * , 4] ldr\t%w0, %1
+ [w , m ; load_4 , fp , 4] ldr\t%s0, %1
+ [m , r Z; store_4 , * , 4] str\t%w1, %0
+@@ -1326,6 +1385,7 @@
+ /* The "mov_imm" type for CNT is just a placeholder. */
+ [r, Usv; mov_imm , sve , 4] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
+ [r, Usr; mov_imm , sve, 4] << aarch64_output_sve_rdvl (operands[1]);
++ [r, UsR; mov_imm , sme, 4] << aarch64_output_rdsvl (operands[1]);
+ [r, m ; load_8 , * , 4] ldr\t%x0, %1
+ [w, m ; load_8 , fp , 4] ldr\t%d0, %1
+ [m, r Z; store_8 , * , 4] str\t%x1, %0
+@@ -7733,6 +7793,21 @@
+ [(set (attr "length") (symbol_ref "INTVAL (operands[0])"))]
+ )
+
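++;; Save and restore the NZCV flags. aarch64_mode_emit uses these
++;; patterns to preserve live flags across the ZA mode-switching code
++;; that it inserts.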
++(define_insn "aarch64_save_nzcv"
++ [(set (match_operand:DI 0 "register_operand" "=r")
++ (unspec:DI [(reg:CC CC_REGNUM)] UNSPEC_SAVE_NZCV))]
++ ""
++ "mrs\t%0, nzcv"
++)
++
++(define_insn "aarch64_restore_nzcv"
++ [(set (reg:CC CC_REGNUM)
++ (unspec:CC [(match_operand:DI 0 "register_operand" "r")]
++ UNSPEC_RESTORE_NZCV))]
++ ""
++ "msr\tnzcv, %0"
++)
++
+ ;; AdvSIMD Stuff
+ (include "aarch64-simd.md")
+
+diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
+index 212a73416..88fb9a07c 100644
+--- a/gcc/config/aarch64/constraints.md
++++ b/gcc/config/aarch64/constraints.md
+@@ -220,6 +220,12 @@
+ (and (match_code "const_poly_int")
+ (match_test "aarch64_sve_rdvl_immediate_p (op)")))
+
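++;; RDSVL-style constants are represented as a CONST wrapping
++;; UNSPEC_SME_VQ (see aarch64.md); aarch64_rdsvl_immediate_p checks
++;; that the wrapped value is one that a single RDSVL can produce.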
++(define_constraint "UsR"
++ "@internal
++ A constraint that matches a value produced by RDSVL."
++ (and (match_code "const")
++ (match_test "aarch64_rdsvl_immediate_p (op)")))
++
+ (define_constraint "Usv"
+ "@internal
+ A constraint that matches a VG-based constant that can be loaded by
+diff --git a/gcc/testsuite/g++.target/aarch64/sme/exceptions_1.C b/gcc/testsuite/g++.target/aarch64/sme/exceptions_1.C
+new file mode 100644
+index 000000000..a245546d8
+--- /dev/null
++++ b/gcc/testsuite/g++.target/aarch64/sme/exceptions_1.C
+@@ -0,0 +1,189 @@
++// { dg-options "-O -fno-optimize-sibling-calls" }
++// { dg-final { check-function-bodies "**" "" } }
++
++void callee_inout() __arm_inout("za");
++void callee_in() noexcept __arm_in("za");
++void callee_out() noexcept __arm_out("za");
++void callee_normal();
++
++/*
++** _Z5test1v:
++** ...
++** bl __arm_tpidr2_save
++** ...
++** bl __cxa_begin_catch
++** bl __cxa_end_catch
++** mov w0, #?2
++** ...
++*/
++__arm_new("za") int
++test1 ()
++{
++ try
++ {
++ callee_inout();
++ return 1;
++ }
++ catch (...)
++ {
++ return 2;
++ }
++}
++
++/*
++** _Z5test2v:
++** ...
++** bl __arm_tpidr2_save
++** ...
++** bl __cxa_begin_catch
++** smstart za
++** bl _Z10callee_outv
++** bl _Z9callee_inv
++** smstop za
++** bl __cxa_end_catch
++** mov w0, #?2
++** ...
++*/
++__arm_new("za") int
++test2 ()
++{
++ try
++ {
++ callee_inout();
++ return 1;
++ }
++ catch (...)
++ {
++ callee_out();
++ callee_in();
++ return 2;
++ }
++}
++
++/*
++** _Z5test3v:
++** ...
++** bl __arm_tpidr2_save
++** ...
++** smstop za
++** ...
++** bl _Z13callee_normalv
++** ...
++** bl __cxa_begin_catch
++** smstart za
++** bl _Z10callee_outv
++** bl _Z9callee_inv
++** smstop za
++** bl __cxa_end_catch
++** mov w0, #?2
++** ...
++*/
++__arm_new("za") int
++test3 ()
++{
++ try
++ {
++ callee_normal();
++ return 1;
++ }
++ catch (...)
++ {
++ callee_out();
++ callee_in();
++ return 2;
++ }
++}
++
++__arm_new("za") int
++test4 ()
++{
++ try
++ {
++ // No lazy save set up because this is a shared-ZA function.
++ callee_inout();
++ return 1;
++ }
++ catch (...)
++ {
++ callee_inout();
++ return 2;
++ }
++}
++// { dg-final { scan-assembler {_Z5test4v:(?:(?!msr\ttpidr2_el0, x[0-9]+).)*\tret} } }
++
++/*
++** _Z5test5v:
++** ...
++** bl __arm_tpidr2_save
++** ...
++** smstart za
++** ...
++** bl _Z12callee_inoutv
++** add (x[0-9]+), [^\n]+
++** msr tpidr2_el0, \1
++** bl _Z13callee_normalv
++** msr tpidr2_el0, xzr
++** smstop za
++** ...
++** bl __cxa_begin_catch
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** smstart za
++** ...
++** bl __arm_tpidr2_restore
++** msr tpidr2_el0, xzr
++** bl _Z12callee_inoutv
++** smstop za
++** bl __cxa_end_catch
++** mov w0, #?2
++** ...
++*/
++__arm_new("za") int
++test5 ()
++{
++ try
++ {
++ callee_inout();
++ callee_normal();
++ return 1;
++ }
++ catch (...)
++ {
++ callee_inout();
++ return 2;
++ }
++}
++
++/*
++** _Z5test6v:
++** ...
++** msr tpidr2_el0, x[0-9]+
++** bl _Z13callee_normalv
++** msr tpidr2_el0, xzr
++** ...
++** bl __cxa_begin_catch
++** bl __cxa_end_catch
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** smstart za
++** ...
++** bl __arm_tpidr2_restore
++** msr tpidr2_el0, xzr
++** ...
++*/
++int
++test6 () __arm_inout("za")
++{
++ try
++ {
++ callee_normal();
++ callee_out();
++ return 1;
++ }
++ catch (...)
++ {
++ return 2;
++ }
++}
+diff --git a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
+index 032485adf..8b0755014 100644
+--- a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
++++ b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
+@@ -2,3 +2,8 @@
+
+ void f1 () __arm_streaming;
+ void f2 () __arm_streaming_compatible;
++void f3 () __arm_in("za");
++void f4 () __arm_out("za");
++void f5 () __arm_inout("za");
++void f6 () __arm_preserves("za");
++__arm_new("za") void f7 () {}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
+index 8f1b83676..fcabe3edc 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
++++ b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
+@@ -2,3 +2,8 @@
+
+ void f1 () __arm_streaming;
+ void f2 () __arm_streaming_compatible;
++void f3 () __arm_in("za");
++void f4 () __arm_out("za");
++void f5 () __arm_inout("za");
++void f6 () __arm_preserves("za");
++__arm_new("za") void f7 () {}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/za_state_1.c b/gcc/testsuite/gcc.target/aarch64/sme/za_state_1.c
+new file mode 100644
+index 000000000..856880e21
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/za_state_1.c
+@@ -0,0 +1,154 @@
++// { dg-options "" }
++
++void shared_a () [[arm::inout("za")]];
++void shared_a (); // { dg-error "conflicting types" }
++
++void shared_b ();
++void shared_b () [[arm::inout("za")]]; // { dg-error "conflicting types" }
++
++void shared_c () [[arm::inout("za")]];
++void shared_c () {} // Inherits attribute from declaration (confusingly).
++
++void shared_d ();
++void shared_d () [[arm::inout("za")]] {} // { dg-error "conflicting types" }
++
++void shared_e () [[arm::inout("za")]] {}
++void shared_e (); // { dg-error "conflicting types" }
++
++void shared_f () {}
++void shared_f () [[arm::inout("za")]]; // { dg-error "conflicting types" }
++
++extern void (*shared_g) ();
++extern void (*shared_g) () [[arm::inout("za")]]; // { dg-error "conflicting types" }
++
++extern void (*shared_h) () [[arm::inout("za")]];
++extern void (*shared_h) (); // { dg-error "conflicting types" }
++
++//----------------------------------------------------------------------------
++
++void preserved_a () [[arm::preserves("za")]];
++void preserved_a (); // { dg-error "conflicting types" }
++
++void preserved_b ();
++void preserved_b () [[arm::preserves("za")]]; // { dg-error "conflicting types" }
++
++void preserved_c () [[arm::preserves("za")]];
++void preserved_c () {} // Inherits attribute from declaration (confusingly).
++
++void preserved_d ();
++void preserved_d () [[arm::preserves("za")]] {} // { dg-error "conflicting types" }
++
++void preserved_e () [[arm::preserves("za")]] {}
++void preserved_e (); // { dg-error "conflicting types" }
++
++void preserved_f () {}
++void preserved_f () [[arm::preserves("za")]]; // { dg-error "conflicting types" }
++
++extern void (*preserved_g) ();
++extern void (*preserved_g) () [[arm::preserves("za")]]; // { dg-error "conflicting types" }
++
++extern void (*preserved_h) () [[arm::preserves("za")]];
++extern void (*preserved_h) (); // { dg-error "conflicting types" }
++
++//----------------------------------------------------------------------------
++
++void replicated_1 () [[arm::in("za", "za"), arm::in("za")]];
++void replicated_2 () [[arm::out("za", "za"), arm::out("za")]];
++void replicated_3 () [[arm::inout("za", "za"), arm::inout("za")]];
++void replicated_4 () [[arm::preserves("za", "za"), arm::preserves("za")]];
++
++//----------------------------------------------------------------------------
++
++void invalid_1 () [[arm::in]]; // { dg-error "wrong number of arguments" }
++void invalid_2 () [[arm::in()]]; // { dg-error "parentheses must be omitted" }
++ // { dg-error "wrong number of arguments" "" { target *-*-* } .-1 }
++void invalid_3 () [[arm::in("")]]; // { dg-error "unrecognized state string ''" }
++void invalid_4 () [[arm::in("foo")]]; // { dg-error "unrecognized state string 'foo'" }
++void invalid_5 () [[arm::in(42)]]; // { dg-error "the arguments to 'in' must be constant strings" }
++void invalid_6 () [[arm::in(*(int *)0 ? "za" : "za")]]; // { dg-error "the arguments to 'in' must be constant strings" }
++
++//----------------------------------------------------------------------------
++
++void mixed_a () [[arm::preserves("za")]];
++void mixed_a () [[arm::inout("za")]]; // { dg-error "conflicting types" }
++
++void mixed_b () [[arm::inout("za")]];
++void mixed_b () [[arm::preserves("za")]]; // { dg-error "conflicting types" }
++
++void mixed_c () [[arm::preserves("za")]];
++void mixed_c () [[arm::in("za")]] {} // { dg-error "conflicting types" }
++
++void mixed_d () [[arm::inout("za")]];
++void mixed_d () [[arm::in("za")]] {} // { dg-error "conflicting types" }
++
++void mixed_e () [[arm::out("za")]] {}
++void mixed_e () [[arm::in("za")]]; // { dg-error "conflicting types" }
++
++void mixed_f () [[arm::inout("za")]] {}
++void mixed_f () [[arm::out("za")]]; // { dg-error "conflicting types" }
++
++extern void (*mixed_g) () [[arm::in("za")]];
++extern void (*mixed_g) () [[arm::preserves("za")]]; // { dg-error "conflicting types" }
++
++extern void (*mixed_h) () [[arm::preserves("za")]];
++extern void (*mixed_h) () [[arm::out("za")]]; // { dg-error "conflicting types" }
++
++//----------------------------------------------------------------------------
++
++void contradiction_1 () [[arm::preserves("za"), arm::inout("za")]]; // { dg-error "inconsistent attributes for state 'za'" }
++void contradiction_2 () [[arm::inout("za"), arm::preserves("za")]]; // { dg-error "inconsistent attributes for state 'za'" }
++
++int [[arm::inout("za")]] int_attr; // { dg-warning "only applies to function types" }
++void *[[arm::preserves("za")]] ptr_attr; // { dg-warning "only applies to function types" }
++
++typedef void preserved_callback () [[arm::preserves("za")]];
++typedef void shared_callback () [[arm::inout("za")]];
++
++void (*preserved_callback_ptr) () [[arm::preserves("za")]];
++void (*shared_callback_ptr) () [[arm::inout("za")]];
++
++typedef void contradiction_callback_1 () [[arm::preserves("za"), arm::inout("za")]]; // { dg-error "inconsistent attributes for state 'za'" }
++typedef void contradiction_callback_2 () [[arm::inout("za"), arm::preserves("za")]]; // { dg-error "inconsistent attributes for state 'za'" }
++
++void (*contradiction_callback_ptr_1) () [[arm::preserves("za"), arm::inout("za")]]; // { dg-error "inconsistent attributes for state 'za'" }
++void (*contradiction_callback_ptr_2) () [[arm::inout("za"), arm::preserves("za")]]; // { dg-error "inconsistent attributes for state 'za'" }
++
++struct s {
++ void (*contradiction_callback_ptr_1) () [[arm::preserves("za"), arm::inout("za")]]; // { dg-error "inconsistent attributes for state 'za'" }
++ void (*contradiction_callback_ptr_2) () [[arm::inout("za"), arm::preserves("za")]]; // { dg-error "inconsistent attributes for state 'za'" }
++};
++
++//----------------------------------------------------------------------------
++
++void keyword_ok_1 () __arm_inout("za");
++void keyword_ok_1 () __arm_inout("za");
++
++void keyword_ok_2 () __arm_in("za");
++void keyword_ok_2 () [[arm::in("za")]];
++
++void keyword_ok_3 () [[arm::out("za")]];
++void keyword_ok_3 () __arm_out("za");
++
++void keyword_ok_4 () __arm_inout("za") [[arm::inout("za")]];
++
++void keyword_ok_5 () __arm_preserves("za");
++void keyword_ok_5 () [[arm::preserves("za")]];
++
++__arm_new("za") void keyword_ok_6 () {}
++
++//----------------------------------------------------------------------------
++
++void keyword_conflict_1 () __arm_inout("za");
++void keyword_conflict_1 (); // { dg-error "conflicting types" }
++
++void keyword_conflict_2 ();
++void keyword_conflict_2 () __arm_inout("za"); // { dg-error "conflicting types" }
++
++void keyword_conflict_3 () __arm_inout("za");
++void keyword_conflict_3 () [[arm::preserves("za")]]; // { dg-error "conflicting types" }
++
++void keyword_conflict_4 () [[arm::preserves("za")]];
++void keyword_conflict_4 () __arm_inout("za"); // { dg-error "conflicting types" }
++
++__arm_new("za") void keyword_conflict_5 () __arm_inout("za") {} // { dg-error "cannot create a new 'za' scope since 'za' is shared with callers" }
++__arm_new("za") void keyword_conflict_6 () __arm_preserves("za") {} // { dg-error "cannot create a new 'za' scope since 'za' is shared with callers" }
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/za_state_2.c b/gcc/testsuite/gcc.target/aarch64/sme/za_state_2.c
+new file mode 100644
+index 000000000..572ff309f
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/za_state_2.c
+@@ -0,0 +1,73 @@
++// { dg-options "" }
++
++[[arm::new("za")]] void new_za_a ();
++void new_za_a ();
++
++void new_za_b ();
++[[arm::new("za")]] void new_za_b ();
++
++[[arm::new("za")]] void new_za_c ();
++void new_za_c () {}
++
++void new_za_d ();
++[[arm::new("za")]] void new_za_d () {}
++
++[[arm::new("za")]] void new_za_e () {}
++void new_za_e ();
++
++void new_za_f () {}
++[[arm::new("za")]] void new_za_f (); // { dg-error "cannot apply attribute 'new' to 'new_za_f' after the function has been defined" }
++
++//----------------------------------------------------------------------------
++
++[[arm::new("za")]] void shared_a ();
++void shared_a () [[arm::inout("za")]]; // { dg-error "conflicting types" }
++
++void shared_b () [[arm::inout("za")]];
++[[arm::new("za")]] void shared_b (); // { dg-error "conflicting types" }
++
++[[arm::new("za")]] void shared_c ();
++void shared_c () [[arm::in("za")]] {} // { dg-error "conflicting types" }
++
++void shared_d () [[arm::in("za")]];
++[[arm::new("za")]] void shared_d () {} // { dg-error "cannot create a new 'za' scope since 'za' is shared with callers" }
++
++[[arm::new("za")]] void shared_e () {}
++void shared_e () [[arm::out("za")]]; // { dg-error "conflicting types" }
++
++void shared_f () [[arm::out("za")]] {}
++[[arm::new("za")]] void shared_f (); // { dg-error "conflicting types" }
++
++[[arm::new("za")]] void shared_g () {}
++void shared_g () [[arm::preserves("za")]]; // { dg-error "conflicting types" }
++
++void shared_h () [[arm::preserves("za")]] {}
++[[arm::new("za")]] void shared_h (); // { dg-error "conflicting types" }
++
++//----------------------------------------------------------------------------
++
++[[arm::new("za")]] void contradiction_1 () [[arm::inout("za")]]; // { dg-error "cannot create a new 'za' scope since 'za' is shared with callers" }
++void contradiction_2 [[arm::new("za")]] () [[arm::inout("za")]]; // { dg-error "cannot create a new 'za' scope since 'za' is shared with callers" }
++[[arm::new("za")]] void contradiction_3 () [[arm::preserves("za")]]; // { dg-error "cannot create a new 'za' scope since 'za' is shared with callers" }
++void contradiction_4 [[arm::new("za")]] () [[arm::preserves("za")]]; // { dg-error "cannot create a new 'za' scope since 'za' is shared with callers" }
++
++int [[arm::new("za")]] int_attr; // { dg-warning "does not apply to types" }
++[[arm::new("za")]] int int_var_attr; // { dg-error "applies only to function definitions" }
++typedef void new_za_callback () [[arm::new("za")]]; // { dg-warning "does not apply to types" }
++[[arm::new("za")]] void (*new_za_var_callback) (); // { dg-error "applies only to function definitions" }
++
++//----------------------------------------------------------------------------
++
++[[arm::new("za")]] void complementary_1 () [[arm::streaming]] {}
++void complementary_2 [[arm::new("za")]] () [[arm::streaming]] {}
++[[arm::new("za")]] void complementary_3 () [[arm::streaming_compatible]] {}
++void complementary_4 [[arm::new("za")]] () [[arm::streaming_compatible]] {}
++
++//----------------------------------------------------------------------------
++
++#pragma GCC target "+nosme"
++
++[[arm::new("za")]] void bereft_1 ();
++[[arm::new("za")]] void bereft_2 () {} // { dg-error "functions with SME state require the ISA extension 'sme'" }
++void bereft_3 () [[arm::inout("za")]];
++void bereft_4 () [[arm::inout("za")]] {} // { dg-error "functions with SME state require the ISA extension 'sme'" }
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/za_state_3.c b/gcc/testsuite/gcc.target/aarch64/sme/za_state_3.c
+new file mode 100644
+index 000000000..203f6ae8a
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/za_state_3.c
+@@ -0,0 +1,31 @@
++// { dg-options "" }
++
++void normal_callee ();
++void in_callee () [[arm::in("za")]];
++void out_callee () [[arm::out("za")]];
++void inout_callee () [[arm::inout("za")]];
++void preserves_callee () [[arm::preserves("za")]];
++
++struct callbacks {
++ void (*normal_ptr) ();
++ void (*in_ptr) () [[arm::in("za")]];
++ void (*out_ptr) () [[arm::out("za")]];
++ void (*inout_ptr) () [[arm::inout("za")]];
++ void (*preserves_ptr) () [[arm::preserves("za")]];
++};
++
++void
++normal_caller (struct callbacks *c)
++{
++ normal_callee ();
++ in_callee (); // { dg-error {call to a function that shares 'za' state from a function that has no 'za' state} }
++ out_callee (); // { dg-error {call to a function that shares 'za' state from a function that has no 'za' state} }
++ inout_callee (); // { dg-error {call to a function that shares 'za' state from a function that has no 'za' state} }
++ preserves_callee (); // { dg-error {call to a function that shares SME state from a function that has no SME state} }
++
++ c->normal_ptr ();
++ c->in_ptr (); // { dg-error {call to a function that shares 'za' state from a function that has no 'za' state} }
++ c->out_ptr (); // { dg-error {call to a function that shares 'za' state from a function that has no 'za' state} }
++ c->inout_ptr (); // { dg-error {call to a function that shares 'za' state from a function that has no 'za' state} }
++ c->preserves_ptr (); // { dg-error {call to a function that shares SME state from a function that has no SME state} }
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/za_state_4.c b/gcc/testsuite/gcc.target/aarch64/sme/za_state_4.c
+new file mode 100644
+index 000000000..cec0abf0e
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/za_state_4.c
+@@ -0,0 +1,585 @@
++// { dg-options "-O -fno-optimize-sibling-calls" }
++// { dg-final { check-function-bodies "**" "" } }
++
++void private_za();
++void out_za() __arm_out("za");
++void in_za() __arm_in("za");
++void inout_za() __arm_inout("za");
++void preserves_za() __arm_preserves("za");
++
++/*
++** test1:
++** ret
++*/
++__arm_new("za") void test1()
++{
++}
++
++/*
++** test2:
++** ldr w0, \[x0\]
++** ret
++*/
++__arm_new("za") int test2(int *ptr)
++{
++ return *ptr;
++}
++
++/*
++** test3:
++** stp [^\n]+
++** mov x29, sp
++** bl private_za
++** (
++** mov w0, 0
++** ldp [^\n]+
++** |
++** ldp [^\n]+
++** mov w0, 0
++** )
++** ret
++*/
++__arm_new("za") int test3()
++{
++ private_za();
++ return 0;
++}
++
++/*
++** test4:
++** ...
++** mrs x0, tpidr2_el0
++** cbz x0, [^\n]+
++** bl __arm_tpidr2_save
++** msr tpidr2_el0, xzr
++** zero { za }
++** smstart za
++** bl in_za
++** smstop za
++** ldp [^\n]+
++** ret
++*/
++__arm_new("za") void test4()
++{
++ in_za(); // Uses zeroed contents.
++}
++
++/*
++** test5:
++** ...
++** mrs x0, tpidr2_el0
++** cbz x0, [^\n]+
++** bl __arm_tpidr2_save
++** msr tpidr2_el0, xzr
++** smstop za
++** bl private_za
++** smstart za
++** bl out_za
++** bl in_za
++** smstop za
++** bl private_za
++** ldp [^\n]+
++** ret
++*/
++__arm_new("za") void test5()
++{
++ private_za();
++ out_za();
++ in_za();
++ private_za();
++}
++
++// Despite the long test, there shouldn't be too much scope for variation
++// here. The point is both to test correctness and code quality.
++/*
++** test6:
++** stp [^\n]+
++** mov x29, sp
++** mrs x0, tpidr2_el0
++** cbz x0, [^\n]+
++** bl __arm_tpidr2_save
++** msr tpidr2_el0, xzr
++** smstart za
++** bl out_za
++** rdsvl (x[0-9]+), #1
++** mul (x[0-9]+), \1, \1
++** sub sp, sp, \2
++** mov (x[0-9]+), sp
++** stp \3, \1, \[x29, #?16\]
++** add (x[0-9]+), x29, #?16
++** msr tpidr2_el0, \4
++** bl private_za
++** (
++** add (x[0-9]+), x29, #?16
++** mrs (x[0-9]+), tpidr2_el0
++** cbnz \6, [^\n]+
++** smstart za
++** mov x0, \5
++** |
++** add x0, x29, #?16
++** mrs (x[0-9]+), tpidr2_el0
++** cbnz \6, [^\n]+
++** smstart za
++** )
++** bl __arm_tpidr2_restore
++** msr tpidr2_el0, xzr
++** bl in_za
++** smstop za
++** mov sp, x29
++** ldp [^\n]+
++** ret
++*/
++__arm_new("za") void test6()
++{
++ out_za();
++ private_za();
++ in_za();
++}
++
++// Rely on previous tests for the part leading up to the smstart.
++/*
++** test7:
++** ...
++** smstart za
++** bl out_za
++** bl in_za
++** smstop za
++** bl private_za
++** smstart za
++** bl out_za
++** bl in_za
++** smstop za
++** ldp [^\n]+
++** ret
++*/
++__arm_new("za") void test7()
++{
++ out_za();
++ in_za();
++ private_za();
++ out_za();
++ in_za();
++}
++
++/*
++** test8:
++** ...
++** smstart za
++** bl out_za
++** bl in_za
++** smstop za
++** bl private_za
++** smstart za
++** bl out_za
++** bl in_za
++** smstop za
++** bl private_za
++** ldp [^\n]+
++** ret
++*/
++__arm_new("za") void test8()
++{
++ out_za();
++ in_za();
++ private_za();
++ out_za();
++ in_za();
++ private_za();
++}
++
++/*
++** test9:
++** ...
++** msr tpidr2_el0, x[0-9]+
++** bl private_za
++** bl private_za
++** bl private_za
++** bl private_za
++** add x[0-9]+, x29, #?16
++** mrs x[0-9]+, tpidr2_el0
++** ...
++*/
++__arm_new("za") void test9()
++{
++ out_za();
++ private_za();
++ private_za();
++ private_za();
++ private_za();
++ in_za();
++}
++
++/*
++** test10:
++** ldr (w[0-9]+), \[x0\]
++** cbz \1, [^\n]+
++** ldr [^\n]+
++** add [^\n]+
++** str [^\n]+
++** ret
++** ...
++*/
++__arm_new("za") void test10(volatile int *ptr)
++{
++ if (__builtin_expect (*ptr != 0, 1))
++ *ptr = *ptr + 1;
++ else
++ inout_za();
++}
++
++/*
++** test11:
++** ...
++** ldr w[0-9]+, [^\n]+
++** add (w[0-9]+), [^\n]+
++** str \1, [^\n]+
++** ...
++** ret
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** smstart za
++** bl inout_za
++** ldr (w[0-9]+), [^\n]+
++** cbnz \2, [^\n]+
++** smstop za
++** ...
++*/
++__arm_new("za") void test11(volatile int *ptr)
++{
++ if (__builtin_expect (*ptr == 0, 0))
++ do
++ inout_za();
++ while (*ptr);
++ else
++ *ptr += 1;
++}
++
++__arm_new("za") void test12(volatile int *ptr)
++{
++ do
++ {
++ inout_za();
++ private_za();
++ }
++ while (*ptr);
++ out_za();
++ in_za();
++}
++
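++// Each private_za call in the loop needs a lazy save of ZA.  After the
++// loop there is no restore: since out_za overwrites ZA anyway, the
++// pending save is simply cancelled (msr tpidr2_el0, xzr), with smstart
++// covering the case in which the save was committed.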
++/*
++** test13:
++** stp [^\n]+
++** ...
++** stp [^\n]+
++** ...
++** bl __arm_tpidr2_save
++** ...
++** msr tpidr2_el0, x[0-9]+
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** bl inout_za
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** cbnz [^\n]+
++** smstart za
++** msr tpidr2_el0, xzr
++** bl out_za
++** bl in_za
++** ...
++** smstop za
++** ...
++*/
++__arm_new("za") void test13(volatile int *ptr)
++{
++ do
++ {
++ private_za();
++ inout_za();
++ private_za();
++ }
++ while (*ptr);
++ out_za();
++ in_za();
++}
++
++/*
++** test14:
++** ...
++** bl __arm_tpidr2_save
++** ...
++** smstart za
++** bl inout_za
++** ldr [^\n]+
++** cbnz [^\n]+
++** bl out_za
++** bl in_za
++** smstop za
++** ...
++*/
++__arm_new("za") void test14(volatile int *ptr)
++{
++ do
++ inout_za();
++ while (*ptr);
++ out_za();
++ in_za();
++}
++
++/*
++** test15:
++** ...
++** bl __arm_tpidr2_save
++** ...
++** smstart za
++** bl out_za
++** bl in_za
++** ldr [^\n]+
++** cbnz [^\n]+
++** smstop za
++** bl private_za
++** ldr [^\n]+
++** ldp [^\n]+
++** ret
++*/
++__arm_new("za") void test15(volatile int *ptr)
++{
++ do
++ {
++ out_za();
++ in_za();
++ }
++ while (*ptr);
++ private_za();
++}
++
++/*
++** test16:
++** ...
++** bl __arm_tpidr2_save
++** ...
++** smstart za
++** b [^\n]+
++-- loop:
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** msr tpidr2_el0, xzr
++-- loop_entry:
++** bl inout_za
++** ...
++** msr tpidr2_el0, x[0-9]+
++** bl private_za
++** ldr [^\n]+
++** cbnz [^\n]+
++** msr tpidr2_el0, xzr
++** smstop za
++** bl private_za
++** ...
++*/
++__arm_new("za") void test16(volatile int *ptr)
++{
++ do
++ {
++ inout_za();
++ private_za();
++ }
++ while (*ptr);
++ private_za();
++}
++
++/*
++** test17:
++** ...
++** bl private_za
++** ldr [^\n]+
++** cbnz [^\n]+
++** ...
++** msr tpidr2_el0, xzr
++** ...
++** smstop za
++** ...
++*/
++__arm_new("za") void test17(volatile int *ptr)
++{
++ do
++ {
++ inout_za();
++ private_za();
++ }
++ while (*ptr);
++}
++
++/*
++** test18:
++** ldr w[0-9]+, [^\n]+
++** cbnz w[0-9]+, [^\n]+
++** ret
++** ...
++** smstop za
++** bl private_za
++** ...
++*/
++__arm_new("za") void test18(volatile int *ptr)
++{
++ if (__builtin_expect (*ptr, 0))
++ {
++ out_za();
++ in_za();
++ private_za();
++ }
++}
++
++/*
++** test19:
++** ...
++** ldr w[0-9]+, [^\n]+
++** cbz w[0-9]+, [^\n]+
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** smstop za
++** bl private_za
++** ...
++*/
++__arm_new("za") void test19(volatile int *ptr)
++{
++ if (__builtin_expect (*ptr != 0, 1))
++ private_za();
++ else
++ do
++ {
++ inout_za();
++ private_za();
++ }
++ while (*ptr);
++}
++
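++// __arm_tpidr2_restore takes the address of the TPIDR2 block in x0, so
++// the return value of b20 must be moved to a temporary across the
++// restore and back into the argument register for c20.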
++/*
++** test20:
++** ...
++** bl a20
++** (?:(?!x0).)*
++** bl b20
++** ...
++** mov ([wx][0-9]+), [wx]0
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** mov [wx]0, \1
++** ...
++** bl c20
++** ...
++*/
++__arm_new("za") void test20()
++{
++ extern int a20() __arm_inout("za");
++ extern int b20(int);
++ extern void c20(int) __arm_inout("za");
++ c20(b20(a20()));
++}
++
++/*
++** test21:
++** ...
++** bl a21
++** (?:(?!x0).)*
++** bl b21
++** ...
++** mov (x[0-9]+), x0
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** mov x0, \1
++** ...
++** bl c21
++** ...
++*/
++__arm_new("za") void test21()
++{
++ extern __UINT64_TYPE__ a21() __arm_inout("za");
++ extern __UINT64_TYPE__ b21(__UINT64_TYPE__);
++ extern void c21(__UINT64_TYPE__) __arm_inout("za");
++ c21(b21(a21()));
++}
++
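++// The size of the lazy save buffer should be computed once: exactly one
++// rdsvl, even though private_za is reachable via two different paths.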
++/*
++** test22:
++** (?:(?!rdsvl).)*
++** rdsvl x[0-9]+, #1
++** (?:(?!rdsvl).)*
++*/
++__arm_new("za") void test22(volatile int *ptr)
++{
++ inout_za();
++ if (*ptr)
++ *ptr += 1;
++ else
++ private_za();
++ private_za();
++ in_za();
++}
++
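++// Likewise, the entry-time commit of the caller's lazy save should not
++// be duplicated across branches: exactly one call to __arm_tpidr2_save.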
++/*
++** test23:
++** (?:(?!__arm_tpidr2_save).)*
++** bl __arm_tpidr2_save
++** (?:(?!__arm_tpidr2_save).)*
++*/
++__arm_new("za") void test23(volatile int *ptr)
++{
++ if (*ptr)
++ *ptr += 1;
++ else
++ inout_za();
++ inout_za();
++}
++
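++// The "za" clobbers make the asm statements part of the ZA dataflow, so
++// ZA has to be lazily saved and restored around each private_za call
++// instead of simply being turned off.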
++/*
++** test24:
++** ...
++** bl in_za
++** ...
++** incb x1
++** ...
++** bl out_za
++** bl inout_za
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** incb x1
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** incb x1
++** ...
++** smstop za
++** ...
++** bl private_za
++** ...
++** ret
++*/
++__arm_new("za") void test24()
++{
++ in_za();
++ asm ("incb\tx1" ::: "x1", "za");
++ out_za();
++ inout_za();
++ private_za();
++ asm ("incb\tx1" ::: "x1", "za");
++ private_za();
++ asm ("incb\tx1" ::: "x1", "za");
++ in_za();
++ private_za();
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/za_state_5.c b/gcc/testsuite/gcc.target/aarch64/sme/za_state_5.c
+new file mode 100644
+index 000000000..d54840d3d
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/za_state_5.c
+@@ -0,0 +1,595 @@
++// { dg-options "-O2 -fno-optimize-sibling-calls" }
++// { dg-final { check-function-bodies "**" "" } }
++
++void private_za();
++void out_za() __arm_out("za");
++void in_za() __arm_in("za");
++void inout_za() __arm_inout("za");
++void preserves_za() __arm_preserves("za");
++
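++// These tests mirror za_state_4.c, but the functions under test share ZA
++// with their callers via __arm_inout("za").  ZA is live on entry and on
++// return, so there is no entry-time __arm_tpidr2_save commit and no
++// final smstop za; lazy saves are needed only around private-ZA calls.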
++/*
++** test1:
++** ret
++*/
++void test1() __arm_inout("za")
++{
++}
++
++/*
++** test2:
++** ldr w0, \[x0\]
++** ret
++*/
++int test2(int *ptr) __arm_inout("za")
++{
++ return *ptr;
++}
++
++/*
++** test3:
++** ...
++** sub sp, sp, x[0-9]+
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** smstart za
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** msr tpidr2_el0, xzr
++** ...
++*/
++int test3() __arm_inout("za")
++{
++ private_za();
++ return 0;
++}
++
++/*
++** test4:
++** stp [^\n]+
++** [^\n]+
++** bl in_za
++** ldp [^\n]+
++** ret
++*/
++void test4() __arm_inout("za")
++{
++ in_za();
++}
++
++/*
++** test5:
++** ...
++** smstop za
++** ...
++** bl private_za
++** smstart za
++** bl out_za
++** bl in_za
++** ...
++** sub sp, sp, x[0-9]+
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** smstart za
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** msr tpidr2_el0, xzr
++** ...
++*/
++void test5() __arm_inout("za")
++{
++ private_za();
++ out_za();
++ in_za();
++ private_za();
++}
++
++/*
++** test6:
++** ...
++** bl out_za
++** ...
++** sub sp, sp, x[0-9]+
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** smstart za
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** msr tpidr2_el0, xzr
++** ...
++** bl in_za
++** ...
++*/
++void test6() __arm_inout("za")
++{
++ out_za();
++ private_za();
++ in_za();
++}
++
++/*
++** test7:
++** stp [^\n]+
++** [^\n]+
++** bl out_za
++** bl in_za
++** smstop za
++** bl private_za
++** smstart za
++** bl out_za
++** bl in_za
++** ldp [^\n]+
++** ret
++*/
++void test7() __arm_inout("za")
++{
++ out_za();
++ in_za();
++ private_za();
++ out_za();
++ in_za();
++}
++
++/*
++** test8:
++** stp [^\n]+
++** [^\n]+
++** bl out_za
++** bl in_za
++** smstop za
++** bl private_za
++** smstart za
++** bl out_za
++** bl in_za
++** ...
++** sub sp, sp, x[0-9]+
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** smstart za
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** msr tpidr2_el0, xzr
++** ...
++** ret
++*/
++void test8() __arm_inout("za")
++{
++ out_za();
++ in_za();
++ private_za();
++ out_za();
++ in_za();
++ private_za();
++}
++
++/*
++** test9:
++** stp [^\n]+
++** [^\n]+
++** bl out_za
++** ...
++** msr tpidr2_el0, x[0-9]+
++** bl private_za
++** bl private_za
++** bl private_za
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** smstart za
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** msr tpidr2_el0, xzr
++** ...
++*/
++void test9() __arm_inout("za")
++{
++ out_za();
++ private_za();
++ private_za();
++ private_za();
++ private_za();
++ in_za();
++}
++
++/*
++** test10:
++** ldr (w[0-9]+), \[x0\]
++** cbz \1, [^\n]+
++** ldr [^\n]+
++** add [^\n]+
++** str [^\n]+
++** ret
++** ...
++*/
++void test10(volatile int *ptr) __arm_inout("za")
++{
++ if (__builtin_expect (*ptr != 0, 1))
++ *ptr = *ptr + 1;
++ else
++ inout_za();
++}
++
++/*
++** test11:
++** (?!.*(\t__arm|\tza|tpidr2_el0)).*
++*/
++void test11(volatile int *ptr) __arm_inout("za")
++{
++ if (__builtin_expect (*ptr == 0, 0))
++ do
++ inout_za();
++ while (*ptr);
++ else
++ *ptr += 1;
++}
++
++void test12(volatile int *ptr) __arm_inout("za")
++{
++ do
++ {
++ inout_za();
++ private_za();
++ }
++ while (*ptr);
++ out_za();
++ in_za();
++}
++
++/*
++** test13:
++** stp [^\n]+
++** ...
++** stp [^\n]+
++** ...
++-- loop:
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** smstart za
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** msr tpidr2_el0, xzr
++** bl inout_za
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ldr [^\n]+
++** cbnz [^\n]+
++** smstart za
++** msr tpidr2_el0, xzr
++** bl out_za
++** bl in_za
++** [^\n]+
++** [^\n]+
++** ldp [^\n]+
++** ret
++*/
++void test13(volatile int *ptr) __arm_inout("za")
++{
++ do
++ {
++ private_za();
++ inout_za();
++ private_za();
++ }
++ while (*ptr);
++ out_za();
++ in_za();
++}
++
++/*
++** test14:
++** ...
++** bl inout_za
++** ldr [^\n]+
++** cbnz [^\n]+
++** bl out_za
++** bl in_za
++** ...
++*/
++void test14(volatile int *ptr) __arm_inout("za")
++{
++ do
++ inout_za();
++ while (*ptr);
++ out_za();
++ in_za();
++}
++
++/*
++** test15:
++** ...
++** bl out_za
++** bl in_za
++** ldr [^\n]+
++** cbnz [^\n]+
++** ...
++** stp [^\n]+
++** ...
++** msr tpidr2_el0, [^\n]+
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** msr tpidr2_el0, xzr
++** ...
++*/
++void test15(volatile int *ptr) __arm_inout("za")
++{
++ do
++ {
++ out_za();
++ in_za();
++ }
++ while (*ptr);
++ private_za();
++}
++
++/*
++** test16:
++** stp [^\n]+
++** ...
++** stp [^\n]+
++** ...
++** b [^\n]+
++-- loop:
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** msr tpidr2_el0, xzr
++-- loop_entry:
++** bl inout_za
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** msr tpidr2_el0, xzr
++** ...
++*/
++void test16(volatile int *ptr) __arm_inout("za")
++{
++ do
++ {
++ inout_za();
++ private_za();
++ }
++ while (*ptr);
++ private_za();
++}
++
++/*
++** test17:
++** ...
++-- loop:
++** bl inout_za
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** smstart za
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** msr tpidr2_el0, xzr
++** ...
++** cbnz [^\n]+
++** [^\n]+
++** [^\n]+
++** ldp [^\n]+
++** ret
++*/
++void test17(volatile int *ptr) __arm_inout("za")
++{
++ do
++ {
++ inout_za();
++ private_za();
++ while (*ptr)
++ ptr += 1;
++ }
++ while (*ptr);
++}
++
++/*
++** test18:
++** ldr w[0-9]+, [^\n]+
++** cbnz w[0-9]+, [^\n]+
++** ret
++** ...
++** bl out_za
++** bl in_za
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** msr tpidr2_el0, xzr
++** ...
++*/
++void test18(volatile int *ptr) __arm_inout("za")
++{
++ if (__builtin_expect (*ptr, 0))
++ {
++ out_za();
++ in_za();
++ private_za();
++ }
++}
++
++void test19(volatile int *ptr) __arm_inout("za")
++{
++ if (__builtin_expect (*ptr != 0, 1))
++ private_za();
++ else
++ do
++ {
++ inout_za();
++ private_za();
++ }
++ while (*ptr);
++}
++
++/*
++** test20:
++** ...
++** bl a20
++** (?:(?!x0).)*
++** bl b20
++** ...
++** mov ([wx][0-9]+), [wx]0
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** mov [wx]0, \1
++** ...
++** bl c20
++** ...
++*/
++void test20() __arm_inout("za")
++{
++ extern int a20() __arm_inout("za");
++ extern int b20(int);
++ extern void c20(int) __arm_inout("za");
++ c20(b20(a20()));
++}
++
++/*
++** test21:
++** ...
++** bl a21
++** (?:(?!x0).)*
++** bl b21
++** ...
++** mov (x[0-9]+), x0
++** ...
++** bl __arm_tpidr2_restore
++** ...
++** mov x0, \1
++** ...
++** bl c21
++** ...
++*/
++void test21() __arm_inout("za")
++{
++ extern __UINT64_TYPE__ a21() __arm_inout("za");
++ extern __UINT64_TYPE__ b21(__UINT64_TYPE__);
++ extern void c21(__UINT64_TYPE__) __arm_inout("za");
++ c21(b21(a21()));
++}
++
++/*
++** test22:
++** (?:(?!rdsvl).)*
++** rdsvl x[0-9]+, #1
++** (?:(?!rdsvl).)*
++*/
++void test22(volatile int *ptr) __arm_inout("za")
++{
++ inout_za();
++ if (*ptr)
++ *ptr += 1;
++ else
++ private_za();
++ private_za();
++ in_za();
++}
++
++void test23(volatile int *ptr) __arm_inout("za")
++{
++ if (*ptr)
++ *ptr += 1;
++ else
++ inout_za();
++ inout_za();
++}
++
++/*
++** test24:
++** ...
++** bl in_za
++** ...
++** incb x1
++** ...
++** bl out_za
++** bl inout_za
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** incb x1
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** incb x1
++** ...
++** msr tpidr2_el0, x[0-9]+
++** ...
++** bl private_za
++** ...
++** mrs x[0-9]+, tpidr2_el0
++** ...
++** ret
++*/
++void test24() __arm_inout("za")
++{
++ in_za();
++ asm ("incb\tx1" ::: "x1", "za");
++ out_za();
++ inout_za();
++ private_za();
++ asm ("incb\tx1" ::: "x1", "za");
++ private_za();
++ asm ("incb\tx1" ::: "x1", "za");
++ in_za();
++ private_za();
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/za_state_6.c b/gcc/testsuite/gcc.target/aarch64/sme/za_state_6.c
+new file mode 100644
+index 000000000..d5b226ae1
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/za_state_6.c
+@@ -0,0 +1,23 @@
++// { dg-options "-O -fno-optimize-sibling-calls -fomit-frame-pointer" }
++
++void private_za();
++void out_za() __arm_out("za");
++void in_za() __arm_in("za");
++
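++// ZA is never live across a call to a private-ZA function here, so the
++// entry-time commit and the smstart/smstop pair must appear, but no
++// SVL-sized save buffer should be allocated; the scan-assembler-not
++// below checks that no variable-sized stack adjustment is emitted.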
++__arm_new("za") void test20(volatile int *ptr)
++{
++ if (*ptr)
++ out_za();
++ else
++ *ptr += 1;
++ *ptr += 1;
++ if (*ptr)
++ in_za();
++ else
++ *ptr += 1;
++}
++
++// { dg-final { scan-assembler {\tbl\t__arm_tpidr2_save\n} } }
++// { dg-final { scan-assembler {\tsmstart\tza\n} } }
++// { dg-final { scan-assembler {\tsmstop\tza\n} } }
++// { dg-final { scan-assembler-not {\tsub\tsp, sp, x[0-9]+\n} } }
+--
+2.33.0
+