author | CoprDistGit <infra@openeuler.org> | 2025-02-28 10:03:49 +0000
---|---|---
committer | CoprDistGit <infra@openeuler.org> | 2025-02-28 10:03:49 +0000
commit | 73127104a245052cd5cf29cdaaca3e5c32c70348 (patch) |
tree | 8e28b63e478c43c252f18b49836dff7313affe54 /0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch |
parent | 49d3feaf4665cdb07576fc1a2382a4d82a612d35 (diff) |
automatic import of gcc (openeuler24.03_LTS_SP1)
Diffstat (limited to '0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch')
-rw-r--r-- | 0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch | 1748 |
1 file changed, 1748 insertions, 0 deletions
diff --git a/0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch b/0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch
new file mode 100644
index 0000000..72576e3
--- /dev/null
+++ b/0212-Backport-SME-aarch64-Add-support-for-__arm_locally_s.patch
@@ -0,0 +1,1748 @@
+From 0ad41f11bea5c303ff39c54cae8e46afdfae6070 Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 5 Dec 2023 10:11:29 +0000
+Subject: [PATCH 113/157] [Backport][SME] aarch64: Add support for
+ __arm_locally_streaming
+
+Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=3f6e5991fab507aa79121dc44d1afcd622c78744
+
+This patch adds support for the __arm_locally_streaming attribute,
+which allows a function to use SME internally without changing
+the function's ABI.  The attribute is valid but redundant for
+__arm_streaming functions.
+
+gcc/
+	* config/aarch64/aarch64.cc (aarch64_arm_attribute_table): Add
+	arm::locally_streaming.
+	(aarch64_fndecl_is_locally_streaming): New function.
+	(aarch64_fndecl_sm_state): Handle locally-streaming functions.
+	(aarch64_cfun_enables_pstate_sm): New function.
+	(aarch64_add_offset): Add an argument that specifies whether
+	the streaming vector length should be used instead of the
+	prevailing one.
+	(aarch64_split_add_offset, aarch64_add_sp, aarch64_sub_sp): Likewise.
+	(aarch64_allocate_and_probe_stack_space): Likewise.
+	(aarch64_expand_mov_immediate): Update calls accordingly.
+	(aarch64_need_old_pstate_sm): Return true for locally-streaming
+	streaming-compatible functions.
+	(aarch64_layout_frame): Force all call-preserved Z and P registers
+	to be saved and restored if the function switches PSTATE.SM in the
+	prologue.
+	(aarch64_get_separate_components): Disable shrink-wrapping of
+	such Z and P saves and restores.
+	(aarch64_use_late_prologue_epilogue): New function.
+	(aarch64_expand_prologue): Measure SVE lengths in the streaming
+	vector length for locally-streaming functions, then emit code
+	to enable streaming mode.
+	(aarch64_expand_epilogue): Likewise in reverse.
+	(TARGET_USE_LATE_PROLOGUE_EPILOGUE): Define.
+	* config/aarch64/aarch64-c.cc (aarch64_define_unconditional_macros):
+	Define __arm_locally_streaming.
+
+gcc/testsuite/
+	* gcc.target/aarch64/sme/locally_streaming_1.c: New test.
+	* gcc.target/aarch64/sme/locally_streaming_2.c: Likewise.
+	* gcc.target/aarch64/sme/locally_streaming_3.c: Likewise.
+	* gcc.target/aarch64/sme/locally_streaming_4.c: Likewise.
+	* gcc.target/aarch64/sme/keyword_macros_1.c: Add
+	__arm_locally_streaming.
+	* g++.target/aarch64/sme/keyword_macros_1.C: Likewise.
+---
+ gcc/config/aarch64/aarch64-c.cc               |   1 +
+ gcc/config/aarch64/aarch64.cc                 | 233 +++++++--
+ .../g++.target/aarch64/sme/keyword_macros_1.C |   1 +
+ .../gcc.target/aarch64/sme/keyword_macros_1.c |   1 +
+ .../aarch64/sme/locally_streaming_1.c         | 466 ++++++++++++++++++
+ .../aarch64/sme/locally_streaming_2.c         | 177 +++++++
+ .../aarch64/sme/locally_streaming_3.c         | 273 ++++++++++
+ .../aarch64/sme/locally_streaming_4.c         | 145 ++++++
+ 8 files changed, 1259 insertions(+), 38 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
+
+diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
+index cb8a6c2fc..745719d8b 100644
+--- a/gcc/config/aarch64/aarch64-c.cc
++++ b/gcc/config/aarch64/aarch64-c.cc
+@@ -86,6 +86,7 @@ aarch64_define_unconditional_macros (cpp_reader *pfile)
+
+   DEFINE_ARM_KEYWORD_MACRO ("streaming");
+   DEFINE_ARM_KEYWORD_MACRO ("streaming_compatible");
++  DEFINE_ARM_KEYWORD_MACRO ("locally_streaming");
+
+ #undef DEFINE_ARM_KEYWORD_MACRO
+
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 113784e31..4cb43c2e2 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -3283,6 +3283,7 @@ static const attribute_spec aarch64_arm_attributes[] =
+     NULL, attr_streaming_exclusions },
+   { "streaming_compatible", 0, 0, false, true, true, true,
+     NULL, attr_streaming_exclusions },
++  { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
+   { "new", 1, -1, true, false, false, false,
+     handle_arm_new, NULL },
+   { "preserves", 1, -1, false, true, true, true,
+@@ -4657,6 +4658,16 @@ aarch64_fntype_isa_mode (const_tree fntype)
+	  | aarch64_fntype_pstate_za (fntype));
+ }
+
++/* Return true if FNDECL uses streaming mode internally, as an
++   implementation choice.  */
++
++static bool
++aarch64_fndecl_is_locally_streaming (const_tree fndecl)
++{
++  return lookup_attribute ("arm", "locally_streaming",
++			   DECL_ATTRIBUTES (fndecl));
++}
++
+ /* Return the state of PSTATE.SM when compiling the body of
+    function FNDECL.  This might be different from the state of
+    PSTATE.SM on entry.  */
+@@ -4664,6 +4675,9 @@ aarch64_fntype_isa_mode (const_tree fntype)
+ static aarch64_feature_flags
+ aarch64_fndecl_pstate_sm (const_tree fndecl)
+ {
++  if (aarch64_fndecl_is_locally_streaming (fndecl))
++    return AARCH64_FL_SM_ON;
++
+   return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
+ }
+
+@@ -4739,6 +4753,16 @@ aarch64_cfun_has_new_state (const char *state_name)
+   return aarch64_fndecl_has_new_state (cfun->decl, state_name);
+ }
+
++/* Return true if PSTATE.SM is 1 in the body of the current function,
++   but is not guaranteed to be 1 on entry.  */
++
++static bool
++aarch64_cfun_enables_pstate_sm ()
++{
++  return (aarch64_fndecl_is_locally_streaming (cfun->decl)
++	  && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON);
++}
++
+ /* Return true if the current function has state STATE_NAME, either by
+    creating new state itself or by sharing state with callers.  */
+
+@@ -6931,6 +6955,10 @@ aarch64_add_offset_temporaries (rtx x)
+    TEMP2, if nonnull, is a second temporary register that doesn't
+    overlap either DEST or REG.
+
++   FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
++   is measured relative to the SME vector length instead of the current
++   prevailing vector length.  It is 0 otherwise.
++
+    Since this function may be used to adjust the stack pointer, we must
+    ensure that it cannot cause transient stack deallocation (for example
+    by first incrementing SP and then decrementing when adjusting by a
+@@ -6939,6 +6967,7 @@ aarch64_add_offset_temporaries (rtx x)
+ static void
+ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+		    poly_int64 offset, rtx temp1, rtx temp2,
++		    aarch64_feature_flags force_isa_mode,
+		    bool frame_related_p, bool emit_move_imm = true)
+ {
+   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
+@@ -6951,9 +6980,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+   /* Try using ADDVL or ADDPL to add the whole value.  */
+   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
+     {
+-      rtx offset_rtx = gen_int_mode (offset, mode);
++      gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
++      rtx offset_rtx;
++      if (force_isa_mode == 0)
++	offset_rtx = gen_int_mode (offset, mode);
++      else
++	offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
+       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
+       RTX_FRAME_RELATED_P (insn) = frame_related_p;
++      if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
++	add_reg_note (insn, REG_CFA_ADJUST_CFA,
++		      gen_rtx_SET (dest, plus_constant (Pmode, src,
++							offset)));
+       return;
+     }
+
+@@ -6969,11 +7007,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+   if (src != const0_rtx
+       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
+     {
+-      rtx offset_rtx = gen_int_mode (poly_offset, mode);
++      rtx offset_rtx;
++      if (force_isa_mode == 0)
++	offset_rtx = gen_int_mode (poly_offset, mode);
++      else
++	offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
+       if (frame_related_p)
+	{
+	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
+	  RTX_FRAME_RELATED_P (insn) = true;
++	  if (force_isa_mode & AARCH64_FL_SM_ON)
++	    add_reg_note (insn, REG_CFA_ADJUST_CFA,
++			  gen_rtx_SET (dest, plus_constant (Pmode, src,
++							    poly_offset)));
+	  src = dest;
+	}
+       else
+@@ -7004,9 +7050,19 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+       rtx val;
+       if (IN_RANGE (rel_factor, -32, 31))
+	{
++	  if (force_isa_mode & AARCH64_FL_SM_ON)
++	    {
++	      /* Try to use an unshifted RDSVL, otherwise fall back on
++		 a shifted RDSVL #1.  */
++	      if (aarch64_sve_rdvl_addvl_factor_p (factor))
++		shift = 0;
++	      else
++		factor = rel_factor * 16;
++	      val = aarch64_sme_vq_immediate (mode, factor, 0);
++	    }
+	  /* Try to use an unshifted CNT[BHWD] or RDVL.  */
+-	  if (aarch64_sve_cnt_factor_p (factor)
+-	      || aarch64_sve_rdvl_addvl_factor_p (factor))
++	  else if (aarch64_sve_cnt_factor_p (factor)
++		   || aarch64_sve_rdvl_addvl_factor_p (factor))
+	    {
+	      val = gen_int_mode (poly_int64 (factor, factor), mode);
+	      shift = 0;
+@@ -7036,11 +7092,18 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+	     a shift and add sequence for the multiplication.
+	     If CNTB << SHIFT is out of range, stick with the current
+	     shift factor.  */
+-	  if (IN_RANGE (low_bit, 2, 16 * 16))
++	  if (force_isa_mode == 0
++	      && IN_RANGE (low_bit, 2, 16 * 16))
+	    {
+	      val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
+	      shift = 0;
+	    }
++	  else if ((force_isa_mode & AARCH64_FL_SM_ON)
++		   && aarch64_sve_rdvl_addvl_factor_p (low_bit))
++	    {
++	      val = aarch64_sme_vq_immediate (mode, low_bit, 0);
++	      shift = 0;
++	    }
+	  else
+	    val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
+
+@@ -7128,30 +7191,34 @@ aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+			  rtx offset_rtx, rtx temp1, rtx temp2)
+ {
+   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
+-		      temp1, temp2, false);
++		      temp1, temp2, 0, false);
+ }
+
+ /* Add DELTA to the stack pointer, marking the instructions frame-related.
+-   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
+-   if TEMP1 already contains abs (DELTA).  */
++   TEMP1 is available as a temporary if nonnull.  FORCE_ISA_MODE is as
++   for aarch64_add_offset.  EMIT_MOVE_IMM is false if TEMP1 already
++   contains abs (DELTA).  */
+
+ static inline void
+-aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
++aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
++		aarch64_feature_flags force_isa_mode, bool emit_move_imm)
+ {
+   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
+-		      temp1, temp2, true, emit_move_imm);
++		      temp1, temp2, force_isa_mode, true, emit_move_imm);
+ }
+
+ /* Subtract DELTA from the stack pointer, marking the instructions
+-   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
+-   if nonnull.  */
++   frame-related if FRAME_RELATED_P.  FORCE_ISA_MODE is as for
++   aarch64_add_offset.  TEMP1 is available as a temporary if nonnull.  */
+
+ static inline void
+-aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
+-		bool emit_move_imm = true)
++aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
++		aarch64_feature_flags force_isa_mode,
++		bool frame_related_p, bool emit_move_imm = true)
+ {
+   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
+-		      temp1, temp2, frame_related_p, emit_move_imm);
++		      temp1, temp2, force_isa_mode, frame_related_p,
++		      emit_move_imm);
+ }
+
+ /* A streaming-compatible function needs to switch temporarily to the known
+@@ -8176,11 +8243,11 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+	    {
+	      base = aarch64_force_temporary (int_mode, dest, base);
+	      aarch64_add_offset (int_mode, dest, base, offset,
+-				  NULL_RTX, NULL_RTX, false);
++				  NULL_RTX, NULL_RTX, 0, false);
+	    }
+	  else
+	    aarch64_add_offset (int_mode, dest, base, offset,
+-				dest, NULL_RTX, false);
++				dest, NULL_RTX, 0, false);
+	}
+       return;
+     }
+@@ -8207,7 +8274,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+	  gcc_assert (can_create_pseudo_p ());
+	  base = aarch64_force_temporary (int_mode, dest, base);
+	  aarch64_add_offset (int_mode, dest, base, const_offset,
+-			      NULL_RTX, NULL_RTX, false);
++			      NULL_RTX, NULL_RTX, 0, false);
+	  return;
+	}
+
+@@ -8247,7 +8314,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
+	  gcc_assert(can_create_pseudo_p ());
+	  base = aarch64_force_temporary (int_mode, dest, base);
+	  aarch64_add_offset (int_mode, dest, base, const_offset,
+-			      NULL_RTX, NULL_RTX, false);
++			      NULL_RTX, NULL_RTX, 0, false);
+	  return;
+	}
+       /* FALLTHRU */
+@@ -9755,6 +9822,9 @@ aarch64_need_old_pstate_sm ()
+   if (aarch64_cfun_incoming_pstate_sm () != 0)
+     return false;
+
++  if (aarch64_cfun_enables_pstate_sm ())
++    return true;
++
+   if (cfun->machine->call_switches_pstate_sm)
+     for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
+       if (auto *call = dyn_cast<rtx_call_insn *> (insn))
+@@ -9781,6 +9851,7 @@ aarch64_layout_frame (void)
+   bool frame_related_fp_reg_p = false;
+   aarch64_frame &frame = cfun->machine->frame;
+   poly_int64 top_of_locals = -1;
++  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
+
+   vec_safe_truncate (frame.saved_gprs, 0);
+   vec_safe_truncate (frame.saved_fprs, 0);
+@@ -9818,7 +9889,7 @@ aarch64_layout_frame (void)
+	frame.reg_offset[regno] = SLOT_REQUIRED;
+
+   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
+-    if (df_regs_ever_live_p (regno)
++    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
+	&& !fixed_regs[regno]
+	&& !crtl->abi->clobbers_full_reg_p (regno))
+       {
+@@ -9847,7 +9918,7 @@ aarch64_layout_frame (void)
+       }
+
+   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
+-    if (df_regs_ever_live_p (regno)
++    if ((enables_pstate_sm || df_regs_ever_live_p (regno))
+	&& !fixed_regs[regno]
+	&& !crtl->abi->clobbers_full_reg_p (regno))
+       frame.reg_offset[regno] = SLOT_REQUIRED;
+@@ -9964,7 +10035,8 @@ aarch64_layout_frame (void)
+   /* If the current function changes the SVE vector length, ensure that the
+      old value of the DWARF VG register is saved and available in the CFI,
+      so that outer frames with VL-sized offsets can be processed correctly.  */
+-  if (cfun->machine->call_switches_pstate_sm)
++  if (cfun->machine->call_switches_pstate_sm
++      || aarch64_cfun_enables_pstate_sm ())
+     {
+       frame.reg_offset[VG_REGNUM] = offset;
+       offset += UNITS_PER_WORD;
+@@ -10749,9 +10821,16 @@ aarch64_get_separate_components (void)
+   bitmap_clear (components);
+
+   /* The registers we need saved to the frame.  */
++  bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
+   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
+     if (aarch64_register_saved_on_entry (regno))
+       {
++	/* Disallow shrink wrapping for registers that will be clobbered
++	   by an SMSTART SM in the prologue.  */
++	if (enables_pstate_sm
++	    && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
++	  continue;
++
+	/* Punt on saves and restores that use ST1D and LD1D.  We could
+	   try to be smarter, but it would involve making sure that the
+	   spare predicate register itself is safe to use at the save
+@@ -11070,11 +11149,16 @@ aarch64_emit_stack_tie (rtx reg)
+    events, e.g. if we were to allow the stack to be dropped by more than a page
+    and then have multiple probes up and we take a signal somewhere in between
+    then the signal handler doesn't know the state of the stack and can make no
+-   assumptions about which pages have been probed.  */
++   assumptions about which pages have been probed.
++
++   FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
++   is measured relative to the SME vector length instead of the current
++   prevailing vector length.  It is 0 otherwise.  */
+
+ static void
+ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+					poly_int64 poly_size,
++					aarch64_feature_flags force_isa_mode,
+					bool frame_related_p,
+					bool final_adjustment_p)
+ {
+@@ -11116,7 +11200,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+   if (known_lt (poly_size, min_probe_threshold)
+       || !flag_stack_clash_protection)
+     {
+-      aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
++      aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
++		      frame_related_p);
+       return;
+     }
+
+@@ -11133,7 +11218,8 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+
+       /* First calculate the amount of bytes we're actually spilling.  */
+       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
+-			  poly_size, temp1, temp2, false, true);
++			  poly_size, temp1, temp2, force_isa_mode,
++			  false, true);
+
+       rtx_insn *insn = get_last_insn ();
+
+@@ -11191,7 +11277,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+     {
+       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
+	{
+-	  aarch64_sub_sp (NULL, temp2, guard_size, true);
++	  aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
+	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+					   guard_used_by_caller));
+	  emit_insn (gen_blockage ());
+@@ -11202,7 +11288,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+     {
+       /* Compute the ending address.  */
+       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
+-			  temp1, NULL, false, true);
++			  temp1, NULL, force_isa_mode, false, true);
+       rtx_insn *insn = get_last_insn ();
+
+       /* For the initial allocation, we don't have a frame pointer
+@@ -11268,7 +11354,7 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
+   if (final_adjustment_p && rounded_size != 0)
+     min_probe_threshold = 0;
+
+-  aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
++  aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
+   if (residual >= min_probe_threshold)
+     {
+       if (dump_file)
+@@ -11333,6 +11419,14 @@ aarch64_epilogue_uses (int regno)
+   return 0;
+ }
+
++/* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE.  */
++
++static bool
++aarch64_use_late_prologue_epilogue ()
++{
++  return aarch64_cfun_enables_pstate_sm ();
++}
++
+ /* The current function's frame has a save slot for the incoming state
+    of SVCR.  Return a legitimate memory for the slot, based on the hard
+    frame pointer.  */
+@@ -11469,6 +11563,9 @@ aarch64_expand_prologue (void)
+   unsigned reg2 = frame.wb_push_candidate2;
+   bool emit_frame_chain = frame.emit_frame_chain;
+   rtx_insn *insn;
++  aarch64_feature_flags force_isa_mode = 0;
++  if (aarch64_cfun_enables_pstate_sm ())
++    force_isa_mode = AARCH64_FL_SM_ON;
+
+   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
+     {
+@@ -11530,7 +11627,7 @@ aarch64_expand_prologue (void)
+      less the amount of the guard reserved for use by the caller's
+      outgoing args.  */
+   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
+-					  true, false);
++					  force_isa_mode, true, false);
+
+   if (callee_adjust != 0)
+     aarch64_push_regs (reg1, reg2, callee_adjust);
+@@ -11553,7 +11650,8 @@ aarch64_expand_prologue (void)
+       gcc_assert (known_eq (chain_offset, 0));
+       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
+			  stack_pointer_rtx, chain_offset,
+-			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
++			  tmp1_rtx, tmp0_rtx, force_isa_mode,
++			  frame_pointer_needed);
+       if (frame_pointer_needed && !frame_size.is_constant ())
+	{
+	  /* Variable-sized frames need to describe the save slot
+@@ -11600,6 +11698,7 @@ aarch64_expand_prologue (void)
+	      || known_eq (initial_adjust, 0));
+   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
+					  sve_callee_adjust,
++					  force_isa_mode,
+					  !frame_pointer_needed, false);
+   bytes_below_sp -= sve_callee_adjust;
+     }
+@@ -11612,12 +11711,15 @@ aarch64_expand_prologue (void)
+      that is assumed by the called.  */
+   gcc_assert (known_eq (bytes_below_sp, final_adjust));
+   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
++					  force_isa_mode,
+					  !frame_pointer_needed, true);
+   if (emit_frame_chain && maybe_ne (final_adjust, 0))
+     aarch64_emit_stack_tie (hard_frame_pointer_rtx);
+
+-  /* Save the incoming value of PSTATE.SM, if required.  */
+-  if (known_ge (frame.old_svcr_offset, 0))
++  /* Save the incoming value of PSTATE.SM, if required.  Code further
++     down does this for locally-streaming functions.  */
++  if (known_ge (frame.old_svcr_offset, 0)
++      && !aarch64_cfun_enables_pstate_sm ())
+     {
+       rtx mem = aarch64_old_svcr_mem ();
+       MEM_VOLATILE_P (mem) = 1;
+@@ -11649,6 +11751,34 @@ aarch64_expand_prologue (void)
+	  emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
+	}
+     }
++
++  /* Enable PSTATE.SM, if required.  */
++  if (aarch64_cfun_enables_pstate_sm ())
++    {
++      rtx_insn *guard_label = nullptr;
++      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
++	{
++	  /* The current function is streaming-compatible.  Save the
++	     original state of PSTATE.SM.  */
++	  rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
++	  emit_insn (gen_aarch64_read_svcr (svcr));
++	  emit_move_insn (aarch64_old_svcr_mem (), svcr);
++	  guard_label = aarch64_guard_switch_pstate_sm (svcr,
++							aarch64_isa_flags);
++	}
++      aarch64_sme_mode_switch_regs args_switch;
++      auto &args = crtl->args.info;
++      for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
++	{
++	  rtx x = args.sme_mode_switch_args[i];
++	  args_switch.add_reg (GET_MODE (x), REGNO (x));
++	}
++      args_switch.emit_prologue ();
++      emit_insn (gen_aarch64_smstart_sm ());
++      args_switch.emit_epilogue ();
++      if (guard_label)
++	emit_label (guard_label);
++    }
+ }
+
+ /* Return TRUE if we can use a simple_return insn.
+@@ -11695,6 +11825,9 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+   HOST_WIDE_INT guard_size
+     = 1 << param_stack_clash_protection_guard_size;
+   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
++  aarch64_feature_flags force_isa_mode = 0;
++  if (aarch64_cfun_enables_pstate_sm ())
++    force_isa_mode = AARCH64_FL_SM_ON;
+
+   /* We can re-use the registers when:
+
+@@ -11719,6 +11852,24 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+     = maybe_ne (get_frame_size ()
+		+ frame.saved_varargs_size, 0);
+
++  /* Reset PSTATE.SM, if required.  */
++  if (aarch64_cfun_enables_pstate_sm ())
++    {
++      rtx_insn *guard_label = nullptr;
++      if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
++	guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
++						      aarch64_isa_flags);
++      aarch64_sme_mode_switch_regs return_switch;
++      if (crtl->return_rtx && REG_P (crtl->return_rtx))
++	return_switch.add_reg (GET_MODE (crtl->return_rtx),
++			       REGNO (crtl->return_rtx));
++      return_switch.emit_prologue ();
++      emit_insn (gen_aarch64_smstop_sm ());
++      return_switch.emit_epilogue ();
++      if (guard_label)
++	emit_label (guard_label);
++    }
++
+   /* Emit a barrier to prevent loads from a deallocated stack.  */
+   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
+       || cfun->calls_alloca
+@@ -11739,19 +11890,21 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+     aarch64_add_offset (Pmode, stack_pointer_rtx,
+			hard_frame_pointer_rtx,
+			-bytes_below_hard_fp + final_adjust,
+-			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
++			tmp1_rtx, tmp0_rtx, force_isa_mode,
++			callee_adjust == 0);
+   else
+     /* The case where we need to re-use the register here is very rare, so
+        avoid the complicated condition and just always emit a move if the
+        immediate doesn't fit.  */
+-    aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
++    aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
+
+   /* Restore the vector registers before the predicate registers,
+      so that we can use P4 as a temporary for big-endian SVE frames.  */
+   aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
+   aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
+   if (maybe_ne (sve_callee_adjust, 0))
+-    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
++    aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
++		    force_isa_mode, true);
+
+   /* When shadow call stack is enabled, the scs_pop in the epilogue will
+      restore x30, we don't need to restore x30 again in the traditional
+@@ -11781,7 +11934,7 @@ aarch64_expand_epilogue (rtx_call_insn *sibcall)
+
+   /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
+      add restriction on emit_move optimization to leaf functions.  */
+-  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
++  aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
+		  (!can_inherit_p || !crtl->is_leaf
+		   || df_regs_ever_live_p (EP0_REGNUM)));
+
+@@ -11914,7 +12067,8 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
+
+   if (vcall_offset == 0)
+-    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
++    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
++			0, false);
+   else
+     {
+       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
+@@ -11927,7 +12081,7 @@ aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
+				       plus_constant (Pmode, this_rtx, delta));
+       else
+	aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
+-			    temp1, temp0, false);
++			    temp1, temp0, 0, false);
+     }
+
+   if (Pmode == ptr_mode)
+@@ -30962,6 +31116,9 @@ aarch64_libgcc_floating_mode_supported_p
+ #undef TARGET_EXTRA_LIVE_ON_ENTRY
+ #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
+
++#undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
++#define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
++
+ #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
+ #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
+
+diff --git a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
+index 8b0755014..dc5c097bd 100644
+--- a/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
++++ b/gcc/testsuite/g++.target/aarch64/sme/keyword_macros_1.C
+@@ -7,3 +7,4 @@ void f4 () __arm_out("za");
+ void f5 () __arm_inout("za");
+ void f6 () __arm_preserves("za");
+ __arm_new("za") void f7 () {}
++__arm_locally_streaming void f8 () {}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
+index fcabe3edc..22f5facfd 100644
+--- a/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
++++ b/gcc/testsuite/gcc.target/aarch64/sme/keyword_macros_1.c
+@@ -7,3 +7,4 @@ void f4 () __arm_out("za");
+ void f5 () __arm_inout("za");
+ void f6 () __arm_preserves("za");
+ __arm_new("za") void f7 () {}
++__arm_locally_streaming void f8 () {}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
+new file mode 100644
+index 000000000..20ff4b87d
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_1.c
+@@ -0,0 +1,466 @@
++// { dg-options "-O -fomit-frame-pointer" }
++// { dg-final { check-function-bodies "**" "" } }
++
++void consume_za () [[arm::streaming, arm::inout("za")]];
++
++/*
++** n_ls:
++**	sub	sp, sp, #?80
++**	cntd	x16
++**	str	x16, \[sp\]
++**	stp	d8, d9, \[sp, #?16\]
++**	stp	d10, d11, \[sp, #?32\]
++**	stp	d12, d13, \[sp, #?48\]
++**	stp	d14, d15, \[sp, #?64\]
++**	smstart	sm
++**	smstop	sm
++**	ldp	d8, d9, \[sp, #?16\]
++**	ldp	d10, d11, \[sp, #?32\]
++**	ldp	d12, d13, \[sp, #?48\]
++**	ldp	d14, d15, \[sp, #?64\]
++**	add	sp, sp, #?80
++**	ret
++*/
++[[arm::locally_streaming]] void
++n_ls ()
++{
++  asm ("");
++}
++
++/*
++** s_ls:
++**	ret
++*/
++[[arm::locally_streaming]] void
++s_ls () [[arm::streaming]]
++{
++  asm ("");
++}
++
++/*
++** sc_ls:
++**	stp	x29, x30, \[sp, #?-96\]!
++**	mov	x29, sp
++**	cntd	x16
++**	str	x16, \[sp, #?24\]
++**	stp	d8, d9, \[sp, #?32\]
++**	stp	d10, d11, \[sp, #?48\]
++**	stp	d12, d13, \[sp, #?64\]
++**	stp	d14, d15, \[sp, #?80\]
++**	mrs	x16, svcr
++**	str	x16, \[x29, #?16\]
++**	tbnz	x16, 0, [^\n]+
++**	smstart	sm
++**	ldr	x16, \[x29, #?16\]
++**	tbnz	x16, 0, [^\n]+
++**	smstop	sm
++**	ldp	d8, d9, \[sp, #?32\]
++**	ldp	d10, d11, \[sp, #?48\]
++**	ldp	d12, d13, \[sp, #?64\]
++**	ldp	d14, d15, \[sp, #?80\]
++**	ldp	x29, x30, \[sp\], #?96
++**	ret
++*/
++[[arm::locally_streaming]] void
++sc_ls () [[arm::streaming_compatible]]
++{
++  asm ("");
++}
++
++/*
++** n_ls_new_za:
++**	str	x30, \[sp, #?-80\]!
++**	cntd	x16
++**	str	x16, \[sp, #?8\]
++**	stp	d8, d9, \[sp, #?16\]
++**	stp	d10, d11, \[sp, #?32\]
++**	stp	d12, d13, \[sp, #?48\]
++**	stp	d14, d15, \[sp, #?64\]
++**	smstart	sm
++**	mrs	(x[0-9]+), tpidr2_el0
++**	cbz	\1, [^\n]+
++**	bl	__arm_tpidr2_save
++**	msr	tpidr2_el0, xzr
++**	zero	{ za }
++**	smstart	za
++**	bl	consume_za
++**	smstop	za
++**	smstop	sm
++**	ldp	d8, d9, \[sp, #?16\]
++**	ldp	d10, d11, \[sp, #?32\]
++**	ldp	d12, d13, \[sp, #?48\]
++**	ldp	d14, d15, \[sp, #?64\]
++**	ldr	x30, \[sp\], #?80
++**	ret
++*/
++[[arm::locally_streaming, arm::new("za")]] void
++n_ls_new_za ()
++{
++  consume_za ();
++  asm ("");
++}
++
++/*
++** s_ls_new_za:
++**	str	x30, \[sp, #?-16\]!
++**	mrs	(x[0-9]+), tpidr2_el0
++**	cbz	\1, [^\n]+
++**	bl	__arm_tpidr2_save
++**	msr	tpidr2_el0, xzr
++**	zero	{ za }
++**	smstart	za
++**	bl	consume_za
++**	smstop	za
++**	ldr	x30, \[sp\], #?16
++**	ret
++*/
++[[arm::locally_streaming, arm::new("za")]] void
++s_ls_new_za () [[arm::streaming]]
++{
++  consume_za ();
++  asm ("");
++}
++
++/*
++** sc_ls_new_za:
++**	stp	x29, x30, \[sp, #?-96\]!
++**	mov	x29, sp
++**	cntd	x16
++**	str	x16, \[sp, #?24\]
++**	stp	d8, d9, \[sp, #?32\]
++**	stp	d10, d11, \[sp, #?48\]
++**	stp	d12, d13, \[sp, #?64\]
++**	stp	d14, d15, \[sp, #?80\]
++**	mrs	x16, svcr
++**	str	x16, \[x29, #?16\]
++**	tbnz	x16, 0, [^\n]+
++**	smstart	sm
++**	mrs	(x[0-9]+), tpidr2_el0
++**	cbz	\1, [^\n]+
++**	bl	__arm_tpidr2_save
++**	msr	tpidr2_el0, xzr
++**	zero	{ za }
++**	smstart	za
++**	bl	consume_za
++**	smstop	za
++**	ldr	x16, \[x29, #?16\]
++**	tbnz	x16, 0, [^\n]+
++**	smstop	sm
++**	ldp	d8, d9, \[sp, #?32\]
++**	ldp	d10, d11, \[sp, #?48\]
++**	ldp	d12, d13, \[sp, #?64\]
++**	ldp	d14, d15, \[sp, #?80\]
++**	ldp	x29, x30, \[sp\], #?96
++**	ret
++*/
++[[arm::locally_streaming, arm::new("za")]] void
++sc_ls_new_za () [[arm::streaming_compatible]]
++{
++  consume_za ();
++  asm ("");
++}
++
++/*
++** n_ls_shared_za:
++**	str	x30, \[sp, #?-80\]!
++**	cntd	x16
++**	str	x16, \[sp, #?8\]
++**	stp	d8, d9, \[sp, #?16\]
++**	stp	d10, d11, \[sp, #?32\]
++**	stp	d12, d13, \[sp, #?48\]
++**	stp	d14, d15, \[sp, #?64\]
++**	smstart	sm
++**	bl	consume_za
++**	smstop	sm
++**	ldp	d8, d9, \[sp, #?16\]
++**	ldp	d10, d11, \[sp, #?32\]
++**	ldp	d12, d13, \[sp, #?48\]
++**	ldp	d14, d15, \[sp, #?64\]
++**	ldr	x30, \[sp\], #?80
++**	ret
++*/
++[[arm::locally_streaming]] void
++n_ls_shared_za () [[arm::inout("za")]]
++{
++  consume_za ();
++  asm ("");
++}
++
++/*
++** s_ls_shared_za:
++**	str	x30, \[sp, #?-16\]!
++**	bl	consume_za
++**	ldr	x30, \[sp\], #?16
++**	ret
++*/
++[[arm::locally_streaming]] void
++s_ls_shared_za () [[arm::streaming, arm::inout("za")]]
++{
++  consume_za ();
++  asm ("");
++}
++
++/*
++** sc_ls_shared_za:
++**	stp	x29, x30, \[sp, #?-96\]!
++**	mov	x29, sp
++**	cntd	x16
++**	str	x16, \[sp, #?24\]
++**	stp	d8, d9, \[sp, #?32\]
++**	stp	d10, d11, \[sp, #?48\]
++**	stp	d12, d13, \[sp, #?64\]
++**	stp	d14, d15, \[sp, #?80\]
++**	mrs	x16, svcr
++**	str	x16, \[x29, #?16\]
++**	tbnz	x16, 0, [^\n]+
++**	smstart	sm
++**	bl	consume_za
++**	ldr	x16, \[x29, #?16\]
++**	tbnz	x16, 0, [^\n]+
++**	smstop	sm
++**	ldp	d8, d9, \[sp, #?32\]
++**	ldp	d10, d11, \[sp, #?48\]
++**	ldp	d12, d13, \[sp, #?64\]
++**	ldp	d14, d15, \[sp, #?80\]
++**	ldp	x29, x30, \[sp\], #?96
++**	ret
++*/
++[[arm::locally_streaming]] void
++sc_ls_shared_za () [[arm::streaming_compatible, arm::inout("za")]]
++{
++  consume_za ();
++  asm ("");
++}
++
++/*
++** n_ls_vector_pcs:
++**	sub	sp, sp, #?272
++**	cntd	x16
++**	str	x16, \[sp\]
++**	stp	q8, q9, \[sp, #?16\]
++**	stp	q10, q11, \[sp, #?48\]
++**	stp	q12, q13, \[sp, #?80\]
++**	stp	q14, q15, \[sp, #?112\]
++**	stp	q16, q17, \[sp, #?144\]
++**	stp	q18, q19, \[sp, #?176\]
++**	stp	q20, q21, \[sp, #?208\]
++**	stp	q22, q23, \[sp, #?240\]
++**	smstart	sm
++**	smstop	sm
++**	ldp	q8, q9, \[sp, #?16\]
++**	ldp	q10, q11, \[sp, #?48\]
++**	ldp	q12, q13, \[sp, #?80\]
++**	ldp	q14, q15, \[sp, #?112\]
++**	ldp	q16, q17, \[sp, #?144\]
++**	ldp	q18, q19, \[sp, #?176\]
++**	ldp	q20, q21, \[sp, #?208\]
++**	ldp	q22, q23, \[sp, #?240\]
++**	add	sp, sp, #?272
++**	ret
++*/
++[[arm::locally_streaming]] void __attribute__((aarch64_vector_pcs))
++n_ls_vector_pcs ()
++{
++  asm ("");
++}
++
++/*
++** n_ls_sve_pcs:
++**	sub	sp, sp, #?16
++**	cntd	x16
++**	str	x16, \[sp\]
++**	addsvl	sp, sp, #-18
++**	str	p4, \[sp\]
++**	str	p5, \[sp, #1, mul vl\]
++**	str	p6, \[sp, #2, mul vl\]
++**	str	p7, \[sp, #3, mul vl\]
++**	str	p8, \[sp, #4, mul vl\]
++**	str	p9, \[sp, #5, mul vl\]
++**	str	p10, \[sp, #6, mul vl\]
++**	str	p11, \[sp, #7, mul vl\]
++**	str	p12, \[sp, #8, mul vl\]
++**	str	p13, \[sp, #9, mul vl\]
++**	str	p14, \[sp, #10, mul vl\]
++**	str	p15, \[sp, #11, mul vl\]
++**	str	z8, \[sp, #2, mul vl\]
++**	str	z9, \[sp, #3, mul vl\]
++**	str	z10, \[sp, #4, mul vl\]
++**	str	z11, \[sp, #5, mul vl\]
++**	str	z12, \[sp, #6, mul vl\]
++**	str	z13, \[sp, #7, mul vl\]
++**	str	z14, \[sp, #8, mul vl\]
++**	str	z15, \[sp, #9, mul vl\]
++**	str	z16, \[sp, #10, mul vl\]
++**	str	z17, \[sp, #11, mul vl\]
++**	str	z18, \[sp, #12, mul vl\]
++**	str	z19, \[sp, #13, mul vl\]
++**	str	z20, \[sp, #14, mul vl\]
++**	str	z21, \[sp, #15, mul vl\]
++**	str	z22, \[sp, #16, mul vl\]
++**	str	z23, \[sp, #17, mul vl\]
++**	addvl	sp, sp, #-1
++**	str	p0, \[sp\]
++**	smstart	sm
++**	ldr	p0, \[sp\]
++**	addvl	sp, sp, #1
++**	smstop	sm
++**	ldr	z8, \[sp, #2, mul vl\]
++**	ldr	z9, \[sp, #3, mul vl\]
++**	ldr	z10, \[sp, #4, mul vl\]
++**	ldr	z11, \[sp, #5, mul vl\]
++**	ldr	z12, \[sp, #6, mul vl\]
++**	ldr	z13, \[sp, #7, mul vl\]
++**	ldr	z14, \[sp, #8, mul vl\]
++**	ldr	z15, \[sp, #9, mul vl\]
++**	ldr	z16, \[sp, #10, mul vl\]
++**	ldr	z17, \[sp, #11, mul vl\]
++**	ldr	z18, \[sp, #12, mul vl\]
++**	ldr	z19, \[sp, #13, mul vl\]
++**	ldr	z20, \[sp, #14, mul vl\]
++**	ldr	z21, \[sp, #15, mul vl\]
++**	ldr	z22, \[sp, #16, mul vl\]
++**	ldr	z23, \[sp, #17, mul vl\]
++**	ldr	p4, \[sp\]
++**	ldr	p5, \[sp, #1, mul vl\]
++**	ldr	p6, \[sp, #2, mul vl\]
++**	ldr	p7, \[sp, #3, mul vl\]
++**	ldr	p8, \[sp, #4, mul vl\]
++**	ldr	p9, \[sp, #5, mul vl\]
++**	ldr	p10, \[sp, #6, mul vl\]
++**	ldr	p11, \[sp, #7, mul vl\]
++**	ldr	p12, \[sp, #8, mul vl\]
++**	ldr	p13, \[sp, #9, mul vl\]
++**	ldr	p14, \[sp, #10, mul vl\]
++**	ldr	p15, \[sp, #11, mul vl\]
++**	addsvl	sp, sp, #18
++**	add	sp, sp, #?16
++**	ret
++*/
++[[arm::locally_streaming]] void
++n_ls_sve_pcs (__SVBool_t x)
++{
++  asm ("");
++}
++
++/*
++** n_ls_v0:
++**	addsvl	sp, sp, #-1
++**	...
++**	smstart	sm
++**	add	x[0-9]+, [^\n]+
++**	smstop	sm
++**	...
++**	addsvl	sp, sp, #1
++**	...
++*/
++#define TEST(VN) __SVInt32_t VN; asm ("" :: "r" (&VN));
++[[arm::locally_streaming]] void
++n_ls_v0 ()
++{
++  TEST (v0);
++}
++
++/*
++** n_ls_v32:
++**	addsvl	sp, sp, #-32
++**	...
++**	smstart	sm
++**	...
++**	smstop	sm
++**	...
++**	rdsvl	(x[0-9]+), #1
++**	lsl	(x[0-9]+), \1, #?5
++**	add	sp, sp, \2
++**	...
++*/
++[[arm::locally_streaming]] void
++n_ls_v32 ()
++{
++  TEST (v0);
++  TEST (v1);
++  TEST (v2);
++  TEST (v3);
++  TEST (v4);
++  TEST (v5);
++  TEST (v6);
++  TEST (v7);
++  TEST (v8);
++  TEST (v9);
++  TEST (v10);
++  TEST (v11);
++  TEST (v12);
++  TEST (v13);
++  TEST (v14);
++  TEST (v15);
++  TEST (v16);
++  TEST (v17);
++  TEST (v18);
++  TEST (v19);
++  TEST (v20);
++  TEST (v21);
++  TEST (v22);
++  TEST (v23);
++  TEST (v24);
++  TEST (v25);
++  TEST (v26);
++  TEST (v27);
++  TEST (v28);
++  TEST (v29);
++  TEST (v30);
++  TEST (v31);
++}
++
++/*
++** n_ls_v33:
++**	rdsvl	(x[0-9]+), #1
++**	mov	(x[0-9]+), #?33
++**	mul	(x[0-9]+), (?:\1, \2|\2, \1)
++**	sub	sp, sp, \3
++**	...
++**	smstart	sm
++**	...
++**	smstop	sm
++**	...
++**	rdsvl	(x[0-9]+), #1
++**	mov	(x[0-9]+), #?33
++**	mul	(x[0-9]+), (?:\4, \5|\5, \4)
++**	add	sp, sp, \6
++**	...
++*/
++[[arm::locally_streaming]] void
++n_ls_v33 ()
++{
++  TEST (v0);
++  TEST (v1);
++  TEST (v2);
++  TEST (v3);
++  TEST (v4);
++  TEST (v5);
++  TEST (v6);
++  TEST (v7);
++  TEST (v8);
++  TEST (v9);
++  TEST (v10);
++  TEST (v11);
++  TEST (v12);
++  TEST (v13);
++  TEST (v14);
++  TEST (v15);
++  TEST (v16);
++  TEST (v17);
++  TEST (v18);
++  TEST (v19);
++  TEST (v20);
++  TEST (v21);
++  TEST (v22);
++  TEST (v23);
++  TEST (v24);
++  TEST (v25);
++  TEST (v26);
++  TEST (v27);
++  TEST (v28);
++  TEST (v29);
++  TEST (v30);
++  TEST (v31);
++  TEST (v32);
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
+new file mode 100644
+index 000000000..0eba99385
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_2.c
+@@ -0,0 +1,177 @@
++// { dg-options "-O -fomit-frame-pointer" }
++// { dg-final { check-function-bodies "**" "" } }
++
++#include <arm_neon.h>
++#include <arm_sve.h>
++
++/*
++** test_d0:
++**	...
++**	smstart	sm
++**	...
++**	fmov	x10, d0
++**	smstop	sm
++**	fmov	d0, x10
++**	...
++*/
++[[arm::locally_streaming]] double
++test_d0 ()
++{
++  asm ("");
++  return 1.0f;
++}
++
++/*
++** test_d0_vec:
++**	...
++**	smstart	sm
++**	...
++** (
++**	fmov	x10, d0
++** |
++**	umov	x10, v0.d\[0\]
++** )
++**	smstop	sm
++**	fmov	d0, x10
++**	...
++*/
++[[arm::locally_streaming]] int8x8_t
++test_d0_vec ()
++{
++  asm ("");
++  return (int8x8_t) {};
++}
++
++/*
++** test_q0:
++**	...
++**	smstart	sm
++**	...
++**	str	q0, \[sp, #?-16\]!
++**	smstop	sm
++**	ldr	q0, \[sp\], #?16
++**	...
++*/
++[[arm::locally_streaming]] int8x16_t
++test_q0 ()
++{
++  asm ("");
++  return (int8x16_t) {};
++}
++
++/*
++** test_q1:
++**	...
++**	smstart	sm
++**	...
++**	stp	q0, q1, \[sp, #?-32\]!
++**	smstop	sm
++**	ldp	q0, q1, \[sp\], #?32
++**	...
++*/
++[[arm::locally_streaming]] int8x16x2_t
++test_q1 ()
++{
++  asm ("");
++  return (int8x16x2_t) {};
++}
++
++/*
++** test_q2:
++**	...
++**	smstart	sm
++**	...
++**	stp	q0, q1, \[sp, #?-48\]!
++**	str	q2, \[sp, #?32\]
++**	smstop	sm
++**	ldr	q2, \[sp, #?32\]
++**	ldp	q0, q1, \[sp\], #?48
++**	...
++*/
++[[arm::locally_streaming]] int8x16x3_t
++test_q2 ()
++{
++  asm ("");
++  return (int8x16x3_t) {};
++}
++
++/*
++** test_q3:
++**	...
++**	smstart	sm
++**	...
++**	stp	q0, q1, \[sp, #?-64\]!
++**	stp	q2, q3, \[sp, #?32\]
++**	smstop	sm
++**	ldp	q2, q3, \[sp, #?32\]
++**	ldp	q0, q1, \[sp\], #?64
++**	...
++*/
++[[arm::locally_streaming]] int8x16x4_t
++test_q3 ()
++{
++  asm ("");
++  return (int8x16x4_t) {};
++}
++
++/*
++** test_z0:
++**	...
++**	smstart	sm
++**	mov	z0\.b, #0
++**	addvl	sp, sp, #-1
++**	str	z0, \[sp\]
++**	smstop	sm
++**	ldr	z0, \[sp\]
++**	addvl	sp, sp, #1
++**	...
++*/
++[[arm::locally_streaming]] svint8_t
++test_z0 ()
++{
++  asm ("");
++  return (svint8_t) {};
++}
++
++/*
++** test_z3:
++**	...
++**	smstart	sm
++**	...
++**	addvl	sp, sp, #-4
++**	str	z0, \[sp\]
++**	str	z1, \[sp, #1, mul vl\]
++**	str	z2, \[sp, #2, mul vl\]
++**	str	z3, \[sp, #3, mul vl\]
++**	smstop	sm
++**	ldr	z0, \[sp\]
++**	ldr	z1, \[sp, #1, mul vl\]
++**	ldr	z2, \[sp, #2, mul vl\]
++**	ldr	z3, \[sp, #3, mul vl\]
++**	...
++*/
++[[arm::locally_streaming]] svint8x4_t
++test_z3 ()
++{
++  asm ("");
++  return (svint8x4_t) {};
++}
++
++/*
++** test_p0:
++**	...
++**	smstart	sm
++**	pfalse	p0\.b
++**	addvl	sp, sp, #-1
++**	str	p0, \[sp\]
++**	smstop	sm
++**	ldr	p0, \[sp\]
++**	addvl	sp, sp, #1
++**	...
++*/
++[[arm::locally_streaming]] svbool_t
++test_p0 ()
++{
++  asm ("");
++  return (svbool_t) {};
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
+new file mode 100644
+index 000000000..2bdea6ac6
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_3.c
+@@ -0,0 +1,273 @@
++// { dg-options "-O -fomit-frame-pointer" }
++// { dg-final { check-function-bodies "**" "" } }
++
++#include <arm_neon.h>
++#include <arm_sve.h>
++
++/*
++** test_d0:
++**	...
++**	fmov	x10, d0
++**	smstart	sm
++**	fmov	d0, x10
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_d0 (double d0)
++{
++  asm ("");
++}
++
++/*
++** test_d7:
++**	...
++**	fmov	x10, d0
++**	fmov	x11, d1
++**	fmov	x12, d2
++**	fmov	x13, d3
++**	fmov	x14, d4
++**	fmov	x15, d5
++**	fmov	x16, d6
++**	fmov	x17, d7
++**	smstart	sm
++**	fmov	d0, x10
++**	fmov	d1, x11
++**	fmov	d2, x12
++**	fmov	d3, x13
++**	fmov	d4, x14
++**	fmov	d5, x15
++**	fmov	d6, x16
++**	fmov	d7, x17
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_d7 (double d0, double d1, double d2, double d3,
++	 double d4, double d5, double d6, double d7)
++{
++  asm ("");
++}
++
++/*
++** test_d0_vec:
++**	...
++** (
++**	fmov	x10, d0
++** |
++**	umov	x10, v0.d\[0\]
++** )
++**	smstart	sm
++**	fmov	d0, x10
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_d0_vec (int8x8_t d0)
++{
++  asm ("");
++}
++
++/*
++** test_d7_vec:
++**	...
++** (
++**	fmov	x10, d0
++**	fmov	x11, d1
++**	fmov	x12, d2
++**	fmov	x13, d3
++**	fmov	x14, d4
++**	fmov	x15, d5
++**	fmov	x16, d6
++**	fmov	x17, d7
++** |
++**	umov	x10, v0.d\[0\]
++**	umov	x11, v1.d\[0\]
++**	umov	x12, v2.d\[0\]
++**	umov	x13, v3.d\[0\]
++**	umov	x14, v4.d\[0\]
++**	umov	x15, v5.d\[0\]
++**	umov	x16, v6.d\[0\]
++**	umov	x17, v7.d\[0\]
++** )
++**	smstart	sm
++**	fmov	d0, x10
++**	fmov	d1, x11
++**	fmov	d2, x12
++**	fmov	d3, x13
++**	fmov	d4, x14
++**	fmov	d5, x15
++**	fmov	d6, x16
++**	fmov	d7, x17
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_d7_vec (int8x8_t d0, int8x8_t d1, int8x8_t d2, int8x8_t d3,
++	     int8x8_t d4, int8x8_t d5, int8x8_t d6, int8x8_t d7)
++{
++  asm ("");
++}
++
++/*
++** test_q0:
++**	...
++**	str	q0, \[sp, #?-16\]!
++**	smstart	sm
++**	ldr	q0, \[sp\], #?16
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_q0 (int8x16_t q0)
++{
++  asm ("");
++}
++
++/*
++** test_q7:
++**	...
++**	stp	q0, q1, \[sp, #?-128\]!
++**	stp	q2, q3, \[sp, #?32\]
++**	stp	q4, q5, \[sp, #?64\]
++**	stp	q6, q7, \[sp, #?96\]
++**	smstart	sm
++**	ldp	q2, q3, \[sp, #?32\]
++**	ldp	q4, q5, \[sp, #?64\]
++**	ldp	q6, q7, \[sp, #?96\]
++**	ldp	q0, q1, \[sp\], #?128
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_q7 (int8x16x4_t q0, int8x16x4_t q4)
++{
++  asm ("");
++}
++
++/*
++** test_z0:
++**	...
++**	addvl	sp, sp, #-1
++**	str	z0, \[sp\]
++**	smstart	sm
++**	ldr	z0, \[sp\]
++**	addvl	sp, sp, #1
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_z0 (svint8_t z0)
++{
++  asm ("");
++}
++
++/*
++** test_z7:
++**	...
++**	addvl	sp, sp, #-8
++**	str	z0, \[sp\]
++**	str	z1, \[sp, #1, mul vl\]
++**	str	z2, \[sp, #2, mul vl\]
++**	str	z3, \[sp, #3, mul vl\]
++**	str	z4, \[sp, #4, mul vl\]
++**	str	z5, \[sp, #5, mul vl\]
++**	str	z6, \[sp, #6, mul vl\]
++**	str	z7, \[sp, #7, mul vl\]
++**	smstart	sm
++**	ldr	z0, \[sp\]
++**	ldr	z1, \[sp, #1, mul vl\]
++**	ldr	z2, \[sp, #2, mul vl\]
++**	ldr	z3, \[sp, #3, mul vl\]
++**	ldr	z4, \[sp, #4, mul vl\]
++**	ldr	z5, \[sp, #5, mul vl\]
++**	ldr	z6, \[sp, #6, mul vl\]
++**	ldr	z7, \[sp, #7, mul vl\]
++**	addvl	sp, sp, #8
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_z7 (svint8x4_t z0, svint8x4_t z4)
++{
++  asm ("");
++}
++
++/*
++** test_p0:
++**	...
++**	addvl	sp, sp, #-1
++**	str	p0, \[sp\]
++**	smstart	sm
++**	ldr	p0, \[sp\]
++**	addvl	sp, sp, #1
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_p0 (svbool_t p0)
++{
++  asm ("");
++}
++
++/*
++** test_p3:
++**	...
++**	addvl	sp, sp, #-1
++**	str	p0, \[sp\]
++**	str	p1, \[sp, #1, mul vl\]
++**	str	p2, \[sp, #2, mul vl\]
++**	str	p3, \[sp, #3, mul vl\]
++**	smstart	sm
++**	ldr	p0, \[sp\]
++**	ldr	p1, \[sp, #1, mul vl\]
++**	ldr	p2, \[sp, #2, mul vl\]
++**	ldr	p3, \[sp, #3, mul vl\]
++**	addvl	sp, sp, #1
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
++{
++  asm ("");
++}
++
++/*
++** test_mixed:
++**	...
++**	addvl	sp, sp, #-3
++**	str	p0, \[sp\]
++**	str	p1, \[sp, #1, mul vl\]
++**	str	p2, \[sp, #2, mul vl\]
++**	str	p3, \[sp, #3, mul vl\]
++**	str	z3, \[sp, #1, mul vl\]
++**	str	z7, \[sp, #2, mul vl\]
++**	stp	q2, q6, \[sp, #?-32\]!
++**	fmov	w10, s0
++**	fmov	x11, d1
++**	fmov	w12, s4
++**	fmov	x13, d5
++**	smstart	sm
++**	fmov	s0, w10
++**	fmov	d1, x11
++**	fmov	s4, w12
++**	fmov	d5, x13
++**	ldp	q2, q6, \[sp\], #?32
++**	ldr	p0, \[sp\]
++**	ldr	p1, \[sp, #1, mul vl\]
++**	ldr	p2, \[sp, #2, mul vl\]
++**	ldr	p3, \[sp, #3, mul vl\]
++**	ldr	z3, \[sp, #1, mul vl\]
++**	ldr	z7, \[sp, #2, mul vl\]
++**	addvl	sp, sp, #3
++**	smstop	sm
++**	...
++*/
++[[arm::locally_streaming]] void
++test_mixed (float s0, double d1, float32x4_t q2, svfloat32_t z3,
++	    float s4, double d5, float64x2_t q6, svfloat64_t z7,
++	    svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
++{
++  asm ("");
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
+new file mode 100644
+index 000000000..42adeb152
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sme/locally_streaming_4.c
+@@ -0,0 +1,145 @@
++// { dg-options "-O -fomit-frame-pointer" }
++/* { dg-final { check-function-bodies "**" "" } } */
++
++#include <arm_neon.h>
++#include <arm_sve.h>
++
++/*
++** test_d0:
++**	...
++**	smstart	sm
++**	...
++**	fmov	x10, d0
++**	smstop	sm
++**	fmov	d0, x10
++**	...
++**	smstart	sm
++**	...
++**	smstop	sm
++**	...
++*/
++void consume_d0 (double d0);
++
++__arm_locally_streaming void
++test_d0 ()
++{
++  asm ("");
++  consume_d0 (1.0);
++  asm ("");
++}
++
++/*
++** test_d7:
++**	...
++**	fmov	x10, d0
++**	fmov	x11, d1
++**	fmov	x12, d2
++**	fmov	x13, d3
++**	fmov	x14, d4
++**	fmov	x15, d5
++**	fmov	x16, d6
++**	fmov	x17, d7
++**	smstop	sm
++**	fmov	d0, x10
++**	fmov	d1, x11
++**	fmov	d2, x12
++**	fmov	d3, x13
++**	fmov	d4, x14
++**	fmov	d5, x15
++**	fmov	d6, x16
++**	fmov	d7, x17
++**	...
++*/
++void consume_d7 (double d0, double d1, double d2, double d3,
++		 double d4, double d5, double d6, double d7);
++__arm_locally_streaming void
++test_d7 ()
++{
++  asm ("");
++  consume_d7 (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
++  asm ("");
++}
++
++/*
++** test_q7:
++**	...
++**	stp	q0, q1, \[sp, #?-128\]!
++**	stp	q2, q3, \[sp, #?32\]
++**	stp	q4, q5, \[sp, #?64\]
++**	stp	q6, q7, \[sp, #?96\]
++**	smstop	sm
++**	ldp	q2, q3, \[sp, #?32\]
++**	ldp	q4, q5, \[sp, #?64\]
++**	ldp	q6, q7, \[sp, #?96\]
++**	ldp	q0, q1, \[sp\], #?128
++**	...
++*/
++void consume_q7 (int8x16x4_t q0, int8x16x4_t q4);
++
++__arm_locally_streaming void
++test_q7 (int8x16x4_t *ptr)
++{
++  asm ("");
++  consume_q7 (ptr[0], ptr[1]);
++  asm ("");
++}
++
++/*
++** test_z7:
++**	...
++**	addvl	sp, sp, #-8
++**	str	z0, \[sp\]
++**	str	z1, \[sp, #1, mul vl\]
++**	str	z2, \[sp, #2, mul vl\]
++**	str	z3, \[sp, #3, mul vl\]
++**	str	z4, \[sp, #4, mul vl\]
++**	str	z5, \[sp, #5, mul vl\]
++**	str	z6, \[sp, #6, mul vl\]
++**	str	z7, \[sp, #7, mul vl\]
++**	smstop	sm
++**	ldr	z0, \[sp\]
++**	ldr	z1, \[sp, #1, mul vl\]
++**	ldr	z2, \[sp, #2, mul vl\]
++**	ldr	z3, \[sp, #3, mul vl\]
++**	ldr	z4, \[sp, #4, mul vl\]
++**	ldr	z5, \[sp, #5, mul vl\]
++**	ldr	z6, \[sp, #6, mul vl\]
++**	ldr	z7, \[sp, #7, mul vl\]
++**	addvl	sp, sp, #8
++**	...
++*/
++void consume_z7 (svint8x4_t z0, svint8x4_t z4);
++
++__arm_locally_streaming void
++test_z7 (svint8x4_t *ptr1, svint8x4_t *ptr2)
++{
++  asm ("");
++  consume_z7 (*ptr1, *ptr2);
++  asm ("");
++}
++
++/*
++** test_p3:
++**	...
++**	addvl	sp, sp, #-1
++**	str	p0, \[sp\]
++**	str	p1, \[sp, #1, mul vl\]
++**	str	p2, \[sp, #2, mul vl\]
++**	str	p3, \[sp, #3, mul vl\]
++**	smstop	sm
++**	ldr	p0, \[sp\]
++**	ldr	p1, \[sp, #1, mul vl\]
++**	ldr	p2, \[sp, #2, mul vl\]
++**	ldr	p3, \[sp, #3, mul vl\]
++**	addvl	sp, sp, #1
++**	...
++*/
++void consume_p3 (svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3);
++
++__arm_locally_streaming void
++test_p3 (svbool_t *ptr1, svbool_t *ptr2, svbool_t *ptr3, svbool_t *ptr4)
++{
++  asm ("");
++  consume_p3 (*ptr1, *ptr2, *ptr3, *ptr4);
++  asm ("");
++}
+--
+2.33.0
+
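Editor's note, not part of the patch: the commit message above explains that __arm_locally_streaming lets a function run its body in streaming mode while presenting a normal AArch64 ABI to callers. A minimal usage sketch follows, assuming a GCC built with this SME backport series; the arm_sme.h header, the svcntsw() call, and the -march=armv9-a+sme flag are assumptions taken from the ACLE, not from this patch.

/* Illustrative only.  Compile with e.g. -march=armv9-a+sme (assumed flag).  */
#include <stdint.h>
#include <arm_sme.h>   /* assumed ACLE header; declares svcntsw()  */

/* Callers use the normal AArch64 ABI.  Only the body runs with
   PSTATE.SM = 1, so the compiler emits the smstart sm/smstop sm pair
   and the extra Z/P register saves shown in the tests above.  */
__arm_locally_streaming uint64_t
streaming_words (void)
{
  return svcntsw ();   /* streaming vector length in 32-bit words  */
}

From the caller's side, streaming_words () is an ordinary function call; the mode switch is entirely an implementation detail of the callee, which is exactly the property the prologue/epilogue changes in this patch implement.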