summary | refs | log | tree | commit | diff
path: root/0148-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch
diff options
context:
space:
mode:
Diffstat (limited to '0148-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch')
-rw-r--r-- 0148-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch 233
1 file changed, 233 insertions, 0 deletions
diff --git a/0148-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch b/0148-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch
new file mode 100644
index 0000000..13dc0e3
--- /dev/null
+++ b/0148-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch
@@ -0,0 +1,233 @@
+From 417d51e1ecf41b3ba3ddf24eaf1e07db5c1ded9e Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 5 Dec 2023 09:28:46 +0000
+Subject: [PATCH 049/157] [Backport][SME] Allow prologues and epilogues to be
+ inserted later
+
+Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=e9d2ae6b9816e61a6148040149c63faa83f54702
+
+Arm's SME adds a new processor mode called streaming mode.
+This mode enables some new (matrix-oriented) instructions and
+disables several existing groups of instructions, such as most
+Advanced SIMD vector instructions and a much smaller set of SVE
+instructions. It can also change the current vector length.
+
+There are instructions to switch in and out of streaming mode.
+However, their effect on the ISA and vector length can't be represented
+directly in RTL, so they need to be emitted late in the pass pipeline,
+close to md_reorg.
+
+It's sometimes the responsibility of the prologue and epilogue to
+switch modes, which means we need to emit the prologue and epilogue
+sequences late as well. (This loses shrink-wrapping and scheduling
+opportunities, but that's a price worth paying.)
+
+This patch therefore adds a target hook for forcing prologue
+and epilogue insertion to happen later in the pipeline.
+
+gcc/
+ * target.def (use_late_prologue_epilogue): New hook.
+ * doc/tm.texi.in: Add TARGET_USE_LATE_PROLOGUE_EPILOGUE.
+ * doc/tm.texi: Regenerate.
+ * passes.def (pass_late_thread_prologue_and_epilogue): New pass.
+ * tree-pass.h (make_pass_late_thread_prologue_and_epilogue): Declare.
+ * function.cc (pass_thread_prologue_and_epilogue::gate): New function.
+ (pass_data_late_thread_prologue_and_epilogue): New pass variable.
+ (pass_late_thread_prologue_and_epilogue): New pass class.
+ (make_pass_late_thread_prologue_and_epilogue): New function.
+---
+ gcc/doc/tm.texi | 19 ++++++++++++++++++
+ gcc/doc/tm.texi.in | 2 ++
+ gcc/function.cc | 50 ++++++++++++++++++++++++++++++++++++++++++++++
+ gcc/passes.def | 3 +++
+ gcc/target.def | 21 +++++++++++++++++++
+ gcc/tree-pass.h | 2 ++
+ 6 files changed, 97 insertions(+)
+
+diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
+index 5f0972356..d930d233d 100644
+--- a/gcc/doc/tm.texi
++++ b/gcc/doc/tm.texi
+@@ -11684,6 +11684,25 @@ of the if-block in the @code{struct ce_if_block} structure that is pointed
+ to by @var{ce_info}.
+ @end defmac
+
++@deftypefn {Target Hook} bool TARGET_USE_LATE_PROLOGUE_EPILOGUE ()
++Return true if the current function's prologue and epilogue should
++be emitted late in the pass pipeline, instead of at the usual point.
++
++Normally, the prologue and epilogue sequences are introduced soon after
++register allocation is complete. The advantage of this approach is that
++it allows the prologue and epilogue instructions to be optimized and
++scheduled with other code in the function. However, some targets
++require the prologue and epilogue to be the first and last sequences
++executed by the function, with no variation allowed. This hook should
++return true on such targets.
++
++The default implementation returns false, which is correct for most
++targets. The hook should only return true if there is a specific
++target limitation that cannot be described in RTL. For example,
++the hook might return true if the prologue and epilogue need to switch
++between instruction sets.
++@end deftypefn
++
+ @deftypefn {Target Hook} void TARGET_MACHINE_DEPENDENT_REORG (void)
+ If non-null, this hook performs a target-specific pass over the
+ instruction stream. The compiler will run it at all optimization levels,
+diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
+index fcab21744..19eabec48 100644
+--- a/gcc/doc/tm.texi.in
++++ b/gcc/doc/tm.texi.in
+@@ -7708,6 +7708,8 @@ of the if-block in the @code{struct ce_if_block} structure that is pointed
+ to by @var{ce_info}.
+ @end defmac
+
++@hook TARGET_USE_LATE_PROLOGUE_EPILOGUE
++
+ @hook TARGET_MACHINE_DEPENDENT_REORG
+
+ @hook TARGET_INIT_BUILTINS
+diff --git a/gcc/function.cc b/gcc/function.cc
+index fc8eb5812..7c90b5f23 100644
+--- a/gcc/function.cc
++++ b/gcc/function.cc
+@@ -84,6 +84,7 @@ along with GCC; see the file COPYING3. If not see
+ #include "function-abi.h"
+ #include "value-range.h"
+ #include "gimple-range.h"
++#include "insn-attr.h"
+
+ /* So we can assign to cfun in this file. */
+ #undef cfun
+@@ -6620,6 +6621,11 @@ public:
+ {}
+
+ /* opt_pass methods: */
++ bool gate (function *) final override
++ {
++ return !targetm.use_late_prologue_epilogue ();
++ }
++
+ unsigned int execute (function * fun) final override
+ {
+ rest_of_handle_thread_prologue_and_epilogue (fun);
+@@ -6628,6 +6634,44 @@ public:
+
+ }; // class pass_thread_prologue_and_epilogue
+
++const pass_data pass_data_late_thread_prologue_and_epilogue =
++{
++ RTL_PASS, /* type */
++ "late_pro_and_epilogue", /* name */
++ OPTGROUP_NONE, /* optinfo_flags */
++ TV_THREAD_PROLOGUE_AND_EPILOGUE, /* tv_id */
++ 0, /* properties_required */
++ 0, /* properties_provided */
++ 0, /* properties_destroyed */
++ 0, /* todo_flags_start */
++ ( TODO_df_verify | TODO_df_finish ), /* todo_flags_finish */
++};
++
++class pass_late_thread_prologue_and_epilogue : public rtl_opt_pass
++{
++public:
++ pass_late_thread_prologue_and_epilogue (gcc::context *ctxt)
++ : rtl_opt_pass (pass_data_late_thread_prologue_and_epilogue, ctxt)
++ {}
++
++ /* opt_pass methods: */
++ bool gate (function *) final override
++ {
++ return targetm.use_late_prologue_epilogue ();
++ }
++
++ unsigned int execute (function *fn) final override
++ {
++ /* It's not currently possible to have both delay slots and
++ late prologue/epilogue, since the latter has to run before
++ the former, and the former won't honor whatever restrictions
++ the latter is trying to enforce. */
++ gcc_assert (!DELAY_SLOTS);
++ rest_of_handle_thread_prologue_and_epilogue (fn);
++ return 0;
++ }
++}; // class pass_late_thread_prologue_and_epilogue
++
+ } // anon namespace
+
+ rtl_opt_pass *
+@@ -6636,6 +6680,12 @@ make_pass_thread_prologue_and_epilogue (gcc::context *ctxt)
+ return new pass_thread_prologue_and_epilogue (ctxt);
+ }
+
++rtl_opt_pass *
++make_pass_late_thread_prologue_and_epilogue (gcc::context *ctxt)
++{
++ return new pass_late_thread_prologue_and_epilogue (ctxt);
++}
++
+ namespace {
+
+ const pass_data pass_data_zero_call_used_regs =
+diff --git a/gcc/passes.def b/gcc/passes.def
+index cdc600298..8797f166f 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -523,6 +523,9 @@ along with GCC; see the file COPYING3. If not see
+ NEXT_PASS (pass_stack_regs_run);
+ POP_INSERT_PASSES ()
+ POP_INSERT_PASSES ()
++ NEXT_PASS (pass_late_thread_prologue_and_epilogue);
++ /* No target-independent code motion is allowed beyond this point,
++ excepting the legacy delayed-branch pass. */
+ NEXT_PASS (pass_late_compilation);
+ PUSH_INSERT_PASSES_WITHIN (pass_late_compilation)
+ NEXT_PASS (pass_zero_call_used_regs);
+diff --git a/gcc/target.def b/gcc/target.def
+index 4d77c1523..fd4899612 100644
+--- a/gcc/target.def
++++ b/gcc/target.def
+@@ -4120,6 +4120,27 @@ returns @code{VOIDmode}.",
+ machine_mode, (machine_mode m1, machine_mode m2),
+ default_cc_modes_compatible)
+
++DEFHOOK
++(use_late_prologue_epilogue,
++ "Return true if the current function's prologue and epilogue should\n\
++be emitted late in the pass pipeline, instead of at the usual point.\n\
++\n\
++Normally, the prologue and epilogue sequences are introduced soon after\n\
++register allocation is complete. The advantage of this approach is that\n\
++it allows the prologue and epilogue instructions to be optimized and\n\
++scheduled with other code in the function. However, some targets\n\
++require the prologue and epilogue to be the first and last sequences\n\
++executed by the function, with no variation allowed. This hook should\n\
++return true on such targets.\n\
++\n\
++The default implementation returns false, which is correct for most\n\
++targets. The hook should only return true if there is a specific\n\
++target limitation that cannot be described in RTL. For example,\n\
++the hook might return true if the prologue and epilogue need to switch\n\
++between instruction sets.",
++ bool, (),
++ hook_bool_void_false)
++
+ /* Do machine-dependent code transformations. Called just before
+ delayed-branch scheduling. */
+ DEFHOOK
+diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
+index 34e60bc38..1c983ef71 100644
+--- a/gcc/tree-pass.h
++++ b/gcc/tree-pass.h
+@@ -612,6 +612,8 @@ extern rtl_opt_pass *make_pass_gcse2 (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_split_after_reload (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context
+ *ctxt);
++extern rtl_opt_pass *make_pass_late_thread_prologue_and_epilogue (gcc::context
++ *ctxt);
+ extern rtl_opt_pass *make_pass_zero_call_used_regs (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_split_complex_instructions (gcc::context *ctxt);
+ extern rtl_opt_pass *make_pass_stack_adjustments (gcc::context *ctxt);
+--
+2.33.0
+