summary | refs | log | tree | commit | diff
path: root/0148-Backport-SME-Allow-prologues-and-epilogues-to-be-ins.patch
blob: 13dc0e3ddeca9ed9fd8632d85e5654baf4bae3f7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
From 417d51e1ecf41b3ba3ddf24eaf1e07db5c1ded9e Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Tue, 5 Dec 2023 09:28:46 +0000
Subject: [PATCH 049/157] [Backport][SME] Allow prologues and epilogues to be
 inserted later

Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=e9d2ae6b9816e61a6148040149c63faa83f54702

Arm's SME adds a new processor mode called streaming mode.
This mode enables some new (matrix-oriented) instructions and
disables several existing groups of instructions, such as most
Advanced SIMD vector instructions and a much smaller set of SVE
instructions.  It can also change the current vector length.

There are instructions to switch in and out of streaming mode.
However, their effect on the ISA and vector length can't be represented
directly in RTL, so they need to be emitted late in the pass pipeline,
close to md_reorg.

It's sometimes the responsibility of the prologue and epilogue to
switch modes, which means we need to emit the prologue and epilogue
sequences late as well.  (This loses shrink-wrapping and scheduling
opportunities, but that's a price worth paying.)

This patch therefore adds a target hook for forcing prologue
and epilogue insertion to happen later in the pipeline.

gcc/
	* target.def (use_late_prologue_epilogue): New hook.
	* doc/tm.texi.in: Add TARGET_USE_LATE_PROLOGUE_EPILOGUE.
	* doc/tm.texi: Regenerate.
	* passes.def (pass_late_thread_prologue_and_epilogue): New pass.
	* tree-pass.h (make_pass_late_thread_prologue_and_epilogue): Declare.
	* function.cc (pass_thread_prologue_and_epilogue::gate): New function.
	(pass_data_late_thread_prologue_and_epilogue): New pass variable.
	(pass_late_thread_prologue_and_epilogue): New pass class.
	(make_pass_late_thread_prologue_and_epilogue): New function.
---
 gcc/doc/tm.texi    | 19 ++++++++++++++++++
 gcc/doc/tm.texi.in |  2 ++
 gcc/function.cc    | 50 ++++++++++++++++++++++++++++++++++++++++++++++
 gcc/passes.def     |  3 +++
 gcc/target.def     | 21 +++++++++++++++++++
 gcc/tree-pass.h    |  2 ++
 6 files changed, 97 insertions(+)

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 5f0972356..d930d233d 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -11684,6 +11684,25 @@ of the if-block in the @code{struct ce_if_block} structure that is pointed
 to by @var{ce_info}.
 @end defmac
 
+@deftypefn {Target Hook} bool TARGET_USE_LATE_PROLOGUE_EPILOGUE ()
+Return true if the current function's prologue and epilogue should
+be emitted late in the pass pipeline, instead of at the usual point.
+
+Normally, the prologue and epilogue sequences are introduced soon after
+register allocation is complete.  The advantage of this approach is that
+it allows the prologue and epilogue instructions to be optimized and
+scheduled with other code in the function.  However, some targets
+require the prologue and epilogue to be the first and last sequences
+executed by the function, with no variation allowed.  This hook should
+return true on such targets.
+
+The default implementation returns false, which is correct for most
+targets.  The hook should only return true if there is a specific
+target limitation that cannot be described in RTL.  For example,
+the hook might return true if the prologue and epilogue need to switch
+between instruction sets.
+@end deftypefn
+
 @deftypefn {Target Hook} void TARGET_MACHINE_DEPENDENT_REORG (void)
 If non-null, this hook performs a target-specific pass over the
 instruction stream.  The compiler will run it at all optimization levels,
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index fcab21744..19eabec48 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7708,6 +7708,8 @@ of the if-block in the @code{struct ce_if_block} structure that is pointed
 to by @var{ce_info}.
 @end defmac
 
+@hook TARGET_USE_LATE_PROLOGUE_EPILOGUE
+
 @hook TARGET_MACHINE_DEPENDENT_REORG
 
 @hook TARGET_INIT_BUILTINS
diff --git a/gcc/function.cc b/gcc/function.cc
index fc8eb5812..7c90b5f23 100644
--- a/gcc/function.cc
+++ b/gcc/function.cc
@@ -84,6 +84,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "function-abi.h"
 #include "value-range.h"
 #include "gimple-range.h"
+#include "insn-attr.h"
 
 /* So we can assign to cfun in this file.  */
 #undef cfun
@@ -6620,6 +6621,11 @@ public:
   {}
 
   /* opt_pass methods: */
+  bool gate (function *) final override
+    {
+      return !targetm.use_late_prologue_epilogue ();
+    }
+
   unsigned int execute (function * fun) final override
     {
       rest_of_handle_thread_prologue_and_epilogue (fun);
@@ -6628,6 +6634,44 @@ public:
 
 }; // class pass_thread_prologue_and_epilogue
 
+const pass_data pass_data_late_thread_prologue_and_epilogue =
+{
+  RTL_PASS, /* type */
+  "late_pro_and_epilogue", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_THREAD_PROLOGUE_AND_EPILOGUE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  ( TODO_df_verify | TODO_df_finish ), /* todo_flags_finish */
+};
+
+class pass_late_thread_prologue_and_epilogue : public rtl_opt_pass
+{
+public:
+  pass_late_thread_prologue_and_epilogue (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_late_thread_prologue_and_epilogue, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  bool gate (function *) final override
+    {
+      return targetm.use_late_prologue_epilogue ();
+    }
+
+  unsigned int execute (function *fn) final override
+    {
+      /* It's not currently possible to have both delay slots and
+	 late prologue/epilogue, since the latter has to run before
+	 the former, and the former won't honor whatever restrictions
+	 the latter is trying to enforce.  */
+      gcc_assert (!DELAY_SLOTS);
+      rest_of_handle_thread_prologue_and_epilogue (fn);
+      return 0;
+    }
+}; // class pass_late_thread_prologue_and_epilogue
+
 } // anon namespace
 
 rtl_opt_pass *
@@ -6636,6 +6680,12 @@ make_pass_thread_prologue_and_epilogue (gcc::context *ctxt)
   return new pass_thread_prologue_and_epilogue (ctxt);
 }
 
+rtl_opt_pass *
+make_pass_late_thread_prologue_and_epilogue (gcc::context *ctxt)
+{
+  return new pass_late_thread_prologue_and_epilogue (ctxt);
+}
+
 namespace {
 
 const pass_data pass_data_zero_call_used_regs =
diff --git a/gcc/passes.def b/gcc/passes.def
index cdc600298..8797f166f 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -523,6 +523,9 @@ along with GCC; see the file COPYING3.  If not see
 	      NEXT_PASS (pass_stack_regs_run);
 	  POP_INSERT_PASSES ()
       POP_INSERT_PASSES ()
+      NEXT_PASS (pass_late_thread_prologue_and_epilogue);
+      /* No target-independent code motion is allowed beyond this point,
+         excepting the legacy delayed-branch pass.  */
       NEXT_PASS (pass_late_compilation);
       PUSH_INSERT_PASSES_WITHIN (pass_late_compilation)
 	  NEXT_PASS (pass_zero_call_used_regs);
diff --git a/gcc/target.def b/gcc/target.def
index 4d77c1523..fd4899612 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4120,6 +4120,27 @@ returns @code{VOIDmode}.",
  machine_mode, (machine_mode m1, machine_mode m2),
  default_cc_modes_compatible)
 
+DEFHOOK
+(use_late_prologue_epilogue,
+ "Return true if the current function's prologue and epilogue should\n\
+be emitted late in the pass pipeline, instead of at the usual point.\n\
+\n\
+Normally, the prologue and epilogue sequences are introduced soon after\n\
+register allocation is complete.  The advantage of this approach is that\n\
+it allows the prologue and epilogue instructions to be optimized and\n\
+scheduled with other code in the function.  However, some targets\n\
+require the prologue and epilogue to be the first and last sequences\n\
+executed by the function, with no variation allowed.  This hook should\n\
+return true on such targets.\n\
+\n\
+The default implementation returns false, which is correct for most\n\
+targets.  The hook should only return true if there is a specific\n\
+target limitation that cannot be described in RTL.  For example,\n\
+the hook might return true if the prologue and epilogue need to switch\n\
+between instruction sets.",
+ bool, (),
+ hook_bool_void_false)
+
 /* Do machine-dependent code transformations.  Called just before
      delayed-branch scheduling.  */
 DEFHOOK
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 34e60bc38..1c983ef71 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -612,6 +612,8 @@ extern rtl_opt_pass *make_pass_gcse2 (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_split_after_reload (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context
 							     *ctxt);
+extern rtl_opt_pass *make_pass_late_thread_prologue_and_epilogue (gcc::context
+								  *ctxt);
 extern rtl_opt_pass *make_pass_zero_call_used_regs (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_split_complex_instructions (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_stack_adjustments (gcc::context *ctxt);
-- 
2.33.0