summaryrefslogtreecommitdiff
path: root/0312-Add-late-slp-vectorization-pass-with-additional-chec.patch
blob: 19e519f614622d61953da3eb4653d979248cdafd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
From 9df4a0bd76299734ae47f2f4e236b10f6c156994 Mon Sep 17 00:00:00 2001
From: d84370931 <dementiev.daniil@h-partners.com>
Date: Thu, 14 Nov 2024 17:08:40 +0800
Subject: [PATCH 3/8] Add late slp vectorization pass with additional checks.

Add expansion of data reference offset using affine trees to check
if data references may alias.

Add a check whether a group of interleaving data references is smaller
than the max vector register size.

Add operands swap for commutative operations.
Swapping operands is necessary for better vector construction.
For example, for the operations
  _1 = a * b;
  _2 = b * c;
constructing the vectors (a, c) * (b, b) is more profitable
than (a, b) * (b, c).

Add tests and special param flags for each check:
  --param=addr-expand-for-alias-check={0,1}
  --param=vect-swap-operands={0,1}
  --param=vect-register-size-check={0,1}

Add enabling flag for late slp pass:
  -ftree-slp-late
---
 gcc/common.opt                                |  4 ++
 gcc/params.opt                                | 12 ++++++
 gcc/passes.def                                |  4 ++
 gcc/testsuite/gcc.dg/vect/vect-alias-expand.c | 12 ++++++
 gcc/testsuite/gcc.dg/vect/vect-op-swap.c      | 10 +++++
 gcc/testsuite/gcc.dg/vect/vect-regsize.c      | 18 +++++++++
 gcc/timevar.def                               |  1 +
 gcc/tree-data-ref.cc                          | 12 ++++++
 gcc/tree-pass.h                               |  1 +
 gcc/tree-vect-data-refs.cc                    | 15 +++++++
 gcc/tree-vect-slp.cc                          | 28 +++++++++++++
 gcc/tree-vectorizer.cc                        | 39 +++++++++++++++++++
 12 files changed, 156 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-alias-expand.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-op-swap.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-regsize.c

diff --git a/gcc/common.opt b/gcc/common.opt
index 78cfc333a..c3c64ceaf 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3268,6 +3268,10 @@ ftree-slp-transpose-vectorize
 Common Var(flag_tree_slp_transpose_vectorize) Optimization Init(0)
 Enable basic block vectorization (SLP) for transposed stores and loads on trees.
 
+ftree-slp-late
+Common Var(flag_slp_late) Init(0) Optimization
+Enable additional SLP vectorization pass after reassociation.
+
 fvect-cost-model=
 Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization
 -fvect-cost-model=[unlimited|dynamic|cheap|very-cheap]	Specifies the cost model for vectorization.
diff --git a/gcc/params.opt b/gcc/params.opt
index 3ddfaf5b2..bb4dc1825 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1213,6 +1213,18 @@ The maximum factor which the loop vectorizer applies to the cost of statements i
 Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 1) Param Optimization
 Enable loop vectorization of floating point inductions.
 
+-param=vect-swap-operands=
+Common Joined UInteger Var(param_vect_swap_operands) Init(0) IntegerRange(0, 1) Param Optimization
+Enable swapping operands for commutative operations in vectorization analysis.
+
+-param=addr-expand-for-alias-check=
+Common Joined UInteger Var(param_addr_expand_for_alias_check) Init(0) IntegerRange(0, 1) Param Optimization
+Enable data reference address expansion for alias check.
+
+-param=vect-register-size-check=
+Common Joined UInteger Var(param_vect_register_size_check) Init(0) IntegerRange(0, 1) Param Optimization
+Enable checking if a group of interleaving data references may not fit in vector register.
+
 -param=vrp1-mode=
 Common Joined Var(param_vrp1_mode) Enum(vrp_mode) Init(VRP_MODE_VRP) Param Optimization
 --param=vrp1-mode=[vrp|ranger] Specifies the mode VRP1 should operate in.
diff --git a/gcc/passes.def b/gcc/passes.def
index e945af96a..529cc5093 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -337,6 +337,10 @@ along with GCC; see the file COPYING3.  If not see
       NEXT_PASS (pass_lower_switch);
       NEXT_PASS (pass_cse_reciprocals);
       NEXT_PASS (pass_reassoc, false /* early_p */);
+      NEXT_PASS (pass_slp_vectorize_late);
+      PUSH_INSERT_PASSES_WITHIN (pass_slp_vectorize_late)
+	  NEXT_PASS (pass_slp_vectorize);
+      POP_INSERT_PASSES ()
       NEXT_PASS (pass_strength_reduction);
       NEXT_PASS (pass_split_paths);
       NEXT_PASS (pass_tracer);
diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-expand.c b/gcc/testsuite/gcc.dg/vect/vect-alias-expand.c
new file mode 100644
index 000000000..a68f4baf8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-alias-expand.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize --param=addr-expand-for-alias-check=1 -fdump-tree-slp-details" } */
+
+extern float arr[2][2];
+
+void foo (int i, int j, float a, float b)
+{
+  arr[i][j] *= a;
+  arr[i][j+1] *= b;
+}
+
+/* { dg-final { scan-tree-dump "Basic block will be vectorized using SLP" "slp2" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-op-swap.c b/gcc/testsuite/gcc.dg/vect/vect-op-swap.c
new file mode 100644
index 000000000..4872dc414
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-op-swap.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize --param=vect-swap-operands=1 -fdump-tree-slp-details" } */
+
+void foo (float *res, float a, float b, float c)
+{
+  res[0] = a * b;
+  res[1] = b * c;
+}
+
+/* { dg-final { scan-tree-dump "Swapped operands for" "slp2" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-regsize.c b/gcc/testsuite/gcc.dg/vect/vect-regsize.c
new file mode 100644
index 000000000..bcd81e6df
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-regsize.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize --param=vect-register-size-check=1 -fdump-tree-slp-details" } */
+
+extern float arr[256][256][1024];
+
+void foo (int i, int j, float a, float b)
+{
+  arr[i][j][0] += a;
+  arr[i][j][1] += b;
+  arr[i][j+1][0] += a;
+  arr[i][j+1][1] += b;
+  arr[i+1][j][0] += a;
+  arr[i+1][j][1] += b;
+  arr[i+1][j+1][0] += a;
+  arr[i+1][j+1][1] += b;
+}
+
+/* { dg-final { scan-tree-dump "Basic block will be vectorized using SLP" "slp2" } } */
diff --git a/gcc/timevar.def b/gcc/timevar.def
index fc2b1e1e7..7560e930a 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -205,6 +205,7 @@ DEFTIMEVAR (TV_SCALAR_CLEANUP        , "scalar cleanup")
 DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
 DEFTIMEVAR (TV_TREE_VECTORIZATION    , "tree vectorization")
 DEFTIMEVAR (TV_TREE_SLP_VECTORIZATION, "tree slp vectorization")
+DEFTIMEVAR (TV_TREE_LATE_SLP         , "late slp vectorization")
 DEFTIMEVAR (TV_GRAPHITE              , "Graphite")
 DEFTIMEVAR (TV_GRAPHITE_TRANSFORMS   , "Graphite loop transforms")
 DEFTIMEVAR (TV_GRAPHITE_DATA_DEPS    , "Graphite data dep analysis")
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index a05073c51..5eb4ac102 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -3021,6 +3021,18 @@ dr_may_alias_p (const struct data_reference *a, const struct data_reference *b,
       get_inner_reference_aff (DR_REF (b), &off2, &size2);
       aff_combination_scale (&off1, -1);
       aff_combination_add (&off2, &off1);
+
+      if (param_addr_expand_for_alias_check)
+	{
+	  using tree_expand_map_t = hash_map<tree, name_expansion *>;
+	  /* Cache used by aff_combination_expand.  */
+	  tree_expand_map_t *cache = NULL;
+
+	  if (off2.n)
+	    aff_combination_expand (&off2, &cache);
+	  free_affine_expand_cache (&cache);
+	}
+
       if (aff_comb_cannot_overlap_p (&off2, size1, size2))
 	return false;
     }
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 18b0f8022..2ed79f353 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -390,6 +390,7 @@ extern gimple_opt_pass *make_pass_slp_vectorize (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_complete_unroll (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_pre_slp_scalar_cleanup (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_slp_vectorize_late (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index aae7f62f3..ee58c8f6c 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3234,6 +3234,21 @@ vect_analyze_data_ref_accesses (vec_info *vinfo,
 		      != type_size_a))
 		break;
 
+	      if (param_vect_register_size_check)
+		{
+		  tree scalar_type = TREE_TYPE (DR_REF (dra));
+		  tree vec_type = get_related_vectype_for_scalar_type (
+		      vinfo->vector_mode, scalar_type);
+		  poly_uint64 vec_size = TYPE_VECTOR_SUBPARTS (vec_type);
+
+		  /* If we have a large interleaving group (especially a group
+		     of loads with gaps) that does not fit in vector register,
+		     we should split this group to chunks we support.  */
+		  if (maybe_ge (((unsigned HOST_WIDE_INT)init_b - init_prev)
+				/ type_size_a, vec_size))
+		    break;
+		}
+
 	      /* If the step (if not zero or non-constant) is smaller than the
 		 difference between data-refs' inits this splits groups into
 		 suitable sizes.  */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index fbd638333..79026fb5b 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -687,6 +687,34 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
   if (first)
     return 0;
 
+  /* If different statements in the group of commutative operations
+     have the same arguments but in different places, swap them to
+     group the same operands in one vector.
+
+     Check if swapping is enabled, operation is commutative and has
+     two operands of the same type.
+     If one of the operands in current statement match the operand
+     on another place of the first statement in the group we
+     swap operands in current statement.  */
+  if (param_vect_swap_operands && commutative_op == 0 && !first
+      && is_a <bb_vec_info> (vinfo) && number_of_oprnds == 2
+      && vect_def_types_match (dts[0], dts[1]))
+    {
+      slp_oprnd_info oprnd_info0 = (*oprnds_info)[0];
+      slp_oprnd_info oprnd_info1 = (*oprnds_info)[1];
+      if (oprnd_info1->ops[stmt_num] == oprnd_info0->ops[0]
+	  || oprnd_info0->ops[stmt_num] == oprnd_info1->ops[0])
+      {
+	std::swap (oprnd_info0->def_stmts[stmt_num],
+		   oprnd_info1->def_stmts[stmt_num]);
+	std::swap (oprnd_info0->ops[stmt_num],
+		   oprnd_info1->ops[stmt_num]);
+	if (dump_enabled_p ())
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "Swapped operands for %G", stmt_info->stmt);
+      }
+    }
+
   /* Now match the operand definition types to that of the first stmt.  */
   for (i = 0; i < number_of_oprnds;)
     {
diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
index a63fa3912..c363ce490 100644
--- a/gcc/tree-vectorizer.cc
+++ b/gcc/tree-vectorizer.cc
@@ -1524,6 +1524,45 @@ make_pass_slp_vectorize (gcc::context *ctxt)
   return new pass_slp_vectorize (ctxt);
 }
 
+/*  The late SLP vectorization pass.  */
+
+namespace {
+
+const pass_data pass_data_slp_vectorize_late =
+{
+  GIMPLE_PASS, /* type.  */
+  "slp_late", /* name.  */
+  OPTGROUP_NONE, /* optinfo_flags.  */
+  TV_TREE_LATE_SLP, /* tv_id.  */
+  PROP_cfg, /* properties_required.  */
+  0, /* properties_provided.  */
+  0, /* properties_destroyed.  */
+  0, /* todo_flags_start.  */
+  0, /* todo_flags_finish.  */
+};
+
+class pass_slp_vectorize_late : public gimple_opt_pass
+{
+public:
+  pass_slp_vectorize_late (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_slp_vectorize_late, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+  {
+    return flag_slp_late != 0;
+  }
+
+}; // class pass_slp_vectorize_late
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_slp_vectorize_late (gcc::context *ctxt)
+{
+  return new pass_slp_vectorize_late (ctxt);
+}
 
 /* Increase alignment of global arrays to improve vectorization potential.
    TODO:
-- 
2.33.0