summaryrefslogtreecommitdiff
path: root/0312-Add-late-slp-vectorization-pass-with-additional-chec.patch
diff options
context:
space:
mode:
Diffstat (limited to '0312-Add-late-slp-vectorization-pass-with-additional-chec.patch')
-rw-r--r--0312-Add-late-slp-vectorization-pass-with-additional-chec.patch320
1 files changed, 320 insertions, 0 deletions
diff --git a/0312-Add-late-slp-vectorization-pass-with-additional-chec.patch b/0312-Add-late-slp-vectorization-pass-with-additional-chec.patch
new file mode 100644
index 0000000..19e519f
--- /dev/null
+++ b/0312-Add-late-slp-vectorization-pass-with-additional-chec.patch
@@ -0,0 +1,320 @@
+From 9df4a0bd76299734ae47f2f4e236b10f6c156994 Mon Sep 17 00:00:00 2001
+From: d84370931 <dementiev.daniil@h-partners.com>
+Date: Thu, 14 Nov 2024 17:08:40 +0800
+Subject: [PATCH 3/8] Add late slp vectorization pass with additional checks.
+
+Add expansion of data reference offset using affine trees to check
+if data references may alias.
+
+Add check if a group of interleaving data references is smaller than
+max vector register size.
+
+Add operands swap for commutative operations.
+Swapping operands is necessary for better vector construction.
+For example for operations
+ _1 = a * b;
+ _2 = b * c;
+Constructing the vectors (a, c) * (b, b) is more profitable
+than (a, b) * (b, c).
+
+Add tests and special param flags for each check:
+ --param=addr-expand-for-alias-check={0,1}
+ --param=vect-swap-operands={0,1}
+ --param=vect-register-size-check={0,1}
+
+Add enabling flag for late slp pass:
+ -ftree-slp-late
+---
+ gcc/common.opt | 4 ++
+ gcc/params.opt | 12 ++++++
+ gcc/passes.def | 4 ++
+ gcc/testsuite/gcc.dg/vect/vect-alias-expand.c | 12 ++++++
+ gcc/testsuite/gcc.dg/vect/vect-op-swap.c | 10 +++++
+ gcc/testsuite/gcc.dg/vect/vect-regsize.c | 18 +++++++++
+ gcc/timevar.def | 1 +
+ gcc/tree-data-ref.cc | 12 ++++++
+ gcc/tree-pass.h | 1 +
+ gcc/tree-vect-data-refs.cc | 15 +++++++
+ gcc/tree-vect-slp.cc | 28 +++++++++++++
+ gcc/tree-vectorizer.cc | 39 +++++++++++++++++++
+ 12 files changed, 156 insertions(+)
+ create mode 100644 gcc/testsuite/gcc.dg/vect/vect-alias-expand.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/vect-op-swap.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/vect-regsize.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 78cfc333a..c3c64ceaf 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -3268,6 +3268,10 @@ ftree-slp-transpose-vectorize
+ Common Var(flag_tree_slp_transpose_vectorize) Optimization Init(0)
+ Enable basic block vectorization (SLP) for transposed stores and loads on trees.
+
++ftree-slp-late
++Common Var(flag_slp_late) Init(0) Optimization
++Enable additional SLP vectorization pass after reassociation.
++
+ fvect-cost-model=
+ Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization
+ -fvect-cost-model=[unlimited|dynamic|cheap|very-cheap] Specifies the cost model for vectorization.
+diff --git a/gcc/params.opt b/gcc/params.opt
+index 3ddfaf5b2..bb4dc1825 100644
+--- a/gcc/params.opt
++++ b/gcc/params.opt
+@@ -1213,6 +1213,18 @@ The maximum factor which the loop vectorizer applies to the cost of statements i
+ Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 1) Param Optimization
+ Enable loop vectorization of floating point inductions.
+
++-param=vect-swap-operands=
++Common Joined UInteger Var(param_vect_swap_operands) Init(0) IntegerRange(0, 1) Param Optimization
++Enable swapping operands for commutative operations in vectorization analysis.
++
++-param=addr-expand-for-alias-check=
++Common Joined UInteger Var(param_addr_expand_for_alias_check) Init(0) IntegerRange(0, 1) Param Optimization
++Enable data reference address expansion for alias check.
++
++-param=vect-register-size-check=
++Common Joined UInteger Var(param_vect_register_size_check) Init(0) IntegerRange(0, 1) Param Optimization
++Enable checking if a group of interleaving data references may not fit in vector register.
++
+ -param=vrp1-mode=
+ Common Joined Var(param_vrp1_mode) Enum(vrp_mode) Init(VRP_MODE_VRP) Param Optimization
+ --param=vrp1-mode=[vrp|ranger] Specifies the mode VRP1 should operate in.
+diff --git a/gcc/passes.def b/gcc/passes.def
+index e945af96a..529cc5093 100644
+--- a/gcc/passes.def
++++ b/gcc/passes.def
+@@ -337,6 +337,10 @@ along with GCC; see the file COPYING3. If not see
+ NEXT_PASS (pass_lower_switch);
+ NEXT_PASS (pass_cse_reciprocals);
+ NEXT_PASS (pass_reassoc, false /* early_p */);
++ NEXT_PASS (pass_slp_vectorize_late);
++ PUSH_INSERT_PASSES_WITHIN (pass_slp_vectorize_late)
++ NEXT_PASS (pass_slp_vectorize);
++ POP_INSERT_PASSES ()
+ NEXT_PASS (pass_strength_reduction);
+ NEXT_PASS (pass_split_paths);
+ NEXT_PASS (pass_tracer);
+diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-expand.c b/gcc/testsuite/gcc.dg/vect/vect-alias-expand.c
+new file mode 100644
+index 000000000..a68f4baf8
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/vect-alias-expand.c
+@@ -0,0 +1,12 @@
++/* { dg-do compile } */
++/* { dg-options "-O3 -ftree-vectorize --param=addr-expand-for-alias-check=1 -fdump-tree-slp-details" } */
++
++extern float arr[2][2];
++
++void foo (int i, int j, float a, float b)
++{
++ arr[i][j] *= a;
++ arr[i][j+1] *= b;
++}
++
++/* { dg-final { scan-tree-dump "Basic block will be vectorized using SLP" "slp2" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/vect-op-swap.c b/gcc/testsuite/gcc.dg/vect/vect-op-swap.c
+new file mode 100644
+index 000000000..4872dc414
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/vect-op-swap.c
+@@ -0,0 +1,10 @@
++/* { dg-do compile } */
++/* { dg-options "-O3 -ftree-vectorize --param=vect-swap-operands=1 -fdump-tree-slp-details" } */
++
++void foo (float *res, float a, float b, float c)
++{
++ res[0] = a * b;
++ res[1] = b * c;
++}
++
++/* { dg-final { scan-tree-dump "Swapped operands for" "slp2" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/vect-regsize.c b/gcc/testsuite/gcc.dg/vect/vect-regsize.c
+new file mode 100644
+index 000000000..bcd81e6df
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/vect-regsize.c
+@@ -0,0 +1,18 @@
++/* { dg-do compile } */
++/* { dg-options "-O3 -ftree-vectorize --param=vect-register-size-check=1 -fdump-tree-slp-details" } */
++
++extern float arr[256][256][1024];
++
++void foo (int i, int j, float a, float b)
++{
++ arr[i][j][0] += a;
++ arr[i][j][1] += b;
++ arr[i][j+1][0] += a;
++ arr[i][j+1][1] += b;
++ arr[i+1][j][0] += a;
++ arr[i+1][j][1] += b;
++ arr[i+1][j+1][0] += a;
++ arr[i+1][j+1][1] += b;
++}
++
++/* { dg-final { scan-tree-dump "Basic block will be vectorized using SLP" "slp2" } } */
+diff --git a/gcc/timevar.def b/gcc/timevar.def
+index fc2b1e1e7..7560e930a 100644
+--- a/gcc/timevar.def
++++ b/gcc/timevar.def
+@@ -205,6 +205,7 @@ DEFTIMEVAR (TV_SCALAR_CLEANUP , "scalar cleanup")
+ DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops")
+ DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization")
+ DEFTIMEVAR (TV_TREE_SLP_VECTORIZATION, "tree slp vectorization")
++DEFTIMEVAR (TV_TREE_LATE_SLP , "late slp vectorization")
+ DEFTIMEVAR (TV_GRAPHITE , "Graphite")
+ DEFTIMEVAR (TV_GRAPHITE_TRANSFORMS , "Graphite loop transforms")
+ DEFTIMEVAR (TV_GRAPHITE_DATA_DEPS , "Graphite data dep analysis")
+diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
+index a05073c51..5eb4ac102 100644
+--- a/gcc/tree-data-ref.cc
++++ b/gcc/tree-data-ref.cc
+@@ -3021,6 +3021,18 @@ dr_may_alias_p (const struct data_reference *a, const struct data_reference *b,
+ get_inner_reference_aff (DR_REF (b), &off2, &size2);
+ aff_combination_scale (&off1, -1);
+ aff_combination_add (&off2, &off1);
++
++ if (param_addr_expand_for_alias_check)
++ {
++ using tree_expand_map_t = hash_map<tree, name_expansion *>;
++ /* Cache used by aff_combination_expand. */
++ tree_expand_map_t *cache = NULL;
++
++ if (off2.n)
++ aff_combination_expand (&off2, &cache);
++ free_affine_expand_cache (&cache);
++ }
++
+ if (aff_comb_cannot_overlap_p (&off2, size1, size2))
+ return false;
+ }
+diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
+index 18b0f8022..2ed79f353 100644
+--- a/gcc/tree-pass.h
++++ b/gcc/tree-pass.h
+@@ -390,6 +390,7 @@ extern gimple_opt_pass *make_pass_slp_vectorize (gcc::context *ctxt);
+ extern gimple_opt_pass *make_pass_complete_unroll (gcc::context *ctxt);
+ extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt);
+ extern gimple_opt_pass *make_pass_pre_slp_scalar_cleanup (gcc::context *ctxt);
++extern gimple_opt_pass *make_pass_slp_vectorize_late (gcc::context *ctxt);
+ extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt);
+ extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt);
+ extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt);
+diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
+index aae7f62f3..ee58c8f6c 100644
+--- a/gcc/tree-vect-data-refs.cc
++++ b/gcc/tree-vect-data-refs.cc
+@@ -3234,6 +3234,21 @@ vect_analyze_data_ref_accesses (vec_info *vinfo,
+ != type_size_a))
+ break;
+
++ if (param_vect_register_size_check)
++ {
++ tree scalar_type = TREE_TYPE (DR_REF (dra));
++ tree vec_type = get_related_vectype_for_scalar_type (
++ vinfo->vector_mode, scalar_type);
++ poly_uint64 vec_size = TYPE_VECTOR_SUBPARTS (vec_type);
++
++ /* If we have a large interleaving group (especially a group
++ of loads with gaps) that does not fit in vector register,
++ we should split this group to chunks we support. */
++ if (maybe_ge (((unsigned HOST_WIDE_INT)init_b - init_prev)
++ / type_size_a, vec_size))
++ break;
++ }
++
+ /* If the step (if not zero or non-constant) is smaller than the
+ difference between data-refs' inits this splits groups into
+ suitable sizes. */
+diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
+index fbd638333..79026fb5b 100644
+--- a/gcc/tree-vect-slp.cc
++++ b/gcc/tree-vect-slp.cc
+@@ -687,6 +687,34 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
+ if (first)
+ return 0;
+
++ /* If different statements in the group of commutative operations
++ have the same arguments but in different places, swap them to
++ group the same operands in one vector.
++
++ Check if swapping is enabled, operation is commutative and has
++ two operands of the same type.
++ If one of the operands in the current statement matches the operand
++ in the other position of the first statement in the group, we
++ swap the operands in the current statement. */
++ if (param_vect_swap_operands && commutative_op == 0 && !first
++ && is_a <bb_vec_info> (vinfo) && number_of_oprnds == 2
++ && vect_def_types_match (dts[0], dts[1]))
++ {
++ slp_oprnd_info oprnd_info0 = (*oprnds_info)[0];
++ slp_oprnd_info oprnd_info1 = (*oprnds_info)[1];
++ if (oprnd_info1->ops[stmt_num] == oprnd_info0->ops[0]
++ || oprnd_info0->ops[stmt_num] == oprnd_info1->ops[0])
++ {
++ std::swap (oprnd_info0->def_stmts[stmt_num],
++ oprnd_info1->def_stmts[stmt_num]);
++ std::swap (oprnd_info0->ops[stmt_num],
++ oprnd_info1->ops[stmt_num]);
++ if (dump_enabled_p ())
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "Swapped operands for %G", stmt_info->stmt);
++ }
++ }
++
+ /* Now match the operand definition types to that of the first stmt. */
+ for (i = 0; i < number_of_oprnds;)
+ {
+diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
+index a63fa3912..c363ce490 100644
+--- a/gcc/tree-vectorizer.cc
++++ b/gcc/tree-vectorizer.cc
+@@ -1524,6 +1524,45 @@ make_pass_slp_vectorize (gcc::context *ctxt)
+ return new pass_slp_vectorize (ctxt);
+ }
+
++/* The late SLP vectorization pass. */
++
++namespace {
++
++const pass_data pass_data_slp_vectorize_late =
++{
++ GIMPLE_PASS, /* type. */
++ "slp_late", /* name. */
++ OPTGROUP_NONE, /* optinfo_flags. */
++ TV_TREE_LATE_SLP, /* tv_id. */
++ PROP_cfg, /* properties_required. */
++ 0, /* properties_provided. */
++ 0, /* properties_destroyed. */
++ 0, /* todo_flags_start. */
++ 0, /* todo_flags_finish. */
++};
++
++class pass_slp_vectorize_late : public gimple_opt_pass
++{
++public:
++ pass_slp_vectorize_late (gcc::context *ctxt)
++ : gimple_opt_pass (pass_data_slp_vectorize_late, ctxt)
++ {}
++
++ /* opt_pass methods: */
++ virtual bool gate (function *)
++ {
++ return flag_slp_late != 0;
++ }
++
++}; // class pass_slp_vectorize_late
++
++} // anon namespace
++
++gimple_opt_pass *
++make_pass_slp_vectorize_late (gcc::context *ctxt)
++{
++ return new pass_slp_vectorize_late (ctxt);
++}
+
+ /* Increase alignment of global arrays to improve vectorization potential.
+ TODO:
+--
+2.33.0
+