From 9df4a0bd76299734ae47f2f4e236b10f6c156994 Mon Sep 17 00:00:00 2001 From: d84370931 Date: Thu, 14 Nov 2024 17:08:40 +0800 Subject: [PATCH 3/8] Add late slp vectorization pass with additional checks. Add expansion of data reference offset using affine trees to check if data references may alias. Add check if a group of interleaving data references is smaller than max vector register size. Add operands swap for commutative operations. Swapping operands is necessary for better vector construction. For example, for operations _1 = a * b; _2 = b * c; Constructing vectors (a, c) * (b, b) is more profitable than (a, b) * (b, c). Add tests and special param flags for each check: --param=addr-expand-for-alias-check={0,1} --param=vect-swap-operands={0,1} --param=vect-register-size-check={0,1} Add enabling flag for late slp pass: -ftree-slp-late --- gcc/common.opt | 4 ++ gcc/params.opt | 12 ++++++ gcc/passes.def | 4 ++ gcc/testsuite/gcc.dg/vect/vect-alias-expand.c | 12 ++++++ gcc/testsuite/gcc.dg/vect/vect-op-swap.c | 10 +++++ gcc/testsuite/gcc.dg/vect/vect-regsize.c | 18 +++++++++ gcc/timevar.def | 1 + gcc/tree-data-ref.cc | 12 ++++++ gcc/tree-pass.h | 1 + gcc/tree-vect-data-refs.cc | 15 +++++++ gcc/tree-vect-slp.cc | 28 +++++++++++++ gcc/tree-vectorizer.cc | 39 +++++++++++++++++++ 12 files changed, 156 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-alias-expand.c create mode 100644 gcc/testsuite/gcc.dg/vect/vect-op-swap.c create mode 100644 gcc/testsuite/gcc.dg/vect/vect-regsize.c diff --git a/gcc/common.opt b/gcc/common.opt index 78cfc333a..c3c64ceaf 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -3268,6 +3268,10 @@ ftree-slp-transpose-vectorize Common Var(flag_tree_slp_transpose_vectorize) Optimization Init(0) Enable basic block vectorization (SLP) for transposed stores and loads on trees. +ftree-slp-late +Common Var(flag_slp_late) Init(0) Optimization +Enable additional SLP vectorization pass after reassociation. 
+ fvect-cost-model= Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization -fvect-cost-model=[unlimited|dynamic|cheap|very-cheap] Specifies the cost model for vectorization. diff --git a/gcc/params.opt b/gcc/params.opt index 3ddfaf5b2..bb4dc1825 100644 --- a/gcc/params.opt +++ b/gcc/params.opt @@ -1213,6 +1213,18 @@ The maximum factor which the loop vectorizer applies to the cost of statements i Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 1) Param Optimization Enable loop vectorization of floating point inductions. +-param=vect-swap-operands= +Common Joined UInteger Var(param_vect_swap_operands) Init(0) IntegerRange(0, 1) Param Optimization +Enable swapping operands for commutative operations in vectorization analysis. + +-param=addr-expand-for-alias-check= +Common Joined UInteger Var(param_addr_expand_for_alias_check) Init(0) IntegerRange(0, 1) Param Optimization +Enable data reference address expansion for alias check. + +-param=vect-register-size-check= +Common Joined UInteger Var(param_vect_register_size_check) Init(0) IntegerRange(0, 1) Param Optimization +Enable checking if a group of interleaving data references may not fit in vector register. + -param=vrp1-mode= Common Joined Var(param_vrp1_mode) Enum(vrp_mode) Init(VRP_MODE_VRP) Param Optimization --param=vrp1-mode=[vrp|ranger] Specifies the mode VRP1 should operate in. diff --git a/gcc/passes.def b/gcc/passes.def index e945af96a..529cc5093 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -337,6 +337,10 @@ along with GCC; see the file COPYING3. 
If not see NEXT_PASS (pass_lower_switch); NEXT_PASS (pass_cse_reciprocals); NEXT_PASS (pass_reassoc, false /* early_p */); + NEXT_PASS (pass_slp_vectorize_late); + PUSH_INSERT_PASSES_WITHIN (pass_slp_vectorize_late) + NEXT_PASS (pass_slp_vectorize); + POP_INSERT_PASSES () NEXT_PASS (pass_strength_reduction); NEXT_PASS (pass_split_paths); NEXT_PASS (pass_tracer); diff --git a/gcc/testsuite/gcc.dg/vect/vect-alias-expand.c b/gcc/testsuite/gcc.dg/vect/vect-alias-expand.c new file mode 100644 index 000000000..a68f4baf8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-alias-expand.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -ftree-vectorize --param=addr-expand-for-alias-check=1 -fdump-tree-slp-details" } */ + +extern float arr[2][2]; + +void foo (int i, int j, float a, float b) +{ + arr[i][j] *= a; + arr[i][j+1] *= b; +} + +/* { dg-final { scan-tree-dump "Basic block will be vectorized using SLP" "slp2" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-op-swap.c b/gcc/testsuite/gcc.dg/vect/vect-op-swap.c new file mode 100644 index 000000000..4872dc414 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-op-swap.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -ftree-vectorize --param=vect-swap-operands=1 -fdump-tree-slp-details" } */ + +void foo (float *res, float a, float b, float c) +{ + res[0] = a * b; + res[1] = b * c; +} + +/* { dg-final { scan-tree-dump "Swapped operands for" "slp2" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-regsize.c b/gcc/testsuite/gcc.dg/vect/vect-regsize.c new file mode 100644 index 000000000..bcd81e6df --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-regsize.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -ftree-vectorize --param=vect-register-size-check=1 -fdump-tree-slp-details" } */ + +extern float arr[256][256][1024]; + +void foo (int i, int j, float a, float b) +{ + arr[i][j][0] += a; + arr[i][j][1] += b; + arr[i][j+1][0] += a; + arr[i][j+1][1] += b; + arr[i+1][j][0] += 
a; + arr[i+1][j][1] += b; + arr[i+1][j+1][0] += a; + arr[i+1][j+1][1] += b; +} + +/* { dg-final { scan-tree-dump "Basic block will be vectorized using SLP" "slp2" } } */ diff --git a/gcc/timevar.def b/gcc/timevar.def index fc2b1e1e7..7560e930a 100644 --- a/gcc/timevar.def +++ b/gcc/timevar.def @@ -205,6 +205,7 @@ DEFTIMEVAR (TV_SCALAR_CLEANUP , "scalar cleanup") DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops") DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization") DEFTIMEVAR (TV_TREE_SLP_VECTORIZATION, "tree slp vectorization") +DEFTIMEVAR (TV_TREE_LATE_SLP , "late slp vectorization") DEFTIMEVAR (TV_GRAPHITE , "Graphite") DEFTIMEVAR (TV_GRAPHITE_TRANSFORMS , "Graphite loop transforms") DEFTIMEVAR (TV_GRAPHITE_DATA_DEPS , "Graphite data dep analysis") diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc index a05073c51..5eb4ac102 100644 --- a/gcc/tree-data-ref.cc +++ b/gcc/tree-data-ref.cc @@ -3021,6 +3021,18 @@ dr_may_alias_p (const struct data_reference *a, const struct data_reference *b, get_inner_reference_aff (DR_REF (b), &off2, &size2); aff_combination_scale (&off1, -1); aff_combination_add (&off2, &off1); + + if (param_addr_expand_for_alias_check) + { + using tree_expand_map_t = hash_map<tree, name_expansion *>; + /* Cache used by aff_combination_expand. 
*/ + tree_expand_map_t *cache = NULL; + + if (off2.n) + aff_combination_expand (&off2, &cache); + free_affine_expand_cache (&cache); + } + if (aff_comb_cannot_overlap_p (&off2, size1, size2)) return false; } diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 18b0f8022..2ed79f353 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -390,6 +390,7 @@ extern gimple_opt_pass *make_pass_slp_vectorize (gcc::context *ctxt); extern gimple_opt_pass *make_pass_complete_unroll (gcc::context *ctxt); extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt); extern gimple_opt_pass *make_pass_pre_slp_scalar_cleanup (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_slp_vectorize_late (gcc::context *ctxt); extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt); extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt); extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt); diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index aae7f62f3..ee58c8f6c 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -3234,6 +3234,21 @@ vect_analyze_data_ref_accesses (vec_info *vinfo, != type_size_a)) break; + if (param_vect_register_size_check) + { + tree scalar_type = TREE_TYPE (DR_REF (dra)); + tree vec_type = get_related_vectype_for_scalar_type ( + vinfo->vector_mode, scalar_type); + + /* Split a large interleaving group (especially a group of loads + with gaps) that does not fit in a vector register. When no + vector type exists for the scalar type, skip this check. */ + if (vec_type + && maybe_ge (((unsigned HOST_WIDE_INT)init_b - init_prev) + / type_size_a, TYPE_VECTOR_SUBPARTS (vec_type))) + break; + } + /* If the step (if not zero or non-constant) is smaller than the difference between data-refs' inits this splits groups into suitable sizes. 
*/ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index fbd638333..79026fb5b 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -687,6 +687,34 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap, if (first) return 0; + /* If different statements in the group of commutative operations + have the same arguments but in different places, swap them to + group the same operands in one vector. + + Check if swapping is enabled, operation is commutative and has + two operands of the same type. + If one of the operands in current statement matches the operand + on another place of the first statement in the group we + swap operands in current statement. */ + if (param_vect_swap_operands && commutative_op == 0 && !first + && is_a <bb_vec_info> (vinfo) && number_of_oprnds == 2 + && vect_def_types_match (dts[0], dts[1])) + { + slp_oprnd_info oprnd_info0 = (*oprnds_info)[0]; + slp_oprnd_info oprnd_info1 = (*oprnds_info)[1]; + if (oprnd_info1->ops[stmt_num] == oprnd_info0->ops[0] + || oprnd_info0->ops[stmt_num] == oprnd_info1->ops[0]) + { + std::swap (oprnd_info0->def_stmts[stmt_num], + oprnd_info1->def_stmts[stmt_num]); + std::swap (oprnd_info0->ops[stmt_num], + oprnd_info1->ops[stmt_num]); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Swapped operands for %G", stmt_info->stmt); + } + } + /* Now match the operand definition types to that of the first stmt. */ for (i = 0; i < number_of_oprnds;) { diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc index a63fa3912..c363ce490 100644 --- a/gcc/tree-vectorizer.cc +++ b/gcc/tree-vectorizer.cc @@ -1524,6 +1524,45 @@ make_pass_slp_vectorize (gcc::context *ctxt) return new pass_slp_vectorize (ctxt); } +/* The late SLP vectorization pass. */ + +namespace { + +const pass_data pass_data_slp_vectorize_late = +{ + GIMPLE_PASS, /* type. */ + "slp_late", /* name. */ + OPTGROUP_NONE, /* optinfo_flags. */ + TV_TREE_LATE_SLP, /* tv_id. */ + PROP_cfg, /* properties_required. 
*/ + 0, /* properties_provided. */ + 0, /* properties_destroyed. */ + 0, /* todo_flags_start. */ + 0, /* todo_flags_finish. */ +}; + +class pass_slp_vectorize_late : public gimple_opt_pass +{ +public: + pass_slp_vectorize_late (gcc::context *ctxt) + : gimple_opt_pass (pass_data_slp_vectorize_late, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return flag_slp_late != 0; + } + +}; // class pass_slp_vectorize_late + +} // anon namespace + +gimple_opt_pass * +make_pass_slp_vectorize_late (gcc::context *ctxt) +{ + return new pass_slp_vectorize_late (ctxt); +} /* Increase alignment of global arrays to improve vectorization potential. TODO: -- 2.33.0