path: root/0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch
Diffstat (limited to '0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch')
-rw-r--r--  0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch | 239
1 file changed, 239 insertions, 0 deletions
diff --git a/0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch b/0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch
new file mode 100644
index 0000000..df88789
--- /dev/null
+++ b/0049-Add-more-flexible-check-for-pointer-aliasing-during-.patch
@@ -0,0 +1,239 @@
+From b5865aef36ebaac87ae30d51f08bfe081795ed67 Mon Sep 17 00:00:00 2001
+From: Chernonog Viacheslav <chernonog.vyacheslav@huawei.com>
+Date: Tue, 12 Mar 2024 23:30:56 +0800
+Subject: [PATCH 17/18] Add more flexible check for pointer aliasing during
+ vectorization
+
+Take the minimum of the iteration count and the segment length.  This
+helps to speed up loops with a small number of iterations, where only
+the tail can be vectorized.
+
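A minimal sketch of the kind of loop this targets, with illustrative
names (not taken from the patch): a runtime-stride kernel whose trip
count may be small, so only a vectorized tail would ever run.

    /* With a runtime stride, the alias segment for the whole loop
       (niters * step) can be large even when only a short vectorized
       tail executes; checking against the smaller of the two segment
       lengths keeps the runtime alias test easy to pass.  */
    void
    add_strided (int *x, int *y, int n, int stride)
    {
      for (int i = 0; i < n; ++i)   /* n may be small at run time.  */
        x[i * stride] += y[i * stride];
    }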
+---
+ gcc/params.opt | 5 ++
+ .../sve/var_stride_flexible_segment_len_1.c | 23 +++++++
+ gcc/tree-data-ref.cc | 67 +++++++++++++------
+ gcc/tree-data-ref.h | 11 ++-
+ gcc/tree-vect-data-refs.cc | 14 +++-
+ 5 files changed, 95 insertions(+), 25 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
+
+diff --git a/gcc/params.opt b/gcc/params.opt
+index 6176d4790..7e5c119cf 100644
+--- a/gcc/params.opt
++++ b/gcc/params.opt
+@@ -1180,6 +1180,11 @@ Maximum number of loop peels to enhance alignment of data references in a loop.
+ Common Joined UInteger Var(param_vect_max_version_for_alias_checks) Init(10) Param Optimization
+ Bound on number of runtime checks inserted by the vectorizer's loop versioning for alias check.
+
++-param=vect-alias-flexible-segment-len=
++Common Joined UInteger Var(param_flexible_seg_len) Init(0) IntegerRange(0, 1) Param Optimization
++Use the minimum length of the different segments.  Currently the minimum of
++the iteration count and the vectorization length is chosen by this param.
++
+ -param=vect-max-version-for-alignment-checks=
+ Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
+ Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
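Since Init(0) leaves the new check disabled by default, it must be
requested explicitly; the testcase below enables it the same way, e.g.
(the file name is a placeholder):

    gcc -O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1 test.c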
+diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
+new file mode 100644
+index 000000000..894f075f3
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_flexible_segment_len_1.c
+@@ -0,0 +1,23 @@
++/* { dg-do compile } */
++/* { dg-options "-O2 -ftree-vectorize --param=vect-alias-flexible-segment-len=1" } */
++
++#define TYPE int
++#define SIZE 257
++
++void __attribute__ ((weak))
++f (TYPE *x, TYPE *y, unsigned short n, long m __attribute__((unused)))
++{
++ for (int i = 0; i < SIZE; ++i)
++ x[i * n] += y[i * n];
++}
++
++/* { dg-final { scan-assembler {\tld1w\tz[0-9]+} } } */
++/* { dg-final { scan-assembler {\tst1w\tz[0-9]+} } } */
++/* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
++/* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
++/* Should use a WAR check that multiplies by (VF-2)*4 rather than
++ an overlap check that multiplies by (257-1)*4. */
++/* { dg-final { scan-assembler {\tcntb\t(x[0-9]+)\n.*\tsub\tx[0-9]+, \1, #8\n.*\tmul\tx[0-9]+,[^\n]*\1} } } */
++/* One range check and a check for n being zero. */
++/* { dg-final { scan-assembler-times {\t(?:cmp|tst)\t} 2 } } */
++/* { dg-final { scan-assembler-times {\tccmp\t} 1 } } */
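For reference on the expected assembly: with 4-byte ints, (VF-2)*4
bytes equals CNTB - 8, where CNTB is the SVE vector length in bytes and
VF = CNTB/4 lanes, which is why the scan above matches a cntb/sub
#8/mul sequence; the rejected whole-loop bound (257-1)*4 = 1024 would
instead appear as a compile-time constant multiplier.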
+diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
+index 397792c35..e6ae9e847 100644
+--- a/gcc/tree-data-ref.cc
++++ b/gcc/tree-data-ref.cc
+@@ -2329,31 +2329,15 @@ create_intersect_range_checks_index (class loop *loop, tree *cond_expr,
+ same arguments. Try to optimize cases in which the second access
+ is a write and in which some overlap is valid. */
+
+-static bool
+-create_waw_or_war_checks (tree *cond_expr,
++static void
++create_waw_or_war_checks2 (tree *cond_expr, tree seg_len_a,
+ const dr_with_seg_len_pair_t &alias_pair)
+ {
+ const dr_with_seg_len& dr_a = alias_pair.first;
+ const dr_with_seg_len& dr_b = alias_pair.second;
+
+- /* Check for cases in which:
+-
+- (a) DR_B is always a write;
+- (b) the accesses are well-ordered in both the original and new code
+- (see the comment above the DR_ALIAS_* flags for details); and
+- (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
+- if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
+- return false;
+-
+- /* Check for equal (but possibly variable) steps. */
+ tree step = DR_STEP (dr_a.dr);
+- if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
+- return false;
+-
+- /* Make sure that we can operate on sizetype without loss of precision. */
+ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
+- if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
+- return false;
+
+ /* All addresses involved are known to have a common alignment ALIGN.
+ We can therefore subtract ALIGN from an exclusive endpoint to get
+@@ -2370,9 +2354,6 @@ create_waw_or_war_checks (tree *cond_expr,
+ fold_convert (ssizetype, indicator),
+ ssize_int (0));
+
+- /* Get lengths in sizetype. */
+- tree seg_len_a
+- = fold_convert (sizetype, rewrite_to_non_trapping_overflow (dr_a.seg_len));
+ step = fold_convert (sizetype, rewrite_to_non_trapping_overflow (step));
+
+ /* Each access has the following pattern:
+@@ -2479,6 +2460,50 @@ create_waw_or_war_checks (tree *cond_expr,
+ *cond_expr = fold_build2 (GT_EXPR, boolean_type_node, subject, limit);
+ if (dump_enabled_p ())
+ dump_printf (MSG_NOTE, "using an address-based WAR/WAW test\n");
++}
++
++/* Wrapper around create_waw_or_war_checks2.  Performs the early
++   applicability checks and, when vect-alias-flexible-segment-len is
++   enabled, ORs in a second check based on SEG_LEN2.  */
++static bool
++create_waw_or_war_checks (tree *cond_expr,
++ const dr_with_seg_len_pair_t &alias_pair)
++{
++ const dr_with_seg_len& dr_a = alias_pair.first;
++ const dr_with_seg_len& dr_b = alias_pair.second;
++
++ /* Check for cases in which:
++
++ (a) DR_B is always a write;
++ (b) the accesses are well-ordered in both the original and new code
++ (see the comment above the DR_ALIAS_* flags for details); and
++ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
++ if (alias_pair.flags & ~(DR_ALIAS_WAR | DR_ALIAS_WAW))
++ return false;
++
++ /* Check for equal (but possibly variable) steps. */
++ tree step = DR_STEP (dr_a.dr);
++ if (!operand_equal_p (step, DR_STEP (dr_b.dr)))
++ return false;
++
++ /* Make sure that we can operate on sizetype without loss of precision. */
++ tree addr_type = TREE_TYPE (DR_BASE_ADDRESS (dr_a.dr));
++ if (TYPE_PRECISION (addr_type) != TYPE_PRECISION (sizetype))
++ return false;
++
++ /* Get lengths in sizetype. */
++ tree seg_len_a
++ = fold_convert (sizetype,
++ rewrite_to_non_trapping_overflow (dr_a.seg_len));
++ create_waw_or_war_checks2 (cond_expr, seg_len_a, alias_pair);
++ if (param_flexible_seg_len && dr_a.seg_len != dr_a.seg_len2)
++ {
++ tree seg_len2_a
++ = fold_convert (sizetype,
++ rewrite_to_non_trapping_overflow (dr_a.seg_len2));
++ tree cond_expr2;
++ create_waw_or_war_checks2 (&cond_expr2, seg_len2_a, alias_pair);
++ *cond_expr = fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
++ *cond_expr, cond_expr2);
++ }
+ return true;
+ }
+
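A self-contained model of the combined condition built above, under
assumed names (war_check_passes and combined_check_passes are
stand-ins for the GIMPLE the real code emits, not the GCC API):

    #include <cstdint>

    /* Stand-in for one address-based WAR/WAW test built by
       create_waw_or_war_checks2 (grossly simplified; the real code
       also folds in alignment adjustments).  */
    static bool
    war_check_passes (uintptr_t addr_a, uintptr_t addr_b,
                      uintptr_t seg_len)
    {
      /* "Safe" when B does not start inside A's segment.  */
      return addr_b >= addr_a || addr_a - addr_b >= seg_len;
    }

    /* Mirror of the wrapper's logic: OR the seg_len-based and the
       seg_len2-based conditions, since passing either one is enough
       to take the vectorized loop version.  */
    static bool
    combined_check_passes (uintptr_t a, uintptr_t b, uintptr_t seg_len,
                           uintptr_t seg_len2, bool flexible_seg_len)
    {
      bool ok = war_check_passes (a, b, seg_len);
      if (flexible_seg_len && seg_len != seg_len2)
        ok = ok || war_check_passes (a, b, seg_len2); /* TRUTH_OR_EXPR */
      return ok;
    }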
+diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
+index f643a95b2..9bc5f16ee 100644
+--- a/gcc/tree-data-ref.h
++++ b/gcc/tree-data-ref.h
+@@ -213,12 +213,19 @@ class dr_with_seg_len
+ public:
+ dr_with_seg_len (data_reference_p d, tree len, unsigned HOST_WIDE_INT size,
+ unsigned int a)
+- : dr (d), seg_len (len), access_size (size), align (a) {}
+-
++ : dr (d), seg_len (len), seg_len2 (len), access_size (size), align (a)
++ {}
++ dr_with_seg_len (data_reference_p d, tree len, tree len2,
++ unsigned HOST_WIDE_INT size, unsigned int a)
++ : dr (d), seg_len (len), seg_len2 (len2), access_size (size), align (a)
++ {}
+ data_reference_p dr;
+ /* The offset of the last access that needs to be checked minus
+ the offset of the first. */
+ tree seg_len;
++ /* The second version of the segment length.  Currently this is used
++ to relax the alias checks for loops with a small number of
++ iterations.  */
++ tree seg_len2;
+ /* A value that, when added to abs (SEG_LEN), gives the total number of
+ bytes in the segment. */
+ poly_uint64 access_size;
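A usage sketch of the two constructors (vf_len and iters_len are
placeholder trees; the types are GCC-internal, so this only shows the
call shapes):

    /* Existing callers are unchanged: seg_len2 defaults to seg_len.  */
    dr_with_seg_len dr_old (dr, vf_len, access_size, align);
    /* New callers carry both the VF-based and the niters-based
       segment lengths for the flexible alias check.  */
    dr_with_seg_len dr_new (dr, vf_len, iters_len, access_size, align);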
+diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
+index 4e615b80b..04e68f621 100644
+--- a/gcc/tree-vect-data-refs.cc
++++ b/gcc/tree-vect-data-refs.cc
+@@ -3646,6 +3646,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+ {
+ poly_uint64 lower_bound;
+ tree segment_length_a, segment_length_b;
++ tree segment_length2_a, segment_length2_b;
+ unsigned HOST_WIDE_INT access_size_a, access_size_b;
+ unsigned int align_a, align_b;
+
+@@ -3751,6 +3752,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+ {
+ segment_length_a = size_zero_node;
+ segment_length_b = size_zero_node;
++ segment_length2_a = size_zero_node;
++ segment_length2_b = size_zero_node;
+ }
+ else
+ {
+@@ -3759,8 +3762,15 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+ length_factor = scalar_loop_iters;
+ else
+ length_factor = size_int (vect_factor);
++ /* In any case, remember scalar_loop_iters; this helps to create
++ a flexible aliasing check for a small number of iterations. */
+ segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
+ segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
++ segment_length2_a
++ = vect_vfa_segment_size (dr_info_a, scalar_loop_iters);
++ segment_length2_b
++ = vect_vfa_segment_size (dr_info_b, scalar_loop_iters);
+ }
+ access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
+ access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
+@@ -3805,9 +3815,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
+ }
+
+ dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
+- access_size_a, align_a);
++ segment_length2_a, access_size_a, align_a);
+ dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
+- access_size_b, align_b);
++ segment_length2_b, access_size_b, align_b);
+ /* Canonicalize the order to be the one that's needed for accurate
+ RAW, WAR and WAW flags, in cases where the data references are
+ well-ordered. The order doesn't really matter otherwise,
+--
+2.33.0
+
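To make the segment-length choice concrete, a small standalone
computation under assumed numbers (4-byte elements, 16 vector lanes;
none of these constants come from the patch):

    #include <cstdio>

    int main ()
    {
      const long elt = 4, vf = 16;  /* assumed element size and lanes */
      const long trips[] = {3, 8, 1000};
      for (long niters : trips)
        {
          long seg_vf    = elt * vf;      /* VF-based segment length   */
          long seg_iters = elt * niters;  /* whole-loop segment length */
          /* Passing the alias check with either length suffices, so
             the effective requirement is the minimum of the two; for
             short loops that is the niters-based length.  */
          std::printf ("niters=%-5ld effective=%ld bytes (vf-based %ld)\n",
                       niters,
                       seg_vf < seg_iters ? seg_vf : seg_iters, seg_vf);
        }
      return 0;
    }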