Diffstat (limited to '0099-Enable-Transposed-SLP.patch')
-rw-r--r-- | 0099-Enable-Transposed-SLP.patch | 5624 |
1 files changed, 5624 insertions, 0 deletions
diff --git a/0099-Enable-Transposed-SLP.patch b/0099-Enable-Transposed-SLP.patch new file mode 100644 index 0000000..b4e8b24 --- /dev/null +++ b/0099-Enable-Transposed-SLP.patch @@ -0,0 +1,5624 @@ +From 0dd3b8532f35486bd5db2c71342c8dfed4c0893a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?= <zhengchenhui1@huawei.com> +Date: Thu, 25 Jul 2024 17:25:23 +0800 +Subject: [PATCH] Enable Transposed SLP. + +--- + gcc/common.opt | 4 + + gcc/testsuite/gcc.dg/vect/transpose-1.c | 53 + + gcc/testsuite/gcc.dg/vect/transpose-2.c | 50 + + gcc/testsuite/gcc.dg/vect/transpose-3.c | 54 + + gcc/testsuite/gcc.dg/vect/transpose-4.c | 53 + + gcc/testsuite/gcc.dg/vect/transpose-5.c | 74 ++ + gcc/testsuite/gcc.dg/vect/transpose-6.c | 67 + + gcc/testsuite/gcc.dg/vect/transpose-7.c | 53 + + gcc/testsuite/gcc.dg/vect/transpose-8.c | 53 + + gcc/testsuite/gcc.dg/vect/vect.exp | 7 + + gcc/tree-loop-distribution.cc | 1464 ++++++++++++++++++++- + gcc/tree-vect-data-refs.cc | 237 ++++ + gcc/tree-vect-loop.cc | 42 +- + gcc/tree-vect-patterns.cc | 4 +- + gcc/tree-vect-slp.cc | 1553 ++++++++++++++++++++--- + gcc/tree-vect-stmts.cc | 973 +++++++++++++- + gcc/tree-vectorizer.h | 96 +- + 17 files changed, 4648 insertions(+), 189 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-1.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-2.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-3.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-4.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-5.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-6.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-7.c + create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-8.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index b18f0b944..5958c4e0b 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -3221,6 +3221,10 @@ ftree-slp-vectorize + Common Var(flag_tree_slp_vectorize) Optimization EnabledBy(ftree-vectorize) + Enable basic block vectorization (SLP) on trees. + ++ftree-slp-transpose-vectorize ++Common Var(flag_tree_slp_transpose_vectorize) Optimization Init(0) ++Enable basic block vectorization (SLP) for transposed stores and loads on trees. ++ + fvect-cost-model= + Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization + -fvect-cost-model=[unlimited|dynamic|cheap|very-cheap] Specifies the cost model for vectorization. 
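A rough usage sketch, assuming a GCC built with this patch applied: the new flag is off by default (Init(0) above) and must be enabled explicitly, e.g. gcc -O3 -ftree-slp-transpose-vectorize -fdump-tree-slp-details transpose-1.c, after which the slp1 dump should contain "vectorized using transposed version" (these flags mirror what the vect.exp change below passes to the transpose-*.c tests).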
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-1.c b/gcc/testsuite/gcc.dg/vect/transpose-1.c +new file mode 100644 +index 000000000..8237a8b9e +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-1.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-require-effective-target vect_int } */ ++#include <stdio.h> ++#include <stdlib.h> ++#include "tree-vect.h" ++ ++#define N 4 ++#define M 256 ++ ++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) ++{ ++ int i = 0; ++ int sum = 0; ++ unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N]; ++ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ c2[i] = pix1[2] - pix2[2]; ++ c3[i] = pix1[3] - pix2[3]; ++ c4[i] = pix1[4] - pix2[4]; ++ c5[i] = pix1[5] - pix2[5]; ++ c6[i] = pix1[6] - pix2[6]; ++ c7[i] = pix1[7] - pix2[7]; ++ } ++ for (int i = 0; i < N; i++) ++ { ++ sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned char input1[M]; ++ unsigned char input2[M]; ++ int i1 = 16; ++ int i2 = 8; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 2; ++ input2[i] = i; ++ } ++ int sum = foo (input1, i1, input2, i2); ++ if (sum != 1264) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-2.c b/gcc/testsuite/gcc.dg/vect/transpose-2.c +new file mode 100644 +index 000000000..fdf4dbd96 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-2.c +@@ -0,0 +1,50 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-additional-options "-fno-tree-loop-vectorize -fno-tree-dse" } */ ++/* { dg-require-effective-target vect_int } */ ++#include <stdio.h> ++#include <stdlib.h> ++#include "tree-vect.h" ++ ++#define N 8 ++#define M 256 ++ ++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) ++{ ++ int i = 0; ++ int sum = 0; ++ unsigned short c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N]; ++ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ c2[i] = pix1[2] - pix2[2]; ++ c3[i] = pix1[3] - pix2[3]; ++ } ++ for (int i = 0; i < N; i++) ++ { ++ sum += c0[i] + c1[i] + c2[i] + c3[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned char input1[M]; ++ unsigned char input2[M]; ++ int i1 = 5; ++ int i2 = 4; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 4; ++ input2[i] = i * 2; ++ } ++ int sum = foo (input1, i1, input2, i2); ++ if (sum != 1440) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-3.c b/gcc/testsuite/gcc.dg/vect/transpose-3.c +new file mode 100644 +index 000000000..e492e3717 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-3.c +@@ -0,0 +1,54 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-additional-options "-fno-tree-loop-vectorize -fno-tree-dse -fno-tree-fre" } */ ++/* { dg-require-effective-target vect_int } */ ++#include <stdio.h> ++#include <stdlib.h> ++#include "tree-vect.h" ++ ++#define N 4 ++#define M 256 ++ ++int foo (unsigned short *pix1, int i_pix1, unsigned short *pix2, int i_pix2) ++{ ++ int i = 0; ++ int sum = 0; ++ unsigned 
c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N]; ++ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ c2[i] = pix1[2] - pix2[2]; ++ c3[i] = pix1[3] - pix2[3]; ++ c4[i] = pix1[4] - pix2[4]; ++ c5[i] = pix1[5] - pix2[5]; ++ c6[i] = pix1[6] - pix2[6]; ++ c7[i] = pix1[7] - pix2[7]; ++ } ++ for (int i = 0; i < N; i++) ++ { ++ sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned short input1[M]; ++ unsigned short input2[M]; ++ int i1 = 8; ++ int i2 = 4; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 4; ++ input2[i] = i; ++ } ++ int sum = foo (input1, i1, input2, i2); ++ if (sum != 1680) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-4.c b/gcc/testsuite/gcc.dg/vect/transpose-4.c +new file mode 100644 +index 000000000..0b4adea9b +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-4.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-require-effective-target vect_int } */ ++#include <stdio.h> ++#include <stdlib.h> ++#include "tree-vect.h" ++ ++#define N 4 ++#define M 256 ++ ++int foo (unsigned *pix1, int i_pix1, unsigned *pix2, int i_pix2) ++{ ++ int i = 0; ++ int sum = 0; ++ unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N]; ++ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ c2[i] = pix1[2] - pix2[2]; ++ c3[i] = pix1[3] - pix2[3]; ++ c4[i] = pix1[4] - pix2[4]; ++ c5[i] = pix1[5] - pix2[5]; ++ c6[i] = pix1[6] - pix2[6]; ++ c7[i] = pix1[7] - pix2[7]; ++ } ++ for (int i = 0; i < N; i++) ++ { ++ sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned input1[M]; ++ unsigned input2[M]; ++ int i1 = 12; ++ int i2 = 6; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 7; ++ input2[i] = i * 3; ++ } ++ int sum = foo (input1, i1, input2, i2); ++ if (sum != 3616) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-5.c b/gcc/testsuite/gcc.dg/vect/transpose-5.c +new file mode 100644 +index 000000000..040dedf1b +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-5.c +@@ -0,0 +1,74 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-additional-options "-fno-tree-dse -fno-tree-fre" } */ ++/* { dg-require-effective-target vect_int } */ ++#include <stdio.h> ++#include <stdlib.h> ++#include <math.h> ++#include "tree-vect.h" ++ ++#define N 4 ++#define M 256 ++#define eps 1e-8 ++ ++double foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) ++{ ++ unsigned a0[N]; ++ unsigned a1[N]; ++ unsigned a2[N]; ++ unsigned a3[N]; ++ ++ int b0[N]; ++ int b1[N]; ++ int b2[N]; ++ int b3[N]; ++ ++ for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] + pix2[4]) << 16); ++ a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] + pix2[5]) << 16); ++ a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] + pix2[6]) << 16); ++ a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] + pix2[7]) << 16); ++ } ++ ++ for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ 
b0[i] = (pix1[0] - pix2[0]) + (pix1[4] + pix2[4]); ++ b1[i] = (pix1[1] - pix2[1]) + (pix1[5] + pix2[5]); ++ b2[i] = (pix1[2] - pix2[2]) + (pix1[6] + pix2[6]); ++ b3[i] = (pix1[3] - pix2[3]) + (pix1[7] + pix2[7]); ++ } ++ ++ double sum = 0; ++ for (int i = 0; i < N; i++) ++ { ++ sum += a0[i] + a1[i] + a2[i] + a3[i] + b0[i] + b1[i] + b2[i] + b3[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned char input1[M]; ++ unsigned char input2[M]; ++ int i1 = 8; ++ int i2 = 3; ++ unsigned char m = 2; ++ unsigned short n = 12; ++ float t = 3.0; ++ double k = 4.2; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 6; ++ input2[i] = i * 3; ++ } ++ double sum = foo (input1, i1, input2, i2); ++ if (fabs (sum - 78648144) > eps) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ ++/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-6.c b/gcc/testsuite/gcc.dg/vect/transpose-6.c +new file mode 100644 +index 000000000..3e134ac02 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-6.c +@@ -0,0 +1,67 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-require-effective-target vect_int } */ ++/* { dg-require-effective-target vect_float } */ ++#include <stdio.h> ++#include <stdlib.h> ++#include <math.h> ++#include "tree-vect.h" ++ ++#define N 4 ++#define M 256 ++#define eps 1e-8 ++ ++float foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) ++{ ++ unsigned a0[N]; ++ unsigned a1[N]; ++ unsigned a2[N]; ++ unsigned a3[N]; ++ ++ float c0[N]; ++ float c1[N]; ++ float c2[N]; ++ float c3[N]; ++ ++ for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16); ++ a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16); ++ a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16); ++ a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16); ++ ++ c0[i] = (pix1[0] * pix2[0]) + (pix1[4] * pix2[4]); ++ c1[i] = (pix1[1] * pix2[1]) + (pix1[5] * pix2[5]); ++ c2[i] = (pix1[2] * pix2[2]) + (pix1[6] * pix2[6]); ++ c3[i] = (pix1[3] * pix2[3]) + (pix1[7] * pix2[7]); ++ } ++ ++ float sum = 0; ++ for (int i = 0; i < N; i++) ++ { ++ sum += a0[i] + a1[i] + a2[i] + a3[i] + c0[i] + c1[i] + c2[i] + c3[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned char input1[M]; ++ unsigned char input2[M]; ++ int i1 = 18; ++ int i2 = 6; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 4; ++ input2[i] = i * 2; ++ } ++ float sum = foo (input1, i1, input2, i2); ++ if (fabs (sum - 106041168) > eps) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ ++/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-7.c b/gcc/testsuite/gcc.dg/vect/transpose-7.c +new file mode 100644 +index 000000000..8ba1b1b6d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-7.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-additional-options "-fno-tree-loop-vectorize -fno-tree-dse" } */ ++/* { dg-require-effective-target vect_int } */ ++#include <stdio.h> ++#include <stdlib.h> ++#include "tree-vect.h" ++ ++#define N 16 ++#define M 256 ++ ++int foo (unsigned char *pix1, int i_pix1, 
unsigned char *pix2, int i_pix2) ++{ ++ int i = 0; ++ int sum = 0; ++ unsigned char c0[N], c1[N]; ++ for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ } ++ for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ } ++ for (int i = 0; i < N; i++) ++ { ++ sum += c0[i] + c1[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned char input1[M]; ++ unsigned char input2[M]; ++ int i1 = 6; ++ int i2 = 4; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 5; ++ input2[i] = i * 2; ++ } ++ int sum = foo (input1, i1, input2, i2); ++ if (sum != 3280) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/transpose-8.c b/gcc/testsuite/gcc.dg/vect/transpose-8.c +new file mode 100644 +index 000000000..a154f012a +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/vect/transpose-8.c +@@ -0,0 +1,53 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-additional-options "-fno-tree-loop-vectorize" } */ ++/* { dg-require-effective-target vect_int } */ ++#include <stdio.h> ++#include <stdlib.h> ++#include "tree-vect.h" ++ ++#define N 32 ++#define M 256 ++ ++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) ++{ ++ int i = 0; ++ int sum = 0; ++ unsigned char c0[N], c1[N]; ++ for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ } ++ for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2) ++ { ++ c0[i] = pix1[0] - pix2[0]; ++ c1[i] = pix1[1] - pix2[1]; ++ } ++ for (int i = 0; i < N; i++) ++ { ++ sum += c0[i] + c1[i]; ++ } ++ return sum; ++} ++ ++int main (int argc, const char* argv[]) ++{ ++ unsigned char input1[M]; ++ unsigned char input2[M]; ++ int i1 = 6; ++ int i2 = 4; ++ check_vect (); ++ for (int i = 0; i < M; i++) ++ { ++ input1[i] = i * 5; ++ input2[i] = i * 2; ++ } ++ int sum = foo (input1, i1, input2, i2); ++ if (sum != 7584) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */ +diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp +index dcaef1e0a..ae5212411 100644 +--- a/gcc/testsuite/gcc.dg/vect/vect.exp ++++ b/gcc/testsuite/gcc.dg/vect/vect.exp +@@ -117,6 +117,13 @@ et-dg-runtest dg-runtest [lsort \ + [glob -nocomplain $srcdir/$subdir/no-vfa-*.\[cS\]]] \ + "" $DEFAULT_VECTCFLAGS + ++# -ftree-slp-transpose-vectorize SLP tests ++set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS ++lappend VECT_SLP_CFLAGS "-ftree-slp-transpose-vectorize" ++et-dg-runtest dg-runtest [lsort \ ++ [glob -nocomplain $srcdir/$subdir/transpose-*.\[cS\]]] \ ++ "" "-ftree-slp-transpose-vectorize -fdump-tree-slp-details -O3" ++ + # -ffast-math tests + set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS + lappend DEFAULT_VECTCFLAGS "-ffast-math" +diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc +index 606eb05e6..8d118e987 100644 +--- a/gcc/tree-loop-distribution.cc ++++ b/gcc/tree-loop-distribution.cc +@@ -36,6 +36,47 @@ along with GCC; see the file COPYING3. If not see + | D(I) = A(I-1)*E + |ENDDO + ++ If an unvectorizable loop has grouped loads, and calculations from grouped ++ loads are isomorphic, build temp arrays using stmts where isomorphic ++ calculations end. 
After distribution, the partition built from temp ++ arrays can be vectorized in pass SLP after loop unrolling. For example, ++ ++ |DO I = 1, N ++ | A = FOO (ARG_1); ++ | B = FOO (ARG_2); ++ | C = BAR_0 (A); ++ | D = BAR_1 (B); ++ |ENDDO ++ ++ is transformed to ++ ++ |DO I = 1, N ++ | J = FOO (ARG_1); ++ | K = FOO (ARG_2); ++ | X[I] = J; ++ | Y[I] = K; ++ | A = X[I]; ++ | B = Y[I]; ++ | C = BAR_0 (A); ++ | D = BAR_1 (B); ++ |ENDDO ++ ++ and is then distributed to ++ ++ |DO I = 1, N ++ | J = FOO (ARG_1); ++ | K = FOO (ARG_2); ++ | X[I] = J; ++ | Y[I] = K; ++ |ENDDO ++ ++ |DO I = 1, N ++ | A = X[I]; ++ | B = Y[I]; ++ | C = BAR_0 (A); ++ | D = BAR_1 (B); ++ |ENDDO ++ + Loop distribution is the dual of loop fusion. It separates statements + of a loop (or loop nest) into multiple loops (or loop nests) with the + same loop header. The major goal is to separate statements which may +@@ -44,7 +85,9 @@ along with GCC; see the file COPYING3. If not see + + 1) Seed partitions with specific type statements. For now we support + two types seed statements: statement defining variable used outside +- of loop; statement storing to memory. ++ of loop; statement storing to memory. Moreover, for unvectorizable ++ loops, we try to find isomorphic stmts from grouped loads and build ++ temp arrays as new seed statements. + 2) Build reduced dependence graph (RDG) for loop to be distributed. + The vertices (RDG:V) model all statements in the loop and the edges + (RDG:E) model flow and control dependencies between statements. +@@ -90,6 +133,8 @@ along with GCC; see the file COPYING3. If not see + data reuse. */ + + #include "config.h" ++#define INCLUDE_MAP ++#define INCLUDE_ALGORITHM + #include "system.h" + #include "coretypes.h" + #include "backend.h" +@@ -115,6 +160,7 @@ along with GCC; see the file COPYING3. If not see + #include "tree-vectorizer.h" + #include "tree-eh.h" + #include "gimple-fold.h" ++#include "optabs-tree.h" + #include "tree-affine.h" + #include "intl.h" + #include "rtl.h" +@@ -188,6 +234,52 @@ struct rdg_vertex + #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I])) + #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I])) + ++/* Results of isomorphic group analysis. */ ++#define UNINITIALIZED (0) ++#define ISOMORPHIC (1) ++#define HETEROGENEOUS (1 << 1) ++#define UNCERTAIN (1 << 2) ++ ++/* Information of a stmt while analyzing isomorphic use in group. */ ++ ++typedef struct _group_info ++{ ++ gimple *stmt; ++ ++ /* True if stmt can be a cut point. */ ++ bool cut_point; ++ ++ /* For use_stmt with two rhses, one of which is the lhs of stmt. ++ If the other is unknown to be isomorphic, mark it uncertain. */ ++ bool uncertain; ++ ++ /* Searching of isomorphic stmt reaches heterogeneous groups or reaches ++ MEM stmts. */ ++ bool done; ++ ++ _group_info () ++ { ++ stmt = NULL; ++ cut_point = false; ++ uncertain = false; ++ done = false; ++ } ++} *group_info; ++ ++/* PAIR of cut points and corresponding profit. */ ++typedef std::pair<vec<gimple *> *, int> stmts_profit; ++ ++/* MAP of vector factor VF and corresponding stmts_profit PAIR. */ ++typedef std::map<unsigned, stmts_profit> vf_stmts_profit_map; ++ ++/* PAIR of group_num and iteration_num. We consider rhses from the same ++ group and iteration are isomorphic. */ ++typedef std::pair<unsigned, unsigned> group_iteration; ++ ++/* An isomorphic stmt is determined by lhs of use_stmt, group_num and ++ the iteration_num when we insert this stmt to this map. 
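++ As an illustrative sketch borrowed from the example in ++ FIND_ISOMORPHIC_STMTS below: after the first BFS iteration, the ++ use_stmt lhs _111 found for group 0 would map to (0, 1).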
*/ ++typedef std::map<tree, group_iteration> isomer_stmt_lhs; ++ + /* Data dependence type. */ + + enum rdg_dep_type +@@ -600,13 +692,14 @@ class loop_distribution + /* Returns true when PARTITION1 and PARTITION2 access the same memory + object in RDG. */ + bool share_memory_accesses (struct graph *rdg, +- partition *partition1, partition *partition2); ++ partition *partition1, partition *partition2, ++ hash_set<tree> *excluded_arrays); + + /* For each seed statement in STARTING_STMTS, this function builds + partition for it by adding depended statements according to RDG. + All partitions are recorded in PARTITIONS. */ + void rdg_build_partitions (struct graph *rdg, +- vec<gimple *> starting_stmts, ++ vec<gimple *> *starting_stmts, + vec<partition *> *partitions); + + /* Compute partition dependence created by the data references in DRS1 +@@ -643,15 +736,50 @@ class loop_distribution + + /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution. + ALIAS_DDRS contains ddrs which need runtime alias check. */ +- void finalize_partitions (class loop *loop, vec<struct partition *> +- *partitions, vec<ddr_p> *alias_ddrs); ++ void finalize_partitions (class loop *loop, ++ vec<struct partition *> *partitions, ++ vec<ddr_p> *alias_ddrs, bitmap producers); ++ ++ /* Analyze loop form and if it's vectorizable to decide if we need to ++ insert temp arrays to distribute it. */ ++ bool may_insert_temp_arrays (loop_p loop, struct graph *&rdg, ++ control_dependences *cd); ++ ++ /* Reset gimple_uid of GIMPLE_DEBUG and GIMPLE_LABEL to -1. */ ++ void reset_gimple_uid (loop_p loop); ++ ++ bool check_loop_vectorizable (loop_p loop); ++ ++ inline void rebuild_rdg (loop_p loop, struct graph *&rdg, ++ control_dependences *cd); ++ ++ /* If loop is not distributed, remove inserted temp arrays. */ ++ void remove_insertion (loop_p loop, struct graph *flow_only_rdg, ++ bitmap producers, struct partition *partition); ++ ++ /* Insert temp arrays if isomorphic computation exists. Temp arrays will be ++ regarded as SEED_STMTS for building partitions in succeeding processes. */ ++ bool insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts, ++ hash_set<tree> *tmp_array_vars, bitmap producers); ++ ++ void build_producers (loop_p loop, bitmap producers, ++ vec<gimple *> &transformed); ++ ++ void do_insertion (loop_p loop, struct graph *flow_only_rdg, tree iv, ++ bitmap cut_points, hash_set <tree> *tmp_array_vars, ++ bitmap producers); ++ ++ /* Fuse PARTITIONS built from inserted temp arrays into one partition, ++ fuse the rest into another. */ ++ void merge_remaining_partitions (vec<struct partition *> *partitions, ++ bitmap producers); + + /* Distributes the code from LOOP in such a way that producer statements + are placed before consumer statements. Tries to separate only the + statements from STMTS into separate loops. Returns the number of + distributed loops. Set NB_CALLS to number of generated builtin calls. + Set *DESTROY_P to whether LOOP needs to be destroyed. 
*/ +- int distribute_loop (class loop *loop, const vec<gimple *> &stmts, ++ int distribute_loop (class loop *loop, vec<gimple *> &stmts, + control_dependences *cd, int *nb_calls, bool *destroy_p, + bool only_patterns_p); + +@@ -1893,7 +2021,8 @@ loop_distribution::classify_partition (loop_p loop, + + bool + loop_distribution::share_memory_accesses (struct graph *rdg, +- partition *partition1, partition *partition2) ++ partition *partition1, partition *partition2, ++ hash_set <tree> *excluded_arrays) + { + unsigned i, j; + bitmap_iterator bi, bj; +@@ -1927,7 +2056,10 @@ loop_distribution::share_memory_accesses (struct graph *rdg, + if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0) + && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0) + && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0) +- && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0)) ++ && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0) ++ /* An exception, if PARTITION1 and PARTITION2 contain the ++ temp array we inserted, do not merge them. */ ++ && !excluded_arrays->contains (DR_REF (dr1))) + return true; + } + } +@@ -1941,14 +2073,14 @@ loop_distribution::share_memory_accesses (struct graph *rdg, + + void + loop_distribution::rdg_build_partitions (struct graph *rdg, +- vec<gimple *> starting_stmts, ++ vec<gimple *> *starting_stmts, + vec<partition *> *partitions) + { + auto_bitmap processed; + int i; + gimple *stmt; + +- FOR_EACH_VEC_ELT (starting_stmts, i, stmt) ++ FOR_EACH_VEC_ELT (*starting_stmts, i, stmt) + { + int v = rdg_vertex_for_stmt (rdg, stmt); + +@@ -2912,13 +3044,47 @@ fuse_memset_builtins (vec<struct partition *> *partitions) + } + } + ++void ++loop_distribution::merge_remaining_partitions ++ (vec<struct partition *> *partitions, ++ bitmap producers) ++{ ++ struct partition *partition = NULL; ++ struct partition *p1 = NULL, *p2 = NULL; ++ for (unsigned i = 0; partitions->iterate (i, &partition); i++) ++ { ++ if (bitmap_intersect_p (producers, partition->stmts)) ++ { ++ if (p1 == NULL) ++ { ++ p1 = partition; ++ continue; ++ } ++ partition_merge_into (NULL, p1, partition, FUSE_FINALIZE); ++ } ++ else ++ { ++ if (p2 == NULL) ++ { ++ p2 = partition; ++ continue; ++ } ++ partition_merge_into (NULL, p2, partition, FUSE_FINALIZE); ++ } ++ partitions->unordered_remove (i); ++ partition_free (partition); ++ i--; ++ } ++} ++ + void + loop_distribution::finalize_partitions (class loop *loop, + vec<struct partition *> *partitions, +- vec<ddr_p> *alias_ddrs) ++ vec<ddr_p> *alias_ddrs, ++ bitmap producers) + { + unsigned i; +- struct partition *partition, *a; ++ struct partition *partition; + + if (partitions->length () == 1 + || alias_ddrs->length () > 0) +@@ -2950,13 +3116,7 @@ loop_distribution::finalize_partitions (class loop *loop, + || (loop->inner == NULL + && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin)) + { +- a = (*partitions)[0]; +- for (i = 1; partitions->iterate (i, &partition); ++i) +- { +- partition_merge_into (NULL, a, partition, FUSE_FINALIZE); +- partition_free (partition); +- } +- partitions->truncate (1); ++ merge_remaining_partitions (partitions, producers); + } + + /* Fuse memset builtins if possible. */ +@@ -2964,6 +3124,1216 @@ loop_distribution::finalize_partitions (class loop *loop, + fuse_memset_builtins (partitions); + } + ++/* Gimple uids of GIMPLE_DEBUG and GIMPLE_LABEL were changed during function ++ vect_analyze_loop, reset them to -1. 
*/ ++ ++void ++loop_distribution::reset_gimple_uid (loop_p loop) ++{ ++ basic_block *bbs = get_loop_body_in_custom_order (loop, this, ++ bb_top_order_cmp_r); ++ for (int i = 0; i < int (loop->num_nodes); i++) ++ { ++ basic_block bb = bbs[i]; ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ gimple *stmt = gsi_stmt (gsi); ++ if (is_gimple_debug (stmt) || gimple_code (stmt) == GIMPLE_LABEL) ++ gimple_set_uid (stmt, -1); ++ } ++ } ++ free (bbs); ++} ++ ++bool ++loop_distribution::check_loop_vectorizable (loop_p loop) ++{ ++ vec_info_shared shared; ++ vect_analyze_loop (loop, &shared, true); ++ loop_vec_info vinfo = loop_vec_info_for_loop (loop); ++ reset_gimple_uid (loop); ++ if (vinfo == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, ++ "Loop %d no temp array insertion: bad data access pattern," ++ " unable to generate loop_vinfo.\n", loop->num); ++ return false; ++ } ++ if (vinfo->vectorizable) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Loop %d no temp array insertion: original loop" ++ " can be vectorized without distribution.\n", ++ loop->num); ++ delete vinfo; ++ loop->aux = NULL; ++ return false; ++ } ++ if (vinfo->grouped_loads.length () == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Loop %d no temp array insertion: original loop" ++ " has no grouped loads.\n" , loop->num); ++ delete vinfo; ++ loop->aux = NULL; ++ return false; ++ } ++ return true; ++} ++ ++inline void ++loop_distribution::rebuild_rdg (loop_p loop, struct graph *&rdg, ++ control_dependences *cd) ++{ ++ free_rdg (rdg); ++ rdg = build_rdg (loop, cd); ++ gcc_checking_assert (rdg != NULL); ++} ++ ++bool ++loop_distribution::may_insert_temp_arrays (loop_p loop, struct graph *&rdg, ++ control_dependences *cd) ++{ ++ if (!(flag_tree_slp_transpose_vectorize && flag_tree_loop_vectorize)) ++ return false; ++ ++ /* Only loops with two basic blocks HEADER and LATCH are supported. HEADER ++ is the main body of a LOOP and LATCH is the basic block that controls the ++ LOOP execution. Size of temp array is determined by loop execution time, ++ so it must be a const. */ ++ tree loop_extent = number_of_latch_executions (loop); ++ if (loop->inner != NULL || loop->num_nodes > 2 ++ || TREE_CODE (loop_extent) != INTEGER_CST) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Loop %d: no temp array insertion: bad loop" ++ " form.\n", loop->num); ++ return false; ++ } ++ ++ if (loop->dont_vectorize) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Loop %d: no temp array insertion: this loop" ++ " should never be vectorized.\n", ++ loop->num); ++ return false; ++ } ++ ++ /* Do not distribute a LOOP that is able to be vectorized without ++ distribution. */ ++ if (!check_loop_vectorizable (loop)) ++ { ++ rebuild_rdg (loop, rdg, cd); ++ return false; ++ } ++ ++ rebuild_rdg (loop, rdg, cd); ++ return true; ++} ++ ++/* Return max grouped loads' length if all groups' lengths satisfy len = 2 ^ n. ++ Otherwise, return 0. */ ++ ++static unsigned ++get_max_vf (loop_vec_info vinfo) ++{ ++ unsigned size = 0; ++ unsigned max = 0; ++ stmt_vec_info stmt_info; ++ unsigned i = 0; ++ FOR_EACH_VEC_ELT (vinfo->grouped_loads, i, stmt_info) ++ { ++ size = stmt_info->size; ++ if (!pow2p_hwi (size)) ++ return 0; ++ max = size > max ? size : max; ++ } ++ return max; ++} ++ ++/* Convert grouped_loads from linked list to vector with length vf. 
Init ++ group_info of each stmt in the same group and put them into a vector. And ++ these vectors constitute WORKLISTS. We will re-analyze a group if it is ++ uncertain, so we regard WORKLISTS as a circular queue. */ ++ ++static unsigned ++build_queue (loop_vec_info vinfo, unsigned vf, ++ vec<vec<group_info> *> &worklists) ++{ ++ stmt_vec_info stmt_info; ++ unsigned i = 0; ++ group_info ginfo = NULL; ++ vec<group_info> *worklist = NULL; ++ FOR_EACH_VEC_ELT (vinfo->grouped_loads, i, stmt_info) ++ { ++ unsigned group_size = stmt_info->size; ++ stmt_vec_info c_stmt_info = stmt_info; ++ bool succ = true; ++ while (group_size >= vf) ++ { ++ vec_alloc (worklist, vf); ++ for (unsigned j = 0; j < vf; ++j) ++ { ++ if (c_stmt_info == NULL) ++ { ++ succ = false; ++ break; ++ } ++ ginfo = new _group_info (); ++ ginfo->stmt = c_stmt_info->stmt; ++ worklist->safe_push (ginfo); ++ c_stmt_info = c_stmt_info->next_element; ++ } ++ if (!succ) ++ { ++ unsigned k = 0; ++ ginfo = NULL; ++ FOR_EACH_VEC_ELT (*worklist, k, ginfo) ++ delete ginfo; ++ vec_free (worklist); ++ break; ++ } ++ worklists.safe_push (worklist); ++ group_size -= vf; ++ } ++ } ++ return worklists.length (); ++} ++ ++static bool ++check_same_oprand_type (tree op1, tree op2) ++{ ++ tree type1 = TREE_TYPE (op1); ++ tree type2 = TREE_TYPE (op2); ++ if (TREE_CODE (type1) != INTEGER_TYPE && TREE_CODE (type1) != REAL_TYPE) ++ return false; ++ ++ return (TREE_CODE (type1) == TREE_CODE (type2) ++ && TYPE_UNSIGNED (type1) == TYPE_UNSIGNED (type2) ++ && TYPE_PRECISION (type1) == TYPE_PRECISION (type2)); ++} ++ ++static bool ++bit_field_p (gimple *stmt) ++{ ++ unsigned i = 0; ++ auto_vec<data_reference_p, 2> datarefs_vec; ++ data_reference_p dr; ++ if (!find_data_references_in_stmt (NULL, stmt, &datarefs_vec)) ++ return true; ++ ++ FOR_EACH_VEC_ELT (datarefs_vec, i, dr) ++ { ++ if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF ++ && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) ++ return true; ++ } ++ return false; ++} ++ ++static inline bool ++shift_operation (enum tree_code op) ++{ ++ return op == LSHIFT_EXPR || op == RSHIFT_EXPR || op == LROTATE_EXPR ++ || op == RROTATE_EXPR; ++} ++ ++/* Return relationship between USE_STMT and the first use_stmt of the group. ++ RHS1 is the lhs of stmt recorded in group_info. If the other rhs of use_stmt ++ is not a constant, return UNCERTAIN and re-check it later. */ ++ ++static unsigned ++check_isomorphic (gimple *use_stmt, gimple *first, ++ tree rhs1, vec<tree> &hetero_lhs) ++{ ++ /* Check same operation. */ ++ enum tree_code rhs_code_first = gimple_assign_rhs_code (first); ++ enum tree_code rhs_code_current = gimple_assign_rhs_code (use_stmt); ++ if (rhs_code_first != rhs_code_current) ++ return HETEROGENEOUS; ++ ++ /* For shift operations, operands should be equal. */ ++ if (shift_operation (rhs_code_current)) ++ { ++ tree shift_op_first = gimple_assign_rhs2 (first); ++ tree shift_op_current = gimple_assign_rhs2 (use_stmt); ++ if (!operand_equal_p (shift_op_first, shift_op_current, 0) ++ || !TREE_CONSTANT (shift_op_first)) ++ return HETEROGENEOUS; ++ ++ return ISOMORPHIC; ++ } ++ /* Type conversion expr or assignment. */ ++ if (gimple_num_ops (first) == 2) ++ return (rhs_code_first == NOP_EXPR || rhs_code_first == CONVERT_EXPR ++ || rhs_code_first == SSA_NAME) ? ISOMORPHIC : HETEROGENEOUS; ++ ++ /* We find USE_STMT from lhs of a stmt, denote it as rhs1 of USE_STMT and ++ the other one as rhs2. Check if define-stmt of current rhs2 is isomorphic ++ with define-stmt of rhs2 in the first USE_STMT in this group. 
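++ For instance (borrowing the example from FIND_ISOMORPHIC_STMTS below), ++ if FIRST is _111 = _1 + _11 and USE_STMT is _112 = _2 + _12 with RHS1 ++ being _2, then RHS2_FIRST is _11 and RHS2_CURR is _12.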
*/ ++ tree rhs2_first = gimple_assign_rhs1 (use_stmt) == rhs1 ++ ? gimple_assign_rhs2 (first) : gimple_assign_rhs1 (first); ++ tree rhs2_curr = gimple_assign_rhs1 (use_stmt) == rhs1 ++ ? gimple_assign_rhs2 (use_stmt) : gimple_assign_rhs1 (use_stmt); ++ ++ if (check_same_oprand_type (rhs2_first, rhs2_curr)) ++ { ++ if (TREE_CONSTANT (rhs2_curr)) ++ return ISOMORPHIC; ++ else if (hetero_lhs.contains (rhs2_curr)) ++ return HETEROGENEOUS; ++ ++ /* Provisionally set the stmt as uncertain and analyze the whole group ++ in function CHECK_UNCERTAIN later if all use_stmts are uncertain. */ ++ return UNCERTAIN; ++ } ++ return HETEROGENEOUS; ++} ++ ++static bool ++unsupported_operations (gimple *stmt) ++{ ++ enum tree_code code = gimple_assign_rhs_code (stmt); ++ return code == COND_EXPR; ++} ++ ++/* Check if the single use_stmt of STMT is isomorphic with the first one's ++ use_stmt in current group. */ ++ ++static unsigned ++check_use_stmt (group_info elmt, gimple *&first, ++ vec<gimple *> &tmp_stmts, vec<tree> &hetero_lhs) ++{ ++ if (gimple_code (elmt->stmt) != GIMPLE_ASSIGN) ++ return HETEROGENEOUS; ++ use_operand_p dummy; ++ tree lhs = gimple_assign_lhs (elmt->stmt); ++ gimple *use_stmt = NULL; ++ single_imm_use (lhs, &dummy, &use_stmt); ++ /* STMTs with three rhs are not supported, e.g., GIMPLE_COND. */ ++ if (use_stmt == NULL || gimple_code (use_stmt) != GIMPLE_ASSIGN ++ || unsupported_operations (use_stmt) || bit_field_p (use_stmt)) ++ return HETEROGENEOUS; ++ tmp_stmts.safe_push (use_stmt); ++ if (first == NULL) ++ { ++ first = use_stmt; ++ return UNINITIALIZED; ++ } ++ /* Check if current use_stmt and the first member's use_stmt in the group ++ are of the same type. */ ++ tree first_lhs = gimple_assign_lhs (first); ++ tree curr_lhs = gimple_assign_lhs (use_stmt); ++ if (!check_same_oprand_type (first_lhs, curr_lhs)) ++ return HETEROGENEOUS; ++ return check_isomorphic (use_stmt, first, lhs, hetero_lhs); ++} ++ ++/* Replace stmt field in group with stmts in TMP_STMTS, and insert their ++ lhs_info to ISOMER_LHS. */ ++ ++static void ++update_isomer_lhs (vec<group_info> *group, unsigned group_num, ++ unsigned iteration, isomer_stmt_lhs &isomer_lhs, ++ vec<gimple *> &tmp_stmts, int &profit, ++ vec<unsigned> &merged_groups) ++{ ++ group_info elmt = NULL; ++ /* Do not insert temp array if isomorphic stmts from grouped load have ++ only casting operations. Once isomorphic calculation has 3 operands, ++ such as plus operation, this group can be regarded as cut point. */ ++ bool operated = (gimple_num_ops (tmp_stmts[0]) == 3); ++ /* Do not insert temp arrays if search of isomorphic stmts reaches ++ MEM stmts. */ ++ bool has_vdef = gimple_vdef (tmp_stmts[0]) != NULL; ++ bool merge = false; ++ for (unsigned i = 0; i < group->length (); i++) ++ { ++ elmt = (*group)[i]; ++ elmt->stmt = has_vdef ? NULL : tmp_stmts[i]; ++ elmt->cut_point = has_vdef ? 
false : (elmt->cut_point || operated); ++ elmt->uncertain = false; ++ elmt->done = has_vdef; ++ tree lhs = gimple_assign_lhs (tmp_stmts[i]); ++ if (isomer_lhs.find (lhs) != isomer_lhs.end ()) ++ { ++ merge = true; ++ continue; ++ } ++ isomer_lhs[lhs] = std::make_pair (group_num, iteration); ++ } ++ if (merge) ++ { ++ merged_groups.safe_push (group_num); ++ profit = 0; ++ return; ++ } ++ enum vect_cost_for_stmt kind = scalar_stmt; ++ int scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0); ++ profit = (tmp_stmts.length () - 1) * scalar_cost; ++} ++ ++/* Try to find rhs2 in ISOMER_LHS, if all rhs2 were found and their group_num ++ and iteration are the same, GROUP is isomorphic. */ ++ ++static unsigned ++check_isomorphic_rhs (vec<group_info> *group, vec<gimple *> &tmp_stmts, ++ isomer_stmt_lhs &isomer_lhs) ++{ ++ group_info elmt = NULL; ++ gimple *stmt = NULL; ++ unsigned j = 0; ++ unsigned group_num = -1u; ++ unsigned iteration = -1u; ++ tree rhs1 = NULL; ++ tree rhs2 = NULL; ++ unsigned status = UNINITIALIZED; ++ FOR_EACH_VEC_ELT (*group, j, elmt) ++ { ++ rhs1 = gimple_assign_lhs (elmt->stmt); ++ stmt = tmp_stmts[j]; ++ rhs2 = (rhs1 == gimple_assign_rhs1 (stmt)) ++ ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt); ++ isomer_stmt_lhs::iterator iter = isomer_lhs.find (rhs2); ++ if (iter != isomer_lhs.end ()) ++ { ++ if (group_num == -1u) ++ { ++ group_num = iter->second.first; ++ iteration = iter->second.second; ++ status |= ISOMORPHIC; ++ continue; ++ } ++ if (iter->second.first == group_num ++ && iter->second.second == iteration) ++ { ++ status |= ISOMORPHIC; ++ continue; ++ } ++ return HETEROGENEOUS; ++ } ++ else ++ status |= UNCERTAIN; ++ } ++ return status; ++} ++ ++/* Update group_info for uncertain groups. */ ++ ++static void ++update_uncertain_stmts (vec<group_info> *group, unsigned group_num, ++ unsigned iteration, vec<gimple *> &tmp_stmts) ++{ ++ unsigned j = 0; ++ group_info elmt = NULL; ++ FOR_EACH_VEC_ELT (*group, j, elmt) ++ { ++ elmt->uncertain = true; ++ elmt->done = false; ++ } ++} ++ ++/* Push stmts in TMP_STMTS into HETERO_LHS. */ ++ ++static void ++set_hetero (vec<group_info> *group, vec<tree> &hetero_lhs, ++ vec<gimple *> &tmp_stmts) ++{ ++ group_info elmt = NULL; ++ unsigned i = 0; ++ for (i = 0; i < group->length (); i++) ++ { ++ elmt = (*group)[i]; ++ elmt->uncertain = false; ++ elmt->done = true; ++ } ++ gimple *stmt = NULL; ++ FOR_EACH_VEC_ELT (tmp_stmts, i, stmt) ++ if (stmt != NULL) ++ hetero_lhs.safe_push (gimple_assign_lhs (stmt)); ++} ++ ++/* Given an uncertain group, TMP_STMTS are use_stmts of stmts in GROUP. ++ Rhs1 is the lhs of stmt in GROUP, rhs2 is the other rhs of USE_STMT. ++ ++ Try to find rhs2 in ISOMER_LHS, if all found rhs2 have the same group_num ++ and iteration, this uncertain group is isomorphic. ++ ++ If no rhs matched, this GROUP remains uncertain, so update group_info. ++ ++ Otherwise, this GROUP is heterogeneous and return true to end analysis ++ for this group. 
*/ ++ ++static bool ++check_uncertain (vec<group_info> *group, unsigned group_num, ++ unsigned iteration, int &profit, ++ vec<gimple *> &tmp_stmts, isomer_stmt_lhs &isomer_lhs, ++ vec<tree> &hetero_lhs, vec<unsigned> &merged_groups) ++{ ++ unsigned status = check_isomorphic_rhs (group, tmp_stmts, isomer_lhs); ++ bool done = false; ++ switch (status) ++ { ++ case UNCERTAIN: ++ update_uncertain_stmts (group, group_num, iteration, tmp_stmts); ++ break; ++ case ISOMORPHIC: ++ update_isomer_lhs (group, group_num, iteration, isomer_lhs, ++ tmp_stmts, profit, merged_groups); ++ break; ++ default: ++ set_hetero (group, hetero_lhs, tmp_stmts); ++ done = true; ++ } ++ return done; ++} ++ ++/* Return false if analysis of this group is not finished, e.g., isomorphic or ++ uncertain. Calculate the profit if vectorized. */ ++ ++static bool ++check_group (vec<group_info> *group, unsigned group_num, unsigned iteration, ++ int &profit, vec<unsigned> &merged_groups, ++ isomer_stmt_lhs &isomer_lhs, vec<tree> &hetero_lhs) ++{ ++ unsigned j = 0; ++ group_info elmt = NULL; ++ gimple *first = NULL; ++ unsigned res = 0; ++ /* Record single use stmts in TMP_STMTS and decide whether to replace stmts ++ in ginfo in succeeding processes. */ ++ auto_vec<gimple *, 12> tmp_stmts; ++ FOR_EACH_VEC_ELT (*group, j, elmt) ++ { ++ if (merged_groups.contains (group_num)) ++ return true; ++ res |= check_use_stmt (elmt, first, tmp_stmts, hetero_lhs); ++ } ++ ++ /* Update each group member according to RES. */ ++ switch (res) ++ { ++ case ISOMORPHIC: ++ update_isomer_lhs (group, group_num, iteration, isomer_lhs, ++ tmp_stmts, profit, merged_groups); ++ return false; ++ case UNCERTAIN: ++ return check_uncertain (group, group_num, iteration, profit, ++ tmp_stmts, isomer_lhs, hetero_lhs, ++ merged_groups); ++ default: ++ set_hetero (group, hetero_lhs, tmp_stmts); ++ return true; ++ } ++} ++ ++/* Return true if all analyses are done except uncertain groups. */ ++ ++static bool ++end_of_search (vec<vec<group_info> *> &circular_queue, ++ vec<unsigned> &merged_groups) ++{ ++ unsigned i = 0; ++ vec<group_info> *group = NULL; ++ group_info elmt = NULL; ++ FOR_EACH_VEC_ELT (circular_queue, i, group) ++ { ++ if (merged_groups.contains (i)) ++ continue; ++ elmt = (*group)[0]; ++ /* If there are any isomorphic use_stmts, continue analysis of isomorphic ++ use_stmts. */ ++ if (!elmt->done && !elmt->uncertain) ++ return false; ++ } ++ return true; ++} ++ ++/* Push valid stmts to STMTS as cutpoints. */ ++ ++static bool ++check_any_cutpoints (vec<vec<group_info> *> &circular_queue, ++ vec<gimple *> *&stmts, vec<unsigned> &merged_groups) ++{ ++ unsigned front = 0; ++ vec<group_info> *group = NULL; ++ group_info elmt = NULL; ++ unsigned max = circular_queue.length () * circular_queue[0]->length (); ++ vec_alloc (stmts, max); ++ while (front < circular_queue.length ()) ++ { ++ unsigned i = 0; ++ if (merged_groups.contains (front)) ++ { ++ front++; ++ continue; ++ } ++ group = circular_queue[front++]; ++ FOR_EACH_VEC_ELT (*group, i, elmt) ++ if (elmt->stmt != NULL && elmt->done && elmt->cut_point) ++ stmts->safe_push (elmt->stmt); ++ } ++ return stmts->length () != 0; ++} ++ ++/* Grouped loads are isomorphic. Make pair for group number and iteration, ++ map load stmt to this pair. We set iteration 0 here. 
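++ E.g., with the four groups from the example in FIND_ISOMORPHIC_STMTS ++ below, loads _1.._4 map to (0, 0), _5.._8 to (1, 0), and so on.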
*/ ++ ++static void ++init_isomer_lhs (vec<vec<group_info> *> &groups, isomer_stmt_lhs &isomer_lhs) ++{ ++ vec<group_info> *group = NULL; ++ group_info elmt = NULL; ++ unsigned i = 0; ++ FOR_EACH_VEC_ELT (groups, i, group) ++ { ++ unsigned j = 0; ++ FOR_EACH_VEC_ELT (*group, j, elmt) ++ isomer_lhs[gimple_assign_lhs (elmt->stmt)] = std::make_pair (i, 0); ++ } ++} ++ ++/* It's not a strict analysis of load/store profit. Assume scalar and vector ++ load/store are of the same cost. The result PROFIT equals profit from ++ vectorizing of scalar loads/stores minus cost of a vectorized load/store. */ ++ ++static int ++load_store_profit (unsigned scalar_mem_ops, unsigned vf, unsigned new_mem_ops) ++{ ++ int profit = 0; ++ enum vect_cost_for_stmt kind = scalar_load; ++ int scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0); ++ profit += (scalar_mem_ops - (scalar_mem_ops / vf)) * scalar_cost; ++ profit -= new_mem_ops / vf * scalar_cost; ++ kind = scalar_store; ++ scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0); ++ profit -= new_mem_ops / vf * scalar_cost; ++ return profit; ++} ++ ++/* Breadth first search the graph consisting of define-use chain starting from ++ the circular queue initialized by function BUILD_QUEUE. Find single use of ++ each stmt in group and check if they are isomorphic. Isomorphic is defined ++ as same rhs type, same operator, and isomorphic calculation of each rhs ++ starting from load. If another rhs is uncertain to be isomorphic, put it ++ at the end of circular queue and re-analyze it during the next iteration. ++ If a group shares the same use_stmt with another group, skip one of them in ++ succeeding processes as merged. Iterate the circular queue until all ++ remaining groups are heterogeneous or reach MEM stmts. If all other groups ++ have finished the analysis, and the remaining groups are uncertain, ++ return false to avoid an endless loop. */ ++ ++bool ++bfs_find_isomer_stmts (vec<vec<group_info> *> &circular_queue, ++ stmts_profit &profit_pair, unsigned vf, ++ bool &reach_vdef) ++{ ++ isomer_stmt_lhs isomer_lhs; ++ auto_vec<tree> hetero_lhs; ++ auto_vec<unsigned> merged_groups; ++ vec<group_info> *group = NULL; ++ /* True if analysis finishes. */ ++ bool done = false; ++ int profit_sum = 0; ++ vec<gimple *> *stmts = NULL; ++ init_isomer_lhs (circular_queue, isomer_lhs); ++ for (unsigned i = 1; !done; ++i) ++ { ++ unsigned front = 0; ++ /* Re-initialize DONE to TRUE when a new iteration begins. */ ++ done = true; ++ while (front < circular_queue.length ()) ++ { ++ int profit = 0; ++ group = circular_queue[front]; ++ done &= check_group (group, front, i, profit, merged_groups, ++ isomer_lhs, hetero_lhs); ++ profit_sum += profit; ++ if (profit != 0 && (*group)[0]->stmt == NULL) ++ { ++ reach_vdef = true; ++ return false; ++ } ++ ++front; ++ } ++ /* Uncertain result, return. */ ++ if (!done && end_of_search (circular_queue, merged_groups)) ++ return false; ++ } ++ if (check_any_cutpoints (circular_queue, stmts, merged_groups)) ++ { ++ profit_pair.first = stmts; ++ unsigned loads = circular_queue.length () * circular_queue[0]->length (); ++ profit_pair.second = profit_sum + load_store_profit (loads, vf, ++ stmts->length ()); ++ if (profit_pair.second > 0) ++ return true; ++ } ++ return false; ++} ++ ++/* Free memory allocated by ginfo. 
*/ ++ ++static void ++free_ginfos (vec<vec<group_info> *> &worklists) ++{ ++ vec<group_info> *worklist; ++ unsigned i = 0; ++ while (i < worklists.length ()) ++ { ++ worklist = worklists[i++]; ++ group_info ginfo; ++ unsigned j = 0; ++ FOR_EACH_VEC_ELT (*worklist, j, ginfo) ++ delete ginfo; ++ vec_free (worklist); ++ } ++} ++ ++static void ++release_tmp_stmts (vf_stmts_profit_map &candi_stmts) ++{ ++ vf_stmts_profit_map::iterator iter; ++ for (iter = candi_stmts.begin (); iter != candi_stmts.end (); ++iter) ++ iter->second.first->release (); ++} ++ ++/* Choose the group of stmts with maximum profit. */ ++ ++static bool ++decide_stmts_by_profit (vf_stmts_profit_map &candi_stmts, vec<gimple *> &stmts) ++{ ++ vf_stmts_profit_map::iterator iter; ++ int profit = 0; ++ int max = 0; ++ vec<gimple *> *tmp = NULL; ++ for (iter = candi_stmts.begin (); iter != candi_stmts.end (); ++iter) ++ { ++ profit = iter->second.second; ++ if (profit > max) ++ { ++ tmp = iter->second.first; ++ max = profit; ++ } ++ } ++ if (max == 0) ++ { ++ release_tmp_stmts (candi_stmts); ++ return false; ++ } ++ unsigned i = 0; ++ gimple *stmt = NULL; ++ FOR_EACH_VEC_ELT (*tmp, i, stmt) ++ stmts.safe_push (stmt); ++ release_tmp_stmts (candi_stmts); ++ return stmts.length () != 0; ++} ++ ++/* Find isomorphic stmts from grouped loads with vector factor VF. ++ ++ Given source code as follows, ignoring casting. ++ ++ a0 = (a[0] + b[0]) + ((a[4] - b[4]) << 16); ++ a1 = (a[1] + b[1]) + ((a[5] - b[5]) << 16); ++ a2 = (a[2] + b[2]) + ((a[6] - b[6]) << 16); ++ a3 = (a[3] + b[3]) + ((a[7] - b[7]) << 16); ++ ++ We get grouped loads in VINFO as ++ ++ GROUP_1 GROUP_2 ++ _1 = *a _11 = *b ++ _2 = *(a + 1) _12 = *(b + 1) ++ _3 = *(a + 2) _13 = *(b + 2) ++ _4 = *(a + 3) _14 = *(b + 3) ++ _5 = *(a + 4) _15 = *(b + 4) ++ _6 = *(a + 5) _16 = *(b + 5) ++ _7 = *(a + 6) _17 = *(b + 6) ++ _8 = *(a + 7) _18 = *(b + 7) ++ ++ First we try VF = 8, we get two worklists ++ ++ WORKLIST_1 WORKLIST_2 ++ _1 = *a _11 = *b ++ _2 = *(a + 1) _12 = *(b + 1) ++ _3 = *(a + 2) _13 = *(b + 2) ++ _4 = *(a + 3) _14 = *(b + 3) ++ _5 = *(a + 4) _15 = *(b + 4) ++ _6 = *(a + 5) _16 = *(b + 5) ++ _7 = *(a + 6) _17 = *(b + 6) ++ _8 = *(a + 7) _18 = *(b + 7) ++ ++ We find _111 = _1 + _11 and _115 = _5 - _15 are not isomorphic, ++ so we try VF = VF / 2. ++ ++ GROUP_1 GROUP_2 ++ _1 = *a _5 = *(a + 4) ++ _2 = *(a + 1) _6 = *(a + 5) ++ _3 = *(a + 2) _7 = *(a + 6) ++ _4 = *(a + 3) _8 = *(a + 7) ++ ++ GROUP_3 GROUP_4 ++ _11 = *b _15 = *(b + 4) ++ _12 = *(b + 1) _16 = *(b + 5) ++ _13 = *(b + 2) _17 = *(b + 6) ++ _14 = *(b + 3) _18 = *(b + 7) ++ ++ We first analyze group_1, and find all operations are isomorphic, then ++ replace stmts in group_1 with their use_stmts. Group_2 as well. ++ ++ GROUP_1 GROUP_2 ++ _111 = _1 + _11 _115 = _5 - _15 ++ _112 = _2 + _12 _116 = _6 - _16 ++ _113 = _3 + _13 _117 = _7 - _17 ++ _114 = _4 + _14 _118 = _8 - _18 ++ ++ When analyzing group_3 and group_4, we find their use_stmts are the same ++ as group_1 and group_2. So group_3 is regarded as being merged to group_1 ++ and group_4 being merged to group_2. In future procedures, we will skip ++ group_3 and group_4. ++ ++ We repeat such processing until operations are not isomorphic or searching ++ reaches MEM stmts. In our given case, searching ends up at a0, a1, a2 and ++ a3. 
*/ ++ ++static bool ++find_isomorphic_stmts (loop_vec_info vinfo, vec<gimple *> &stmts) ++{ ++ unsigned vf = get_max_vf (vinfo); ++ if (vf == 0) ++ return false; ++ auto_vec<vec<group_info> *> circular_queue; ++ /* Map of vector factor and corresponding vectorizing profit. */ ++ stmts_profit profit_map; ++ /* Map of cut_points and vector factor. */ ++ vf_stmts_profit_map candi_stmts; ++ bool reach_vdef = false; ++ while (vf > 2) ++ { ++ if (build_queue (vinfo, vf, circular_queue) == 0) ++ return false; ++ if (!bfs_find_isomer_stmts (circular_queue, profit_map, vf, reach_vdef)) ++ { ++ if (reach_vdef) ++ { ++ release_tmp_stmts (candi_stmts); ++ free_ginfos (circular_queue); ++ circular_queue.release (); ++ return false; ++ } ++ vf /= 2; ++ free_ginfos (circular_queue); ++ circular_queue.release (); ++ continue; ++ } ++ candi_stmts[vf] = profit_map; ++ free_ginfos (circular_queue); ++ vf /= 2; ++ circular_queue.release (); ++ } ++ return decide_stmts_by_profit (candi_stmts, stmts); ++} ++ ++/* Get iv from SEED_STMTS and make sure each seed_stmt has only one iv as index ++ and all indices are the same. */ ++ ++static tree ++find_index (vec<gimple *> seed_stmts) ++{ ++ if (seed_stmts.length () == 0) ++ return NULL; ++ bool found_index = false; ++ tree index = NULL; ++ unsigned ui = 0; ++ for (ui = 0; ui < seed_stmts.length (); ui++) ++ { ++ if (!gimple_vdef (seed_stmts[ui])) ++ return NULL; ++ tree lhs = gimple_assign_lhs (seed_stmts[ui]); ++ unsigned num_index = 0; ++ while (TREE_CODE (lhs) == ARRAY_REF) ++ { ++ if (TREE_CODE (TREE_OPERAND (lhs, 1)) == SSA_NAME) ++ { ++ num_index++; ++ if (num_index > 1) ++ return NULL; ++ if (index == NULL) ++ { ++ index = TREE_OPERAND (lhs, 1); ++ found_index = true; ++ } ++ else if (index != TREE_OPERAND (lhs, 1)) ++ return NULL; ++ } ++ lhs = TREE_OPERAND (lhs, 0); ++ } ++ if (!found_index) ++ return NULL; ++ } ++ return index; ++} ++ ++/* Check if expression of phi is an increment of a const. */ ++ ++static void ++check_phi_inc (struct vertex *v_phi, struct graph *rdg, bool &found_inc) ++{ ++ struct graph_edge *e_phi; ++ for (e_phi = v_phi->succ; e_phi; e_phi = e_phi->succ_next) ++ { ++ struct vertex *v_inc = &(rdg->vertices[e_phi->dest]); ++ if (!is_gimple_assign (RDGV_STMT (v_inc)) ++ || gimple_expr_code (RDGV_STMT (v_inc)) != PLUS_EXPR) ++ continue; ++ tree rhs1 = gimple_assign_rhs1 (RDGV_STMT (v_inc)); ++ tree rhs2 = gimple_assign_rhs2 (RDGV_STMT (v_inc)); ++ if (!(integer_onep (rhs1) || integer_onep (rhs2))) ++ continue; ++ struct graph_edge *e_inc; ++ /* Find cycle with only two vertices inc and phi: inc <--> phi. */ ++ bool found_cycle = false; ++ for (e_inc = v_inc->succ; e_inc; e_inc = e_inc->succ_next) ++ { ++ if (e_inc->dest == e_phi->src) ++ { ++ found_cycle = true; ++ break; ++ } ++ } ++ if (!found_cycle) ++ continue; ++ found_inc = true; ++ } ++} ++ ++/* Check if phi satisfies form like PHI <0, i>. */ ++ ++static inline bool ++iv_check_phi_stmt (gimple *phi_stmt) ++{ ++ return gimple_phi_num_args (phi_stmt) == 2 ++ && (integer_zerop (gimple_phi_arg_def (phi_stmt, 0)) ++ || integer_zerop (gimple_phi_arg_def (phi_stmt, 1))); ++} ++ ++/* Make sure the iteration variable is a phi. 
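++ That is, the index should be defined by a header phi of the canonical ++ form i_1 = PHI <0, i_2> whose increment i_2 = i_1 + 1 feeds back to it, ++ which is what IV_CHECK_PHI_STMT and CHECK_PHI_INC verify (the SSA names ++ here are illustrative).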
*/ ++ ++static tree ++get_iv_from_seed (struct graph *flow_only_rdg, vec<gimple *> seed_stmts) ++{ ++ tree index = find_index (seed_stmts); ++ if (index == NULL) ++ return NULL; ++ for (int i = 0; i < flow_only_rdg->n_vertices; i++) ++ { ++ struct vertex *v = &(flow_only_rdg->vertices[i]); ++ if (RDGV_STMT (v) != seed_stmts[0]) ++ continue; ++ struct graph_edge *e; ++ bool found_phi = false; ++ for (e = v->pred; e; e = e->pred_next) ++ { ++ struct vertex *v_phi = &(flow_only_rdg->vertices[e->src]); ++ gimple *phi_stmt = RDGV_STMT (v_phi); ++ if (gimple_code (phi_stmt) != GIMPLE_PHI ++ || gimple_phi_result (phi_stmt) != index) ++ continue; ++ if (!iv_check_phi_stmt (phi_stmt)) ++ return NULL; ++ /* Find inc expr in succ of phi. */ ++ bool found_inc = false; ++ check_phi_inc (v_phi, flow_only_rdg, found_inc); ++ if (!found_inc) ++ return NULL; ++ found_phi = true; ++ break; ++ } ++ if (!found_phi) ++ return NULL; ++ break; ++ } ++ return index; ++} ++ ++/* Do not distribute loop if vertices in ROOT_MAP have antidependence within ++ FLOW_ONLY_RDG. */ ++ ++static bool ++check_no_dependency (struct graph *flow_only_rdg, bitmap root_map) ++{ ++ bitmap_iterator bi; ++ unsigned ui; ++ auto_vec<unsigned, 16> visited_nodes; ++ auto_bitmap visited_map; ++ EXECUTE_IF_SET_IN_BITMAP (root_map, 0, ui, bi) ++ visited_nodes.safe_push (ui); ++ for (ui = 0; ui < visited_nodes.length (); ui++) ++ { ++ struct vertex *v = &(flow_only_rdg->vertices[visited_nodes[ui]]); ++ struct graph_edge *e; ++ for (e = v->succ; e; e = e->succ_next) ++ { ++ if (bitmap_bit_p (root_map, e->dest)) ++ return false; ++ if (bitmap_bit_p (visited_map, e->dest)) ++ continue; ++ visited_nodes.safe_push (e->dest); ++ bitmap_set_bit (visited_map, e->dest); ++ } ++ } ++ return true; ++} ++ ++/* Find isomorphic stmts from GROUPED_LOADS in VINFO and make sure ++ there is no dependency among those STMTS we found. */ ++ ++static unsigned ++get_cut_points (struct graph *flow_only_rdg, bitmap cut_points, ++ loop_vec_info vinfo) ++{ ++ unsigned n_stmts = 0; ++ ++ /* STMTS that may be CUT_POINTS. 
*/ ++ auto_vec<gimple *> stmts; ++ if (!find_isomorphic_stmts (vinfo, stmts)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "No temp array insertion: no isomorphic stmts" ++ " were found.\n"); ++ return 0; ++ } ++ ++ for (int i = 0; i < flow_only_rdg->n_vertices; i++) ++ { ++ if (stmts.contains (RDG_STMT (flow_only_rdg, i))) ++ bitmap_set_bit (cut_points, i); ++ } ++ n_stmts = bitmap_count_bits (cut_points); ++ ++ bool succ = check_no_dependency (flow_only_rdg, cut_points); ++ if (!succ) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "No temp array inserted: data dependency" ++ " among isomorphic stmts.\n"); ++ return 0; ++ } ++ return n_stmts; ++} ++ ++static void ++build_temp_array (struct vertex *v, gimple_stmt_iterator &gsi, ++ poly_uint64 array_extent, tree iv, ++ hash_set<tree> *tmp_array_vars, vec<gimple *> *transformed) ++{ ++ gimple *stmt = RDGV_STMT (v); ++ tree lhs = gimple_assign_lhs (stmt); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "original stmt:\t"); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS|TDF_MEMSYMS); ++ } ++ tree var_ssa = duplicate_ssa_name (lhs, stmt); ++ gimple_assign_set_lhs (stmt, var_ssa); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "changed to:\t"); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS | TDF_MEMSYMS); ++ } ++ gimple_set_uid (gsi_stmt (gsi), -1); ++ tree vect_elt_type = TREE_TYPE (lhs); ++ tree array_type = build_array_type_nelts (vect_elt_type, array_extent); ++ tree array = create_tmp_var (array_type); ++ tree array_ssa = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL); ++ tmp_array_vars->add (array_ssa); ++ gimple *store = gimple_build_assign (array_ssa, var_ssa); ++ tree new_vdef = make_ssa_name (gimple_vop (cfun), store); ++ gsi_insert_after (&gsi, store, GSI_NEW_STMT); ++ gimple_set_vdef (store, new_vdef); ++ transformed->safe_push (store); ++ gimple_set_uid (gsi_stmt (gsi), -1); ++ tree array_ssa2 = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL); ++ tmp_array_vars->add (array_ssa2); ++ gimple *load = gimple_build_assign (lhs, array_ssa2); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "insert stmt:\t"); ++ print_gimple_stmt (dump_file, store, 0, TDF_VOPS|TDF_MEMSYMS); ++ fprintf (dump_file, " and stmt:\t"); ++ print_gimple_stmt (dump_file, load, 0, TDF_VOPS|TDF_MEMSYMS); ++ } ++ gimple_set_vuse (load, new_vdef); ++ gsi_insert_after (&gsi, load, GSI_NEW_STMT); ++ gimple_set_uid (gsi_stmt (gsi), -1); ++} ++ ++/* Set bitmap PRODUCERS based on vec TRANSFORMED. */ ++ ++void ++loop_distribution::build_producers (loop_p loop, bitmap producers, ++ vec<gimple *> &transformed) ++{ ++ auto_vec<gimple *, 10> stmts; ++ stmts_from_loop (loop, &stmts); ++ int i = 0; ++ gimple *stmt = NULL; ++ ++ FOR_EACH_VEC_ELT (stmts, i, stmt) ++ gimple_set_uid (stmt, i); ++ i = 0; ++ FOR_EACH_VEC_ELT (transformed, i, stmt) ++ bitmap_set_bit (producers, stmt->uid); ++} ++ ++/* Transform stmt ++ ++ A = FOO (ARG_1); ++ ++ to ++ ++ STMT_1: A1 = FOO (ARG_1); ++ STMT_2: X[I] = A1; ++ STMT_3: A = X[I]; ++ ++ Producer is STMT_2 who defines the temp array and consumer is ++ STMT_3 who uses the temp array. 
*/ ++ ++void ++loop_distribution::do_insertion (loop_p loop, struct graph *flow_only_rdg, ++ tree iv, bitmap cut_points, ++ hash_set<tree> *tmp_array_vars, ++ bitmap producers) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "=== do insertion ===\n"); ++ ++ auto_vec<gimple *> transformed; ++ ++ /* Execution times of loop. */ ++ poly_uint64 array_extent ++ = tree_to_poly_uint64 (number_of_latch_executions (loop)) + 1; ++ ++ basic_block *bbs = get_loop_body_in_custom_order (loop, this, ++ bb_top_order_cmp_r); ++ ++ for (int i = 0; i < int (loop->num_nodes); i++) ++ { ++ basic_block bb = bbs[i]; ++ ++ /* Find all cut points in bb and transform them. */ ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ unsigned j = gimple_uid (gsi_stmt (gsi)); ++ if (bitmap_bit_p (cut_points, j)) ++ { ++ struct vertex *v = &(flow_only_rdg->vertices[j]); ++ build_temp_array (v, gsi, array_extent, iv, tmp_array_vars, ++ &transformed); ++ } ++ } ++ } ++ build_producers (loop, producers, transformed); ++ update_ssa (TODO_update_ssa); ++ free (bbs); ++} ++ ++/* After temp array insertion, given stmts ++ STMT_1: M = FOO (ARG_1); ++ STMT_2: X[I] = M; ++ STMT_3: A = X[I]; ++ STMT_2 is the producer, STMT_1 is its prev and STMT_3 is its next. ++ Replace M with A, and remove STMT_2 and STMT_3. */ ++ ++static void ++reset_gimple_assign (struct graph *flow_only_rdg, struct partition *partition, ++ gimple_stmt_iterator &gsi, int j) ++{ ++ struct vertex *v = &(flow_only_rdg->vertices[j]); ++ gimple *stmt = RDGV_STMT (v); ++ gimple *prev = stmt->prev; ++ gimple *next = stmt->next; ++ tree n_lhs = gimple_assign_lhs (next); ++ gimple_assign_set_lhs (prev, n_lhs); ++ unlink_stmt_vdef (stmt); ++ if (partition) ++ bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi))); ++ gsi_remove (&gsi, true); ++ release_defs (stmt); ++ if (partition) ++ bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi))); ++ gsi_remove (&gsi, true); ++} ++ ++void ++loop_distribution::remove_insertion (loop_p loop, struct graph *flow_only_rdg, ++ bitmap producers, struct partition *partition) ++{ ++ basic_block *bbs = get_loop_body_in_custom_order (loop, this, ++ bb_top_order_cmp_r); ++ for (int i = 0; i < int (loop->num_nodes); i++) ++ { ++ basic_block bb = bbs[i]; ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ unsigned j = gimple_uid (gsi_stmt (gsi)); ++ if (bitmap_bit_p (producers, j)) ++ reset_gimple_assign (flow_only_rdg, partition, gsi, j); ++ } ++ } ++ update_ssa (TODO_update_ssa); ++ free (bbs); ++} ++ ++/* Insert temp arrays if isomorphic computation exists. Temp arrays will be ++ regarded as SEED_STMTS for building partitions in succeeding processes. 
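++
++   For example (a sketch), if a loop computes c0[i], c1[i], c2[i] and c3[i]
++   with isomorphic expressions, four temp arrays may be inserted, and the
++   stores into them then seed four candidate partitions.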
*/
++
++bool
++loop_distribution::insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts,
++                                       hash_set<tree> *tmp_array_vars, bitmap producers)
++{
++  struct graph *flow_only_rdg = build_rdg (loop, NULL);
++  gcc_checking_assert (flow_only_rdg != NULL);
++  tree iv = get_iv_from_seed (flow_only_rdg, seed_stmts);
++  if (iv == NULL)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Loop %d no temp array insertion: failed to get"
++                 " iteration variable.\n", loop->num);
++      free_rdg (flow_only_rdg);
++      return false;
++    }
++  auto_bitmap cut_points;
++  loop_vec_info vinfo = loop_vec_info_for_loop (loop);
++  unsigned n_cut_points = get_cut_points (flow_only_rdg, cut_points, vinfo);
++  delete vinfo;
++  loop->aux = NULL;
++  if (n_cut_points == 0)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++        fprintf (dump_file, "Loop %d no temp array insertion: no cut points"
++                 " found.\n", loop->num);
++      free_rdg (flow_only_rdg);
++      return false;
++    }
++  do_insertion (loop, flow_only_rdg, iv, cut_points, tmp_array_vars, producers);
++  if (dump_enabled_p ())
++    {
++      dump_user_location_t loc = find_loop_location (loop);
++      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion done:"
++                       " %d temp arrays inserted in Loop %d.\n",
++                       n_cut_points, loop->num);
++    }
++  free_rdg (flow_only_rdg);
++  return true;
++}
++
++static bool find_seed_stmts_for_distribution (class loop *, vec<gimple *> *);
++
+ /* Distributes the code from LOOP in such a way that producer statements
+    are placed before consumer statements.  Tries to separate only the
+    statements from STMTS into separate loops.  Returns the number of
+@@ -2972,7 +4342,7 @@ loop_distribution::finalize_partitions (class loop *loop,
+ 
+ int
+ loop_distribution::distribute_loop (class loop *loop,
+-                                    const vec<gimple *> &stmts,
++                                    vec<gimple *> &stmts,
+                                     control_dependences *cd, int *nb_calls, bool *destroy_p,
+                                     bool only_patterns_p)
+ {
+@@ -3021,6 +4391,33 @@ loop_distribution::distribute_loop (class loop *loop,
+       return 0;
+     }
+ 
++  /* Try to distribute LOOP if LOOP is simple enough and cannot be
++     vectorized as is.  If LOOP has grouped loads, recursively find
++     isomorphic stmts and insert temp arrays, rebuild the RDG and call
++     find_seed_stmts_for_distribution to replace STMTS.  */
++
++  hash_set<tree> tmp_array_vars;
++
++  /* STMTs that define those inserted TMP_ARRAYs.  */
++  auto_bitmap producers;
++
++  /* New SEED_STMTS after insertion.
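++     They are collected by find_seed_stmts_for_distribution once the temp
++     arrays have been inserted.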
*/ ++ auto_vec<gimple *> work_list; ++ bool insert_success = false; ++ if (may_insert_temp_arrays (loop, rdg, cd)) ++ { ++ if (insert_temp_arrays (loop, stmts, &tmp_array_vars, producers)) ++ { ++ if (find_seed_stmts_for_distribution (loop, &work_list)) ++ { ++ insert_success = true; ++ } ++ else ++ remove_insertion (loop, rdg, producers, NULL); ++ rebuild_rdg (loop, rdg, cd); ++ } ++ } ++ + data_reference_p dref; + for (i = 0; datarefs_vec.iterate (i, &dref); ++i) + dref->aux = (void *) (uintptr_t) i; +@@ -3029,7 +4426,10 @@ loop_distribution::distribute_loop (class loop *loop, + dump_rdg (dump_file, rdg); + + auto_vec<struct partition *, 3> partitions; +- rdg_build_partitions (rdg, stmts, &partitions); ++ if (work_list.length() > stmts.length()) ++ rdg_build_partitions (rdg, &work_list, &partitions); ++ else ++ rdg_build_partitions (rdg, &stmts, &partitions); + + auto_vec<ddr_p> alias_ddrs; + +@@ -3101,7 +4501,7 @@ loop_distribution::distribute_loop (class loop *loop, + for (int j = i + 1; + partitions.iterate (j, &partition); ++j) + { +- if (share_memory_accesses (rdg, into, partition)) ++ if (share_memory_accesses (rdg, into, partition, &tmp_array_vars)) + { + partition_merge_into (rdg, into, partition, FUSE_SHARE_REF); + partitions.unordered_remove (j); +@@ -3151,7 +4551,7 @@ loop_distribution::distribute_loop (class loop *loop, + } + } + +- finalize_partitions (loop, &partitions, &alias_ddrs); ++ finalize_partitions (loop, &partitions, &alias_ddrs, producers); + + /* If there is a reduction in all partitions make sure the last one + is not classified for builtin code generation. */ +@@ -3169,6 +4569,24 @@ loop_distribution::distribute_loop (class loop *loop, + } + + nbp = partitions.length (); ++ ++ /* If we have inserted TMP_ARRAYs but there is only one partition left in ++ the succeeding processes, remove those inserted TMP_ARRAYs back to the ++ original version. */ ++ ++ if (nbp == 1 && insert_success) ++ { ++ struct partition *partition = NULL; ++ partitions.iterate (0, &partition); ++ remove_insertion (loop, rdg, producers, partition); ++ if (dump_enabled_p ()) ++ { ++ dump_user_location_t loc = find_loop_location (loop); ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion removed:" ++ " unable to distribute loop %d.\n", loop->num); ++ } ++ } ++ + if (nbp == 0 + || (nbp == 1 && !partition_builtin_p (partitions[0])) + || (nbp > 1 && partition_contains_all_rw (rdg, partitions))) +diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc +index 04e68f621..aae7f62f3 100644 +--- a/gcc/tree-vect-data-refs.cc ++++ b/gcc/tree-vect-data-refs.cc +@@ -2791,6 +2791,9 @@ vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info) + DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element; + + DR_GROUP_SIZE (stmt_info) = groupsize; ++ ++ DR_GROUP_SLP_TRANSPOSE (stmt_info) = false; ++ + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, +@@ -2820,6 +2823,20 @@ vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info) + DR_GROUP_GAP (stmt_info)); + } + ++ /* SLP: create an SLP data structure for every interleaving group of ++ loads for further analysis in vect_analyse_slp. */ ++ if (DR_IS_READ (dr) && !slp_impossible) ++ { ++ if (loop_vinfo) ++ { ++ LOOP_VINFO_GROUPED_LOADS (loop_vinfo).safe_push (stmt_info); ++ } ++ if (bb_vinfo) ++ { ++ BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (stmt_info); ++ } ++ } ++ + /* SLP: create an SLP data structure for every interleaving group of + stores for further analysis in vect_analyse_slp. 
*/
+   if (DR_IS_WRITE (dr) && !slp_impossible)
+@@ -5636,6 +5653,226 @@ vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
+     }
+ }
+ 
++/* Encoding the PERM_MASK_FIRST.  */
++
++static void
++vect_indices_encoding_first (tree vectype, unsigned int array_num,
++                             tree &perm_mask_high_first,
++                             tree &perm_mask_low_first)
++{
++  unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
++  vec_perm_builder sel (nelt, nelt, 1);
++  sel.quick_grow (nelt);
++  unsigned int group_num = nelt / array_num;
++  unsigned int index = 0;
++  unsigned int array = 0;
++  unsigned int group = 0;
++
++  /* The encoding has 1 pattern in the first stage.  */
++  for (array = 0; array < array_num / 2; array++)
++    {
++      for (group = 0; group < group_num * 2; group++)
++        {
++          sel[index++] = array + array_num * group;
++        }
++    }
++  vec_perm_indices indices (sel, 2, nelt);
++  perm_mask_high_first = vect_gen_perm_mask_checked (vectype, indices);
++
++  index = 0;
++  for (array = array_num / 2; array < array_num; array++)
++    {
++      for (group = 0; group < group_num * 2; group++)
++        {
++          sel[index++] = array + array_num * group;
++        }
++    }
++  indices.new_vector (sel, 2, nelt);
++  perm_mask_low_first = vect_gen_perm_mask_checked (vectype, indices);
++}
++
++/* Encoding the PERM_MASK.  */
++
++static void
++vect_indices_encoding (tree vectype, unsigned int array_num,
++                       tree &perm_mask_high, tree &perm_mask_low)
++{
++  unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
++  vec_perm_builder sel (nelt, nelt, 1);
++  sel.quick_grow (nelt);
++  unsigned int group_num = nelt / array_num;
++  unsigned int index = 0;
++  unsigned int array = 0;
++  unsigned int group = 0;
++
++  /* The encoding has 2 patterns in the following stages.  */
++  for (array = 0; array < array_num / 2; array++)
++    {
++      for (group = 0; group < group_num; group++)
++        {
++          sel[index++] = group + group_num * array;
++        }
++      for (group = 0; group < group_num; group++)
++        {
++          sel[index++] = nelt + group + group_num * array;
++        }
++    }
++  vec_perm_indices indices (sel, 2, nelt);
++  perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
++
++  index = 0;
++  for (array = array_num / 2; array < array_num; array++)
++    {
++      for (group = 0; group < group_num; group++)
++        {
++          sel[index++] = group + group_num * array;
++        }
++      for (group = 0; group < group_num; group++)
++        {
++          sel[index++] = nelt + group + group_num * array;
++        }
++    }
++  indices.new_vector (sel, 2, nelt);
++  perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
++}
++
++/* Function vect_transpose_store_chain.
++
++   Given a chain of interleaved stores in DR_CHAIN of LENGTH, coming from
++   ARRAY_NUM arrays; LENGTH must be a power of 2.  Generate
++   interleave_high/low stmts to reorder the data correctly for the stores.
++   Return the final references for stores in RESULT_CHAIN.  This function is
++   similar to vect_permute_store_chain (); we interleave the contents of the
++   vectors in their order.
++
++   E.g., LENGTH is 4, the scalar type is short (i.e., VF is 8) and ARRAY_NUM
++   is 4.  That is, the input is 4 vectors each containing 8 elements.
++   And 2 (VF / ARRAY_NUM) of the 8 elements come from the same array.  We
++   interleave the contents of the four vectors in their order.
We assign a number to each
++   element, so the input sequence is:
++
++     1st vec:   0  1  2  3  4  5  6  7
++     2nd vec:   8  9 10 11 12 13 14 15
++     3rd vec:  16 17 18 19 20 21 22 23
++     4th vec:  24 25 26 27 28 29 30 31
++
++   The output sequence should be:
++
++     1st vec:   0  4  8 12 16 20 24 28
++     2nd vec:   1  5  9 13 17 21 25 29
++     3rd vec:   2  6 10 14 18 22 26 30
++     4th vec:   3  7 11 15 19 23 27 31
++
++   In our example,
++   we get 2 (VF / ARRAY_NUM) elements together in every vector.
++
++     I1:  0  4  1  5  2  6  3  7
++     I2:  8 12  9 13 10 14 11 15
++     I3: 16 20 17 21 18 22 19 23
++     I4: 24 28 25 29 26 30 27 31
++
++   Then, we use interleave_high/low instructions to create such output.
++   Every 2 (VF / ARRAY_NUM) elements are regarded as a whole.  The
++   permutation is done in log LENGTH stages.
++
++     I1: interleave_high (1st vec, 3rd vec)
++     I2: interleave_low (1st vec, 3rd vec)
++     I3: interleave_high (2nd vec, 4th vec)
++     I4: interleave_low (2nd vec, 4th vec)
++
++   The first stage of the sequence should be:
++
++     I1:  0  4 16 20  1  5 17 21
++     I2:  2  6 18 22  3  7 19 23
++     I3:  8 12 24 28  9 13 25 29
++     I4: 10 14 26 30 11 15 27 31
++
++   After the following stage, i.e. the final result, the sequence is:
++
++     I1:  0  4  8 12 16 20 24 28
++     I2:  1  5  9 13 17 21 25 29
++     I3:  2  6 10 14 18 22 26 30
++     I4:  3  7 11 15 19 23 27 31.  */
++
++void
++vect_transpose_store_chain (vec_info *vinfo, vec<tree> dr_chain,
++                            unsigned int length, unsigned int array_num,
++                            stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
++                            vec<tree> *result_chain)
++{
++  gimple *perm_stmt = NULL;
++  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
++  tree perm_mask_low_first = NULL;
++  tree perm_mask_high_first = NULL;
++  tree perm_mask_low = NULL;
++  tree perm_mask_high = NULL;
++  unsigned int log_length = exact_log2 (length);
++
++  /* Only power of 2 is supported.  */
++  gcc_assert (pow2p_hwi (length));
++
++  /* The encoding has 2 types, one for the grouped pattern in the first
++     stage, another for the interleaved patterns in the following stages.
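++
++     E.g., assuming nelt = 8 and array_num = 4 (so group_num = 2), the four
++     masks computed below would be:
++       high_first = {0, 4, 8, 12, 1, 5, 9, 13}
++       low_first  = {2, 6, 10, 14, 3, 7, 11, 15}
++       high       = {0, 1, 8, 9, 2, 3, 10, 11}
++       low        = {4, 5, 12, 13, 6, 7, 14, 15}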
*/ ++ gcc_assert (array_num != 0); ++ ++ /* Create grouped stmt (in the first stage): ++ group = nelt / array_num; ++ high_first = VEC_PERM_EXPR <vect1, vect2, ++ {0, array_num, 2*array_num, ..., (2*group-1)*array_num, ++ 1, 1+array_num, 1+2*array_num, ..., 1+(2*group-1)*array_num, ++ ..., ++ array_num/2-1, (array_num/2-1)+array_num, ..., ++ (array_num/2-1)+(2*group-1)*array_num}> ++ low_first = VEC_PERM_EXPR <vect1, vect2, ++ {array_num/2, array_num/2+array_num, array_num/2+2*array_num, ++ ..., array_num/2+(2*group-1)*array_num, ++ array_num/2+1, array_num/2+1+array_num, ++ ..., array_num/2+1+(2*group-1)*array_num, ++ ..., ++ array_num-1, array_num-1+array_num, ++ ..., array_num-1+(2*group-1)*array_num}> */ ++ vect_indices_encoding_first (vectype, array_num, perm_mask_high_first, ++ perm_mask_low_first); ++ ++ /* Create interleaving stmt (in the following stages): ++ high = VEC_PERM_EXPR <vect1, vect2, {0, 1, ..., group-1, ++ nelt, nelt+1, ..., nelt+group-1, ++ group, group+1, ..., 2*group-1, ++ nelt+group, nelt+group+1, ..., nelt+2*group-1, ++ ...}> ++ low = VEC_PERM_EXPR <vect1, vect2, ++ {nelt/2, nelt/2+1, ..., nelt/2+group-1, ++ nelt*3/2, nelt*3/2+1, ..., nelt*3/2+group-1, ++ nelt/2+group, nelt/2+group+1, ..., nelt/2+2*group-1, ++ nelt*3/2+group, nelt*3/2+group+1, ..., nelt*3/2+2*group-1, ++ ...}> */ ++ vect_indices_encoding (vectype, array_num, perm_mask_high, perm_mask_low); ++ ++ for (unsigned int perm_time = 0; perm_time < log_length; perm_time++) ++ { ++ for (unsigned int index = 0; index < length / 2; index++) ++ { ++ tree vect1 = dr_chain[index]; ++ tree vect2 = dr_chain[index + length / 2]; ++ ++ tree high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); ++ perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, vect2, ++ perm_time == 0 ? perm_mask_high_first ++ : perm_mask_high); ++ vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); ++ (*result_chain)[2 * index] = high; ++ ++ tree low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); ++ perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, vect2, ++ perm_time == 0 ? perm_mask_low_first ++ : perm_mask_low); ++ vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi); ++ (*result_chain)[2 * index+1] = low; ++ } ++ memcpy (dr_chain.address (), result_chain->address (), ++ length * sizeof (tree)); ++ } ++} ++ + /* Function vect_setup_realignment + + This function is called when vectorizing an unaligned load using +diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc +index 3435f9378..f296e9415 100644 +--- a/gcc/tree-vect-loop.cc ++++ b/gcc/tree-vect-loop.cc +@@ -2856,7 +2856,7 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, + loop_vec_info main_loop_vinfo, + const vector_modes &vector_modes, unsigned &mode_i, + machine_mode &autodetected_vector_mode, +- bool &fatal) ++ bool &fatal, bool result_only_p) + { + loop_vec_info loop_vinfo + = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo); +@@ -2865,6 +2865,8 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, + loop_vinfo->vector_mode = vector_mode; + unsigned int suggested_unroll_factor = 1; + ++ /* Loop_vinfo for loop-distribution pass. */ ++ opt_loop_vec_info fail_loop_vinfo = opt_loop_vec_info::success (NULL); + /* Run the main analysis. 
*/
+   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
+                                         &suggested_unroll_factor);
+@@ -2933,7 +2935,21 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
+ 
+   if (!res)
+     {
+-      delete loop_vinfo;
++
++      /* If the current analysis shows that LOOP cannot be vectorized,
++         loop_vinfo will be deleted.  If LOOP is under ldist analysis,
++         back it up before it is deleted, and return it if all modes have
++         been analyzed and LOOP still fails to vectorize.  */
++      if (result_only_p && (mode_i == vector_modes.length ()
++          || autodetected_vector_mode == VOIDmode))
++        {
++          fail_loop_vinfo = opt_loop_vec_info::success (loop_vinfo);
++          loop->aux = (loop_vec_info) fail_loop_vinfo;
++        }
++      else
++        {
++          delete loop_vinfo;
++        }
+       if (fatal)
+         gcc_checking_assert (main_loop_vinfo == NULL);
+       return opt_loop_vec_info::propagate_failure (res);
+@@ -2946,9 +2962,11 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
+ 
+    Apply a set of analyses on LOOP, and create a loop_vec_info struct
+    for it.  The different analyses will record information in the
+-   loop_vec_info struct.  */
++   loop_vec_info struct.  When RESULT_ONLY_P is true, quit the analysis
++   once the loop is known to be vectorizable, and otherwise do not delete
++   the vinfo.  */
+ opt_loop_vec_info
+-vect_analyze_loop (class loop *loop, vec_info_shared *shared)
++vect_analyze_loop (class loop *loop, vec_info_shared *shared,
++                   bool result_only_p)
+ {
+   DUMP_VECT_SCOPE ("analyze_loop_nest");
+ 
+@@ -2996,6 +3014,12 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
+          && !unlimited_cost_model (loop));
+   machine_mode autodetected_vector_mode = VOIDmode;
+   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
++  /* Loop_vinfo for the loop-distribution pass.  */
++  opt_loop_vec_info fail_loop_vinfo = opt_loop_vec_info::success (NULL);
++  if (result_only_p)
++    {
++      vect_slp_init ();
++    }
+   unsigned int mode_i = 0;
+   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
+ 
+@@ -3019,10 +3043,16 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
+       opt_loop_vec_info loop_vinfo
+         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
+                                NULL, vector_modes, mode_i,
+-                               autodetected_vector_mode, fatal);
++                               autodetected_vector_mode, fatal, result_only_p);
+       if (fatal)
+         break;
+ 
++      if (result_only_p && (mode_i == vector_modes.length ()
++          || autodetected_vector_mode == VOIDmode))
++        {
++          return loop_vinfo;
++        }
++
+       if (loop_vinfo)
+         {
+           /* Analyzis has been successful so update the VF value.  The
+@@ -3132,7 +3162,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
+         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
+                                first_loop_vinfo,
+                                vector_modes, mode_i,
+-                               autodetected_vector_mode, fatal);
++                               autodetected_vector_mode, fatal, result_only_p);
+       if (fatal)
+         break;
+ 
+diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
+index e1bcab0f7..c0c15773d 100644
+--- a/gcc/tree-vect-patterns.cc
++++ b/gcc/tree-vect-patterns.cc
+@@ -5632,8 +5632,8 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
+      internal functions.
*/ + { vect_recog_gather_scatter_pattern, "gather_scatter" }, + { vect_recog_mask_conversion_pattern, "mask_conversion" }, +- { vect_recog_widen_plus_pattern, "widen_plus" }, +- { vect_recog_widen_minus_pattern, "widen_minus" }, ++ // { vect_recog_widen_plus_pattern, "widen_plus" }, ++ // { vect_recog_widen_minus_pattern, "widen_minus" }, + }; + + const unsigned int NUM_PATTERNS = ARRAY_SIZE (vect_vect_recog_func_ptrs); +diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc +index af477c31a..6cbf8085f 100644 +--- a/gcc/tree-vect-slp.cc ++++ b/gcc/tree-vect-slp.cc +@@ -49,6 +49,8 @@ along with GCC; see the file COPYING3. If not see + #include "tree-eh.h" + #include "tree-cfg.h" + #include "alloc-pool.h" ++#include "print-tree.h" ++#include "gimple-pretty-print.h" + + static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *, + slp_tree, stmt_vector_for_cost *); +@@ -994,6 +996,21 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, + } + + gcc_assert (vectype); ++ if (!STMT_VINFO_VECTYPE (stmt_info)) ++ STMT_VINFO_VECTYPE (stmt_info) = vectype; ++ if (dump_file) ++ { ++ fprintf (dump_file, "vect_build_slp_tree_1: %p\n", stmt_info); ++ print_gimple_stmt (dump_file, stmt, 0); ++ fprintf (dump_file, "vect_build_slp_tree_1: vectype="); ++ if (vectype) ++ print_generic_expr (dump_file, vectype); ++ fprintf (dump_file, "\n"); ++ fprintf (dump_file, "internal vectype="); ++ if (STMT_VINFO_VECTYPE (stmt_info)) ++ print_generic_expr (dump_file, STMT_VINFO_VECTYPE (stmt_info)); ++ fprintf (dump_file, "\n"); ++ } + + gcall *call_stmt = dyn_cast <gcall *> (stmt); + if (call_stmt) +@@ -1575,10 +1592,10 @@ vect_build_slp_tree (vec_info *vinfo, + dump_printf_loc (MSG_NOTE, vect_location, + "SLP discovery for node %p succeeded\n", res); + gcc_assert (res_ == res); +- res->max_nunits = this_max_nunits; ++ res_->max_nunits = this_max_nunits; + vect_update_max_nunits (max_nunits, this_max_nunits); + /* Keep a reference for the bst_map use. */ +- SLP_TREE_REF_COUNT (res)++; ++ SLP_TREE_REF_COUNT (res_)++; + } + return res_; + } +@@ -3190,8 +3207,10 @@ vect_build_slp_instance (vec_info *vinfo, + + /* For basic block SLP, try to break the group up into multiples of + a vector size. */ ++ bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo); + if (is_a <bb_vec_info> (vinfo) +- && (i > 1 && i < group_size)) ++ && (i > 1 && i < group_size) ++ && !bb_vinfo->transposed) + { + tree scalar_type + = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); +@@ -3301,84 +3320,1034 @@ vect_analyze_slp_instance (vec_info *vinfo, + scalar_stmts.create (DR_GROUP_SIZE (stmt_info)); + while (next_info) + { +- scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info)); +- next_info = DR_GROUP_NEXT_ELEMENT (next_info); ++ scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info)); ++ next_info = DR_GROUP_NEXT_ELEMENT (next_info); ++ } ++ } ++ else if (kind == slp_inst_kind_reduc_chain) ++ { ++ /* Collect the reduction stmts and store them in scalar_stmts. */ ++ scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info)); ++ while (next_info) ++ { ++ scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info)); ++ next_info = REDUC_GROUP_NEXT_ELEMENT (next_info); ++ } ++ /* Mark the first element of the reduction chain as reduction to properly ++ transform the node. In the reduction analysis phase only the last ++ element of the chain is marked as reduction. 
*/ ++ STMT_VINFO_DEF_TYPE (stmt_info) ++ = STMT_VINFO_DEF_TYPE (scalar_stmts.last ()); ++ STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) ++ = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); ++ } ++ else if (kind == slp_inst_kind_ctor) ++ { ++ tree rhs = gimple_assign_rhs1 (stmt_info->stmt); ++ tree val; ++ scalar_stmts.create (CONSTRUCTOR_NELTS (rhs)); ++ FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val) ++ { ++ stmt_vec_info def_info = vinfo->lookup_def (val); ++ def_info = vect_stmt_to_vectorize (def_info); ++ scalar_stmts.quick_push (def_info); ++ } ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Analyzing vectorizable constructor: %G\n", ++ stmt_info->stmt); ++ } ++ else if (kind == slp_inst_kind_reduc_group) ++ { ++ /* Collect reduction statements. */ ++ const vec<stmt_vec_info> &reductions ++ = as_a <loop_vec_info> (vinfo)->reductions; ++ scalar_stmts.create (reductions.length ()); ++ for (i = 0; reductions.iterate (i, &next_info); i++) ++ if ((STMT_VINFO_RELEVANT_P (next_info) ++ || STMT_VINFO_LIVE_P (next_info)) ++ /* ??? Make sure we didn't skip a conversion around a reduction ++ path. In that case we'd have to reverse engineer that conversion ++ stmt following the chain using reduc_idx and from the PHI ++ using reduc_def. */ ++ && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def) ++ scalar_stmts.quick_push (next_info); ++ /* If less than two were relevant/live there's nothing to SLP. */ ++ if (scalar_stmts.length () < 2) ++ return false; ++ } ++ else ++ gcc_unreachable (); ++ ++ vec<stmt_vec_info> roots = vNULL; ++ if (kind == slp_inst_kind_ctor) ++ { ++ roots.create (1); ++ roots.quick_push (stmt_info); ++ } ++ /* Build the tree for the SLP instance. */ ++ bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts, ++ roots, ++ max_tree_size, limit, bst_map, ++ kind == slp_inst_kind_store ++ ? stmt_info : NULL); ++ if (!res) ++ roots.release (); ++ ++ /* ??? If this is slp_inst_kind_store and the above succeeded here's ++ where we should do store group splitting. */ ++ ++ return res; ++} ++ ++static inline bool ++is_const_assign (stmt_vec_info store_elem) ++{ ++ if (store_elem == NULL) ++ { ++ gcc_unreachable (); ++ } ++ gimple *stmt = store_elem->stmt; ++ gimple_rhs_class rhs_class = gimple_assign_rhs_class (stmt); ++ return rhs_class == GIMPLE_SINGLE_RHS ++ && TREE_CONSTANT (gimple_assign_rhs1 (store_elem->stmt)); ++} ++ ++/* Push inits to INNERMOST_INITS and check const assign. */ ++ ++static bool ++record_innermost (vec<tree> &innermost_inits, ++ vec<tree> &innermost_offsets, ++ stmt_vec_info stmt_vinfo) ++{ ++ if (!stmt_vinfo) ++ { ++ return false; ++ } ++ stmt_vec_info next_info = stmt_vinfo; ++ while (next_info) ++ { ++ /* No need to vectorize constant assign in a transposed version. */ ++ if (is_const_assign (next_info)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "no need to vectorize, store is const assign: %G", ++ next_info->stmt); ++ } ++ return false; ++ } ++ innermost_inits.safe_push (STMT_VINFO_DR_INIT (next_info)); ++ innermost_offsets.safe_push (STMT_VINFO_DR_OFFSET (next_info)); ++ next_info = DR_GROUP_NEXT_ELEMENT (next_info); ++ } ++ return true; ++} ++ ++/* Compare inits to INNERMOST_INITS, return FALSE if inits do not match ++ the first grouped_store. And check const assign meanwhile. 
*/
++
++static bool
++compare_innermost (const vec<tree> &innermost_inits,
++                   const vec<tree> &innermost_offsets,
++                   stmt_vec_info stmt_vinfo)
++{
++  if (!stmt_vinfo || innermost_inits.length () != stmt_vinfo->size)
++    {
++      return false;
++    }
++  stmt_vec_info next_info = stmt_vinfo;
++  unsigned int i = 0;
++  while (next_info)
++    {
++      if (is_const_assign (next_info))
++        {
++          if (dump_enabled_p ())
++            {
++              dump_printf_loc (MSG_NOTE, vect_location,
++                               "no need to vectorize, store is const "
++                               "assign: %G", next_info->stmt);
++            }
++          return false;
++        }
++      if (innermost_inits[i] != STMT_VINFO_DR_INIT (next_info)
++          || innermost_offsets[i] != STMT_VINFO_DR_OFFSET (next_info))
++        {
++          return false;
++        }
++      next_info = DR_GROUP_NEXT_ELEMENT (next_info);
++      i++;
++    }
++  return true;
++}
++
++static bool
++check_same_bb (stmt_vec_info grp1, stmt_vec_info grp2)
++{
++  if (grp1->stmt->bb->index == grp2->stmt->bb->index)
++    {
++      return true;
++    }
++  return false;
++}
++
++/* Check if grouped stores are of the same type.
++   Input: t1/t2 = TREE_TYPE (gimple_assign_lhs (first_element->stmt))
++   Output: 0 if same, 1 or -1 otherwise.  */
++
++static int
++tree_type_cmp (const tree t1, const tree t2)
++{
++  gcc_checking_assert (t1 != NULL && t2 != NULL);
++  if (t1 != t2)
++    {
++      if (TREE_CODE (t1) != TREE_CODE (t2))
++        {
++          return TREE_CODE (t1) > TREE_CODE (t2) ? 1 : -1;
++        }
++      if (TYPE_UNSIGNED (t1) != TYPE_UNSIGNED (t2))
++        {
++          return TYPE_UNSIGNED (t1) > TYPE_UNSIGNED (t2) ? 1 : -1;
++        }
++      if (TYPE_PRECISION (t1) != TYPE_PRECISION (t2))
++        {
++          return TYPE_PRECISION (t1) > TYPE_PRECISION (t2) ? 1 : -1;
++        }
++    }
++  return 0;
++}
++
++/* Check if two grouped stores are of the same type so that we can analyze
++   them in one transpose group.  */
++static int
++check_same_store_type (stmt_vec_info grp1, stmt_vec_info grp2)
++{
++  if (grp1 == grp2)
++    {
++      return 0;
++    }
++  if (grp1->size != grp2->size)
++    {
++      return grp1->size > grp2->size ? 1 : -1;
++    }
++  tree lhs1 = gimple_assign_lhs (grp1->stmt);
++  tree lhs2 = gimple_assign_lhs (grp2->stmt);
++  if (TREE_CODE (lhs1) != TREE_CODE (lhs2))
++    {
++      return TREE_CODE (lhs1) > TREE_CODE (lhs2) ? 1 : -1;
++    }
++  tree grp_type1 = TREE_TYPE (gimple_assign_lhs (grp1->stmt));
++  tree grp_type2 = TREE_TYPE (gimple_assign_lhs (grp2->stmt));
++  int cmp = tree_type_cmp (grp_type1, grp_type2);
++  return cmp;
++}
++
++/* Sort grouped stores according to group_size and store_type.
++   Return 0 if same, 1 if grp1 > grp2, -1 otherwise.  */
++
++static int
++grouped_store_cmp (const void *grp1_, const void *grp2_)
++{
++  stmt_vec_info grp1 = *(stmt_vec_info *)const_cast<void *>(grp1_);
++  stmt_vec_info grp2 = *(stmt_vec_info *)const_cast<void *>(grp2_);
++  return check_same_store_type (grp1, grp2);
++}
++
++/* Transposing is based on permutation in registers.  Permutation requires
++   the vector length to be a power of 2 and to satisfy the vector mode.  */
++
++static inline bool
++check_filling_reg (stmt_vec_info current_element)
++{
++  if (current_element->size == 0)
++    {
++      return false;
++    }
++  /* If the gimple STMT was already vectorized in the vect pass, transpose
++     analysis cannot be conducted on it, so skip it.
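++     This is the case, e.g., when the stmt was turned into a vector stmt
++     earlier, so its lhs or rhs already has VECTOR_TYPE.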
*/
++  bool lhs_vectorized
++    = TREE_CODE (TREE_TYPE (gimple_get_lhs (current_element->stmt)))
++      == VECTOR_TYPE;
++  bool rhs_vectorized
++    = TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (current_element->stmt)))
++      == VECTOR_TYPE;
++  if (lhs_vectorized || rhs_vectorized)
++    {
++      return false;
++    }
++  unsigned int store_precision
++    = TYPE_PRECISION (TREE_TYPE (gimple_get_lhs (current_element->stmt)));
++  auto_vector_modes vector_modes;
++  targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
++  unsigned min_mode_size = -1u;
++  for (unsigned i = 0; i < vector_modes.length (); i++)
++    {
++      unsigned mode_bit_size = (GET_MODE_BITSIZE (vector_modes[i])).coeffs[0];
++      min_mode_size = mode_bit_size < min_mode_size
++                      ? mode_bit_size : min_mode_size;
++    }
++  return store_precision != 0
++         && pow2p_hwi (current_element->size)
++         && (current_element->size * store_precision % min_mode_size == 0);
++}
++
++/* Check if the previous groups are suitable to transpose; if not, set their
++   group number to -1, reduce grp_num and clear current_groups.
++   Otherwise, just clear current_groups.  */
++
++static void
++check_and_clear_groups (vec<stmt_vec_info> &current_groups,
++                        unsigned int &grp_num)
++{
++  stmt_vec_info first_element;
++  if (current_groups.length () == 1
++      || (current_groups.length () != 0
++          && !pow2p_hwi (current_groups.length ())))
++    {
++      while (current_groups.length () != 0)
++        {
++          first_element = current_groups.pop ();
++          first_element->group_number = -1;
++        }
++      grp_num--;
++    }
++  else
++    {
++      while (current_groups.length ())
++        {
++          current_groups.pop ();
++        }
++    }
++}
++
++
++/* Make sure that transpose slp vectorization is conducted only if the
++   grouped stores are one-dimensional array refs.  */
++
++static bool
++is_store_one_dim_array (gimple *stmt)
++{
++  tree op = gimple_get_lhs (stmt);
++  if (TREE_CODE (op) != ARRAY_REF)
++    return false;
++  return TREE_OPERAND_LENGTH (op) > 0
++         && TREE_OPERAND_LENGTH (TREE_OPERAND (op, 0)) == 0;
++}
++
++/* Set grouped_stores with similar MEM_REF to the same group and mark their
++   grp_num.  Groups with the same grp_num constitute the minimum unit for
++   analyzing transposition.  Return the number of such units.  */
++
++static unsigned
++vect_prepare_transpose (bb_vec_info bb_vinfo)
++{
++  stmt_vec_info current_element = NULL;
++  stmt_vec_info first_element = NULL;
++  unsigned int i = 0;
++  unsigned int grp_num = 0;
++  /* Use arrays to record MEM_REF data in different GROUPED_STORES.  */
++  auto_vec<tree> innermost_inits;
++  auto_vec<tree> innermost_offsets;
++
++  /* A set of stmt_vec_info with the same store type.  Analyze them if their
++     size is suitable to transpose.  */
++  auto_vec<stmt_vec_info> current_groups;
++
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, current_element)
++    {
++      /* If first_element exists, compare the current grouped_store to the
++         first one, and push current_element to current_groups if their
++         innermost MEM_REF behavior is similar.  */
++      if (first_element != NULL
++          && !check_same_store_type (first_element, current_element)
++          && compare_innermost (innermost_inits, innermost_offsets,
++                                current_element)
++          && check_same_bb (first_element, current_element))
++        {
++          current_groups.safe_push (current_element);
++          current_element->group_number = grp_num;
++          /* If current_element is the last element in grouped_stores, the
++             continue below would exit the loop and leave the last group
++             unanalyzed.
*/
++          if (i == bb_vinfo->grouped_stores.length () - 1)
++            {
++              check_and_clear_groups (current_groups, grp_num);
++            }
++          continue;
++        }
++      check_and_clear_groups (current_groups, grp_num);
++      innermost_inits.release ();
++      innermost_offsets.release ();
++      /* Beginning of a new group, to analyze whether its members can
++         constitute a unit for transpose analysis.  */
++      first_element = NULL;
++      if (is_store_one_dim_array (current_element->stmt)
++          && check_filling_reg (current_element)
++          && record_innermost (innermost_inits, innermost_offsets,
++                               current_element))
++        {
++          first_element = current_element;
++          current_groups.safe_push (current_element);
++          current_element->group_number = ++grp_num;
++          if (i == bb_vinfo->grouped_stores.length () - 1)
++            {
++              check_and_clear_groups (current_groups, grp_num);
++            }
++          continue;
++        }
++      current_element->group_number = -1;
++    }
++  return grp_num;
++}
++
++/* Return a flag to transpose grouped stores before building the SLP tree.  */
++
++static bool
++vect_may_transpose (bb_vec_info bb_vinfo)
++{
++  if (targetm.vectorize.vec_perm_const == NULL)
++    {
++      return false;
++    }
++
++  if (bb_vinfo->grouped_stores.length () < 2)
++    {
++      return false;
++    }
++
++  DUMP_VECT_SCOPE ("analyze if grouped stores may transpose to slp");
++  /* Sort grouped_stores according to size and type for function
++     vect_prepare_transpose ().  */
++  bb_vinfo->grouped_stores.qsort (grouped_store_cmp);
++
++  int groups = vect_prepare_transpose (bb_vinfo);
++  BB_VINFO_TRANS_GROUPS (bb_vinfo) = groups;
++  if (dump_enabled_p ())
++    dump_printf_loc (MSG_NOTE, vect_location,
++                     "%d groups to analyze transposed slp.\n", groups);
++  return groups != 0;
++}
++
++/* Get the base address of STMT_INFO.  */
++
++static tree
++get_op_base_address (stmt_vec_info stmt_info)
++{
++  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
++  tree op = DR_BASE_ADDRESS (dr);
++  while (TREE_OPERAND_LENGTH (op) > 0)
++    {
++      op = TREE_OPERAND (op, 0);
++    }
++  return op;
++}
++
++/* Compare the UIDs of the two stmt_infos STMTINFO_A and STMTINFO_B,
++   sorting them in ascending order.  */
++
++static int
++dr_group_cmp (const void *stmtinfo_a_, const void *stmtinfo_b_)
++{
++  stmt_vec_info stmtinfo_a
++    = *(stmt_vec_info *) const_cast<void *> (stmtinfo_a_);
++  stmt_vec_info stmtinfo_b
++    = *(stmt_vec_info *) const_cast<void *> (stmtinfo_b_);
++
++  /* Stabilize sort.  */
++  if (stmtinfo_a == stmtinfo_b)
++    {
++      return 0;
++    }
++  return gimple_uid (stmtinfo_a->stmt) < gimple_uid (stmtinfo_b->stmt) ? -1 : 1;
++}
++
++/* Find the first elements of the grouped loads which are required to be
++   merged.  */
++
++static void
++vect_slp_grouped_load_find (bb_vec_info bb_vinfo, vec<bool> &visited,
++                            vec<stmt_vec_info> &res)
++{
++  unsigned int i = 0;
++  stmt_vec_info merge_first_element = NULL;
++  stmt_vec_info first_element = NULL;
++  tree opa = NULL;
++  unsigned int grp_size_a = 0;
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, i, first_element)
++    {
++      if (visited[i])
++        {
++          continue;
++        }
++      if (!STMT_VINFO_GROUPED_ACCESS (first_element)
++          || !pow2p_hwi (DR_GROUP_SIZE (first_element)))
++        {
++          /* A non-conforming grouped load should be grouped separately.
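++             Such a load is one that is not a grouped access or whose group
++             size is not a power of two.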
*/ ++ if (merge_first_element == NULL) ++ { ++ visited[i] = true; ++ res.safe_push (first_element); ++ return; ++ } ++ } ++ if (merge_first_element == NULL) ++ { ++ merge_first_element = first_element; ++ opa = get_op_base_address (first_element); ++ grp_size_a = DR_GROUP_SIZE (first_element); ++ res.safe_push (first_element); ++ visited[i] = true; ++ continue; ++ } ++ ++ /* If the two first elements are of the same base address and group size, ++ these two grouped loads need to be merged. */ ++ tree opb = get_op_base_address (first_element); ++ unsigned int grp_size_b = DR_GROUP_SIZE (first_element); ++ if (opa == opb && grp_size_a == grp_size_b) ++ { ++ res.safe_push (first_element); ++ visited[i] = true; ++ } ++ } ++} ++ ++/* Merge the grouped loads that are found from ++ vect_slp_grouped_load_find (). */ ++ ++static stmt_vec_info ++vect_slp_grouped_load_merge (vec<stmt_vec_info> &res) ++{ ++ stmt_vec_info stmt_info = res[0]; ++ if (res.length () == 1) ++ { ++ return stmt_info; ++ } ++ unsigned int i = 0; ++ unsigned int size = DR_GROUP_SIZE (res[0]); ++ unsigned int new_group_size = size * res.length (); ++ stmt_vec_info first_element = NULL; ++ stmt_vec_info merge_first_element = NULL; ++ stmt_vec_info last_element = NULL; ++ FOR_EACH_VEC_ELT (res, i, first_element) ++ { ++ if (merge_first_element == NULL) ++ { ++ merge_first_element = first_element; ++ last_element = merge_first_element; ++ size = DR_GROUP_SIZE (merge_first_element); ++ } ++ ++ if (last_element != first_element ++ && !DR_GROUP_NEXT_ELEMENT (last_element)) ++ { ++ DR_GROUP_NEXT_ELEMENT (last_element) = first_element; ++ /* Store the gap from the previous member of the group. If there is ++ no gap in the access, DR_GROUP_GAP is always 1. */ ++ DR_GROUP_GAP_TRANS (first_element) = DR_GROUP_GAP (first_element); ++ DR_GROUP_GAP (first_element) = 1; ++ } ++ for (stmt_info = first_element; stmt_info; ++ stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info)) ++ { ++ DR_GROUP_FIRST_ELEMENT (stmt_info) = merge_first_element; ++ DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info); ++ DR_GROUP_SIZE (stmt_info) = new_group_size; ++ last_element = stmt_info; ++ } ++ } ++ DR_GROUP_SIZE (merge_first_element) = new_group_size; ++ DR_GROUP_SLP_TRANSPOSE (merge_first_element) = true; ++ DR_GROUP_NEXT_ELEMENT (last_element) = NULL; ++ return merge_first_element; ++} ++ ++/* Merge the grouped loads which have the same base address and group size. ++ For example, for grouped loads (opa_1, opa_2, opb_1, opb_2): ++ opa_1: a0->a1->a2->a3 ++ opa_2: a8->a9->a10->a11 ++ opb_1: b0->b1 ++ opb_2: b16->b17 ++ we can probably get two merged grouped loads: ++ opa: a0->a1->a2->a3->a8->a9->a10->a11 ++ opb: b0->b1->b16->b17. */ ++ ++static bool ++vect_merge_slp_grouped_loads (bb_vec_info bb_vinfo) ++{ ++ if (bb_vinfo->grouped_loads.length () <= 0) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "The number of grouped loads is 0.\n"); ++ } ++ return false; ++ } ++ bb_vinfo->grouped_loads.qsort (dr_group_cmp); ++ auto_vec<bool> visited (bb_vinfo->grouped_loads.length ()); ++ auto_vec<stmt_vec_info> grouped_loads_merge; ++ for (unsigned int i = 0; i < bb_vinfo->grouped_loads.length (); i++) ++ { ++ visited.safe_push (false); ++ } ++ while (1) ++ { ++ /* Find grouped loads which are required to merge. */ ++ auto_vec<stmt_vec_info> res; ++ vect_slp_grouped_load_find (bb_vinfo, visited, res); ++ if (res.is_empty ()) ++ { ++ break; ++ } ++ /* Merge the required grouped loads into one group. 
*/ ++ grouped_loads_merge.safe_push (vect_slp_grouped_load_merge (res)); ++ } ++ if (grouped_loads_merge.length () == bb_vinfo->grouped_loads.length ()) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "No grouped loads need to be merged.\n"); ++ } ++ return false; ++ } ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Merging grouped loads successfully.\n"); ++ } ++ BB_VINFO_GROUPED_LOADS (bb_vinfo).release (); ++ for (unsigned int i = 0; i < grouped_loads_merge.length (); i++) ++ { ++ BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (grouped_loads_merge[i]); ++ } ++ return true; ++} ++ ++/* Find the first elements of the grouped stores ++ which are required to transpose and merge. */ ++ ++static void ++vect_slp_grouped_store_find (bb_vec_info bb_vinfo, vec<bool> &visited, ++ vec<stmt_vec_info> &res) ++{ ++ stmt_vec_info first_element = NULL; ++ stmt_vec_info merge_first_element = NULL; ++ unsigned int k = 0; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element) ++ { ++ if (visited[k]) ++ { ++ continue; ++ } ++ /* Non-conforming grouped store should be grouped separately. */ ++ if (!STMT_VINFO_GROUPED_ACCESS (first_element) ++ || first_element->group_number == -1) ++ { ++ if (merge_first_element == NULL) ++ { ++ visited[k] = true; ++ res.safe_push (first_element); ++ return; ++ } ++ } ++ if (first_element->group_number != -1 ++ && merge_first_element == NULL) ++ { ++ merge_first_element = first_element; ++ } ++ if (merge_first_element->group_number == first_element->group_number) ++ { ++ visited[k] = true; ++ res.safe_push (first_element); ++ } ++ } ++} ++ ++/* Transpose and merge the grouped stores that are found from ++ vect_slp_grouped_store_find (). */ ++ ++static stmt_vec_info ++vect_slp_grouped_store_transform (vec<stmt_vec_info> &res) ++{ ++ stmt_vec_info stmt_info = res[0]; ++ if (res.length () == 1) ++ { ++ return stmt_info; ++ } ++ stmt_vec_info rearrange_first_element = stmt_info; ++ stmt_vec_info last_element = rearrange_first_element; ++ ++ unsigned int size = DR_GROUP_SIZE (rearrange_first_element); ++ unsigned int new_group_size = size * res.length (); ++ for (unsigned int i = 1; i < res.length (); i++) ++ { ++ /* Store the gap from the previous member of the group. If there is no ++ gap in the access, DR_GROUP_GAP is always 1. */ ++ DR_GROUP_GAP_TRANS (res[i]) = DR_GROUP_GAP (res[i]); ++ DR_GROUP_GAP (res[i]) = 1; ++ } ++ while (!res.is_empty ()) ++ { ++ stmt_info = res[0]; ++ res.ordered_remove (0); ++ if (DR_GROUP_NEXT_ELEMENT (stmt_info)) ++ { ++ res.safe_push (DR_GROUP_NEXT_ELEMENT (stmt_info)); ++ } ++ DR_GROUP_FIRST_ELEMENT (stmt_info) = rearrange_first_element; ++ DR_GROUP_NEXT_ELEMENT (last_element) = stmt_info; ++ DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info); ++ DR_GROUP_SIZE (stmt_info) = new_group_size; ++ last_element = stmt_info; ++ } ++ ++ DR_GROUP_SIZE (rearrange_first_element) = new_group_size; ++ DR_GROUP_SLP_TRANSPOSE (rearrange_first_element) = true; ++ DR_GROUP_NEXT_ELEMENT (last_element) = NULL; ++ return rearrange_first_element; ++} ++ ++/* Save the STMT_INFO in the grouped stores to BB_VINFO_SCALAR_STORES for ++ transposing back grouped stores. */ ++ ++static void ++get_scalar_stores (bb_vec_info bb_vinfo) ++{ ++ unsigned int k = 0; ++ stmt_vec_info first_element = NULL; ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element) ++ { ++ /* Filter the grouped store which is unnecessary for transposing. 
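++         Such a store either is not a grouped access or was not assigned a
++         transpose group (group_number == -1).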
*/
++      if (!STMT_VINFO_GROUPED_ACCESS (first_element)
++          || first_element->group_number == -1)
++        {
++          continue;
++        }
++      vec<stmt_vec_info> tmp_scalar_store;
++      tmp_scalar_store.create (DR_GROUP_SIZE (first_element));
++      for (stmt_vec_info stmt_info = first_element; stmt_info;
++           stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
++        {
++          tmp_scalar_store.safe_push (stmt_info);
++        }
++      BB_VINFO_SCALAR_STORES (bb_vinfo).safe_push (tmp_scalar_store);
++    }
++}
++
++/* Transpose and merge the grouped stores which have the same group number.
++   For example, for grouped stores (opa_0, opa_1, opa_2, opa_3):
++     opa_0: a00->a01->a02->a03
++     opa_1: a10->a11->a12->a13
++     opa_2: a20->a21->a22->a23
++     opa_3: a30->a31->a32->a33
++   we can probably get the merged grouped store:
++     opa: a00->a10->a20->a30
++          ->a01->a11->a21->a31
++          ->a02->a12->a22->a32
++          ->a03->a13->a23->a33.  */
++
++static bool
++vect_transform_slp_grouped_stores (bb_vec_info bb_vinfo)
++{
++  if (bb_vinfo->grouped_stores.length () <= 0)
++    {
++      if (dump_enabled_p ())
++        {
++          dump_printf_loc (MSG_NOTE, vect_location,
++                           "The number of grouped stores is 0.\n");
++        }
++      return false;
++    }
++
++  bb_vinfo->grouped_stores.qsort (dr_group_cmp);
++  auto_vec<stmt_vec_info> grouped_stores_merge;
++  auto_vec<bool> visited (bb_vinfo->grouped_stores.length ());
++  unsigned int i = 0;
++  for (i = 0; i < bb_vinfo->grouped_stores.length (); i++)
++    {
++      visited.safe_push (false);
++    }
++
++  /* Get scalar stores for the following transposition recovery.  */
++  get_scalar_stores (bb_vinfo);
++
++  while (1)
++    {
++      /* Find grouped stores which are required to transpose and merge.  */
++      auto_vec<stmt_vec_info> res;
++      vect_slp_grouped_store_find (bb_vinfo, visited, res);
++      if (res.is_empty ())
++        {
++          break;
++        }
++      /* Transpose and merge the required grouped stores into one group.  */
++      grouped_stores_merge.safe_push (vect_slp_grouped_store_transform (res));
++    }
++
++  BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
++  for (i = 0; i < grouped_stores_merge.length (); i++)
++    {
++      BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_merge[i]);
++    }
++
++  if (dump_enabled_p ())
++    {
++      dump_printf_loc (MSG_NOTE, vect_location,
++                       "Transposing grouped stores successfully.\n");
++    }
++  return true;
++}
++
++/* A helper function for vect_transform_back_slp_grouped_stores ():
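++   it rebuilds the original per-group store chains for the group of
++   FIRST_STMT_INFO from the scalar stores recorded in BB_VINFO.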
*/
++
++static auto_vec<stmt_vec_info>
++vect_transform_back_slp_grouped_store (bb_vec_info bb_vinfo,
++                                       stmt_vec_info first_stmt_info)
++{
++  auto_vec<stmt_vec_info> grouped_stores_split;
++  for (unsigned int i = 0; i < bb_vinfo->scalar_stores.length (); i++)
++    {
++      vec<stmt_vec_info> scalar_tmp = bb_vinfo->scalar_stores[i];
++      if (scalar_tmp.length () > 1
++          && scalar_tmp[0]->group_number != first_stmt_info->group_number)
++        {
++          continue;
++        }
++      stmt_vec_info cur_stmt_info = NULL;
++      stmt_vec_info cur_first_stmt_info = NULL;
++      stmt_vec_info last_stmt_info = NULL;
++      unsigned int k = 0;
++      FOR_EACH_VEC_ELT (scalar_tmp, k, cur_stmt_info)
++        {
++          if (k == 0)
++            {
++              cur_first_stmt_info = cur_stmt_info;
++              last_stmt_info = cur_stmt_info;
++            }
++          DR_GROUP_FIRST_ELEMENT (cur_stmt_info) = cur_first_stmt_info;
++          DR_GROUP_NEXT_ELEMENT (last_stmt_info) = cur_stmt_info;
++          last_stmt_info = cur_stmt_info;
++        }
++      DR_GROUP_SIZE (cur_first_stmt_info) = k;
++      DR_GROUP_NEXT_ELEMENT (last_stmt_info) = NULL;
++      if (first_stmt_info != cur_first_stmt_info)
++        {
++          DR_GROUP_GAP (cur_first_stmt_info)
++            = DR_GROUP_GAP_TRANS (cur_first_stmt_info);
++          DR_GROUP_SLP_TRANSPOSE (cur_first_stmt_info) = false;
++          DR_GROUP_NUMBER (cur_first_stmt_info) = -1;
++        }
++      grouped_stores_split.safe_push (cur_first_stmt_info);
++    }
++  return grouped_stores_split;
++}
++
++/* Transform the grouped store back.  */
++
++void
++vect_transform_back_slp_grouped_stores (bb_vec_info bb_vinfo,
++                                        stmt_vec_info first_stmt_info)
++{
++  if (first_stmt_info->group_number == -1)
++    {
++      return;
++    }
++  /* Transform back.  */
++  auto_vec<stmt_vec_info> grouped_stores_split
++    = vect_transform_back_slp_grouped_store (bb_vinfo, first_stmt_info);
++
++  /* Add the remaining grouped stores to grouped_stores_split.  */
++  stmt_vec_info first_element = NULL;
++  unsigned int i = 0;
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element)
++    {
++      if (first_element->group_number != first_stmt_info->group_number)
++        {
++          grouped_stores_split.safe_push (first_element);
++        }
++    }
++  DR_GROUP_SLP_TRANSPOSE (first_stmt_info) = false;
++  DR_GROUP_NUMBER (first_stmt_info) = -1;
++  BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
++  for (i = 0; i < grouped_stores_split.length (); i++)
++    {
++      BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_split[i]);
++    }
++}
++
++/* Function check_for_slp_vectype
++
++   Restriction for grouped stores by checking their vectype.
++   If the vectype of a grouped store has changed, it needs to be transformed
++   back.  If all grouped stores need to be transformed back, return FALSE.
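++
++   E.g., a merged group with DR_GROUP_SIZE_TRANS == 4 whose vectype has
++   8 subparts (say V8HI) cannot stay transposed, since 8 > 4, and is
++   transformed back.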
*/
++
++static bool
++check_for_slp_vectype (bb_vec_info bb_vinfo)
++{
++  if (dump_file)
++    fprintf (dump_file, "check_for_slp_vectype: enter\n");
++  stmt_vec_info first_element = NULL;
++  unsigned int i = 0;
++  int count = 0;
++  auto_vec<stmt_vec_info> grouped_stores_check;
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element)
++    {
++      grouped_stores_check.safe_push (first_element);
++    }
++  FOR_EACH_VEC_ELT (grouped_stores_check, i, first_element)
++    {
++      if (STMT_VINFO_GROUPED_ACCESS (first_element)
++          && first_element->group_number != -1)
++        {
++          unsigned int group_size_b
++            = DR_GROUP_SIZE_TRANS (first_element);
++          tree vectype = STMT_VINFO_VECTYPE (first_element);
++          gimple *stmt = STMT_VINFO_STMT (first_element);
++          tree lhs = gimple_get_lhs (stmt);
++          tree type = TREE_TYPE (lhs);
++#if 0
++          if (!vectype && !type)
++            {
++              if (dump_file)
++                fprintf (dump_file, "check_for_slp_vectype: no vectype/stmt type\n");
++              continue;
++            }
++
++          if (!vectype)
++            vectype = type;
++#endif
++          if (dump_file)
++            {
++              fprintf (dump_file, "check_for_slp_vectype: %p\n", first_element);
++              print_gimple_stmt (dump_file, stmt, 0);
++              fprintf (dump_file, "check_for_slp_vectype: vectype=");
++              if (vectype)
++                print_generic_expr (dump_file, vectype);
++              fprintf (dump_file, "\n");
++            }
++#if 0
++          if (!vectype || !VECTOR_TYPE_P (vectype))
++            continue;
++#endif
++          poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
++          if (nunits.to_constant () > group_size_b)
++            {
++              count++;
++              /* If the vectype has changed, this grouped store needs
++                 to be transformed back.  */
++              vect_transform_back_slp_grouped_stores (bb_vinfo, first_element);
++              if (dump_enabled_p ())
++                {
++                  dump_printf_loc (MSG_NOTE, vect_location,
++                                   "Not supported: only supported when"
++                                   " group_size is at least nunits.\n");
++                }
++            }
++        }
++    }
++  if (count == BB_VINFO_TRANS_GROUPS (bb_vinfo))
++    {
++      return false;
++    }
++  if (dump_file)
++    fprintf (dump_file, "check_for_slp_vectype: True\n");
++  return true;
++}
++
++/* Function check_for_dr_alignment
++
++   Check the alignment of the slp instance loads.
++   Return FALSE if a load cannot be vectorized.  */
++
++static bool
++check_for_dr_alignment (bb_vec_info bb_vinfo, slp_instance instance)
++{
++  slp_tree node = NULL;
++  unsigned int i = 0;
++  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
++    {
++      stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
++      dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
++      if (dump_file)
++        {
++          fprintf (dump_file, "check_for_dr_alignment: %p\n", first_stmt_info);
++
++          gimple *stmt = STMT_VINFO_STMT (first_stmt_info);
++          tree lhs = gimple_get_lhs (stmt);
++          tree type = TREE_TYPE (lhs);
++          print_gimple_stmt (dump_file, stmt, 0);
++        }
++
++      tree vectype = STMT_VINFO_VECTYPE (first_stmt_info);
++      int malign = dr_misalignment (first_dr_info, vectype);
++      enum dr_alignment_support supportable_dr_alignment
++        = vect_supportable_dr_alignment (bb_vinfo, first_dr_info,
++                                         vectype, malign);
++      if (supportable_dr_alignment == dr_explicit_realign_optimized
++          || supportable_dr_alignment == dr_explicit_realign)
++        {
++          return false;
+         }
+     }
+-    }
+-  else if (kind == slp_inst_kind_reduc_chain)
++  return true;
++}
++
++/* Initialize the slp_transpose flag before transposing.  */
++
++static void
++init_stmt_info_slp_transpose (bb_vec_info bb_vinfo)
++{
++  stmt_vec_info first_element = NULL;
++  unsigned int k = 0;
++  FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
+     {
+-      /* Collect the reduction stmts and store them in scalar_stmts.
*/ +- scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info)); +- while (next_info) ++ if (STMT_VINFO_GROUPED_ACCESS (first_element)) + { +- scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info)); +- next_info = REDUC_GROUP_NEXT_ELEMENT (next_info); ++ DR_GROUP_SLP_TRANSPOSE (first_element) = false; + } +- /* Mark the first element of the reduction chain as reduction to properly +- transform the node. In the reduction analysis phase only the last +- element of the chain is marked as reduction. */ +- STMT_VINFO_DEF_TYPE (stmt_info) +- = STMT_VINFO_DEF_TYPE (scalar_stmts.last ()); +- STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) +- = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ())); + } +- else if (kind == slp_inst_kind_ctor) ++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, k, first_element) + { +- tree rhs = gimple_assign_rhs1 (stmt_info->stmt); +- tree val; +- scalar_stmts.create (CONSTRUCTOR_NELTS (rhs)); +- FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val) ++ if (STMT_VINFO_GROUPED_ACCESS (first_element)) + { +- stmt_vec_info def_info = vinfo->lookup_def (val); +- def_info = vect_stmt_to_vectorize (def_info); +- scalar_stmts.quick_push (def_info); ++ DR_GROUP_SLP_TRANSPOSE (first_element) = false; + } +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, +- "Analyzing vectorizable constructor: %G\n", +- stmt_info->stmt); + } +- else if (kind == slp_inst_kind_reduc_group) ++} ++ ++/* Analyze and transpose the stmts before building the SLP tree. */ ++ ++static bool ++vect_analyze_transpose (bb_vec_info bb_vinfo) ++{ ++ DUMP_VECT_SCOPE ("vect_analyze_transpose"); ++ ++ if (!vect_may_transpose (bb_vinfo)) + { +- /* Collect reduction statements. */ +- const vec<stmt_vec_info> &reductions +- = as_a <loop_vec_info> (vinfo)->reductions; +- scalar_stmts.create (reductions.length ()); +- for (i = 0; reductions.iterate (i, &next_info); i++) +- if ((STMT_VINFO_RELEVANT_P (next_info) +- || STMT_VINFO_LIVE_P (next_info)) +- /* ??? Make sure we didn't skip a conversion around a reduction +- path. In that case we'd have to reverse engineer that conversion +- stmt following the chain using reduc_idx and from the PHI +- using reduc_def. */ +- && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def) +- scalar_stmts.quick_push (next_info); +- /* If less than two were relevant/live there's nothing to SLP. */ +- if (scalar_stmts.length () < 2) +- return false; ++ return false; + } +- else +- gcc_unreachable (); + +- vec<stmt_vec_info> roots = vNULL; +- if (kind == slp_inst_kind_ctor) ++ /* For basic block SLP, try to merge the grouped stores and loads ++ into one group. */ ++ init_stmt_info_slp_transpose (bb_vinfo); ++ if (vect_transform_slp_grouped_stores (bb_vinfo) ++ && vect_merge_slp_grouped_loads (bb_vinfo)) + { +- roots.create (1); +- roots.quick_push (stmt_info); ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Analysis succeeded with SLP transposed.\n"); ++ } ++ return true; + } +- /* Build the tree for the SLP instance. */ +- bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts, +- roots, +- max_tree_size, limit, bst_map, +- kind == slp_inst_kind_store +- ? stmt_info : NULL); +- if (!res) +- roots.release (); +- +- /* ??? If this is slp_inst_kind_store and the above succeeded here's +- where we should do store group splitting. 
*/ +- +- return res; ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Analysis failed with SLP transposed.\n"); ++ } ++ return false; + } + + /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP +@@ -4963,7 +5932,7 @@ vect_slp_analyze_operations (vec_info *vinfo) + /* Check we can vectorize the reduction. */ + || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc + && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))) +- { ++ { + slp_tree node = SLP_INSTANCE_TREE (instance); + stmt_vec_info stmt_info; + if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) +@@ -4975,7 +5944,7 @@ vect_slp_analyze_operations (vec_info *vinfo) + "removing SLP instance operations starting from: %G", + stmt_info->stmt); + vect_free_slp_instance (instance); +- vinfo->slp_instances.ordered_remove (i); ++ vinfo->slp_instances.ordered_remove (i); + cost_vec.release (); + while (!visited_vec.is_empty ()) + visited.remove (visited_vec.pop ()); +@@ -5204,7 +6173,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo, + gimple *orig_stmt = orig_stmt_info->stmt; + + /* If there is a non-vectorized use of the defs then the scalar +- stmt is kept live in which case we do not account it or any ++ stmt is kept live in which case we do not account it or any + required defs in the SLP children in the scalar cost. This + way we make the vectorization more costly when compared to + the scalar cost. */ +@@ -5481,7 +6450,11 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, + + vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; + +- if (dump_enabled_p ()) ++ BB_VINFO_VEC_INSIDE_COST (bb_vinfo) = vec_inside_cost; ++ BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo) = vec_outside_cost; ++ BB_VINFO_SCALAR_COST (bb_vinfo) = scalar_cost; ++ ++ if (!unlimited_cost_model (NULL) && dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Cost model analysis for part in loop %d:\n", sl); +@@ -5819,7 +6792,7 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, + if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL)) + { + if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: unhandled data-ref in basic " + "block.\n"); + return false; +@@ -5854,6 +6827,22 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, + + vect_pattern_recog (bb_vinfo); + ++ /* Transpose grouped stores and loads for better vectorizable version. */ ++ if (bb_vinfo->transposed) ++ { ++ if (!vect_analyze_transpose (bb_vinfo)) ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not vectorized: unhandled slp transposed in " ++ "basic block.\n"); ++ } ++ return false; ++ } ++ } ++ bb_vinfo->before_slp = true; ++ + /* Update store groups from pattern processing. */ + vect_fixup_store_groups_with_patterns (bb_vinfo); + +@@ -5872,6 +6861,20 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, + return false; + } + ++ /* Check if the vectype is suitable for SLP transposed. 
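++     Groups whose vectype has more subparts than the recorded transposed
++     group size cannot be handled and are transformed back; if that happens
++     to all groups, the transposed attempt is abandoned.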
*/
++  if (bb_vinfo->transposed && !check_for_slp_vectype (bb_vinfo))
++    {
++      if (dump_enabled_p ())
++        {
++          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++                           "Failed to SLP transposed in the basic block.\n");
++          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++                           "not vectorized: vectype is not suitable for "
++                           "SLP transposed in basic block.\n");
++        }
++      return false;
++    }
++
+   /* Optimize permutations.  */
+   vect_optimize_slp (bb_vinfo);
+ 
+@@ -5914,6 +6917,27 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
+   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
+     return false;
+ 
++  /* Check if the alignment is suitable for SLP transposed.  */
++  if (bb_vinfo->transposed)
++    {
++      for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); i++)
++        {
++          if (!check_for_dr_alignment (bb_vinfo, instance))
++            {
++              if (dump_enabled_p ())
++                {
++                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++                                   "Failed to SLP transposed in the basic "
++                                   "block.\n");
++                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++                                   "not vectorized: alignment is not suitable "
++                                   "for SLP transposed in basic block.\n");
++                }
++              return false;
++            }
++        }
++    }
++
+   if (!vect_slp_analyze_operations (bb_vinfo))
+     {
+       if (dump_enabled_p ())
+@@ -5923,7 +6947,88 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
+     }
+ 
+   vect_bb_partition_graph (bb_vinfo);
++  return true;
++}
++
++static bool
++may_new_transpose_bbvinfo (bb_vec_info bb_vinfo_ori, bool res_ori,
++                           loop_p orig_loop)
++{
++  /* If the flag is false or the slp analysis is broken before
++     vect_analyze_slp, we don't try to analyze the transposed SLP version.  */
++  if (!flag_tree_slp_transpose_vectorize
++      || !BB_VINFO_BEFORE_SLP (bb_vinfo_ori))
++    {
++      return false;
++    }
++
++  /* If the original bb_vinfo can't be vectorized, try to create a new
++     bb_vinfo of the transposed version.  */
++  if (!res_ori)
++    {
++      return true;
++    }
++
++  /* Calculate the cost of the original bb_vinfo.  */
++  if (unlimited_cost_model (NULL))
++    {
++      vec<slp_instance> &instances = BB_VINFO_SLP_INSTANCES (bb_vinfo_ori);
++      vect_bb_vectorization_profitable_p (bb_vinfo_ori, instances, orig_loop);
++    }
++  /* If the vector cost and the scalar cost do not differ much (here we set
++     the threshold to 4), we try to create a new bb_vinfo of the transposed
++     version (see the cost-model sketch after the patch trailer).  */
++  if (BB_VINFO_SCALAR_COST (bb_vinfo_ori)
++      < 4 * (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori)
++             + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori)))
++    {
++      return true;
++    }
++  return false;
++}
+ 
++static bool
++may_choose_transpose_bbvinfo (bb_vec_info bb_vinfo_trans, bool res_trans,
++                              bb_vec_info bb_vinfo_ori, bool res_ori,
++                              loop_p orig_loop)
++{
++  /* The original bb_vinfo is chosen if the transposed bb_vinfo
++     can't be vectorized.  */
++  if (!res_trans)
++    {
++      return false;
++    }
++  /* Calculate the cost of the transposed bb_vinfo. 
*/
++  if (unlimited_cost_model (NULL))
++    {
++      vec<slp_instance> &instances = BB_VINFO_SLP_INSTANCES (bb_vinfo_trans);
++      vect_bb_vectorization_profitable_p (bb_vinfo_trans, instances,
++                                          orig_loop);
++    }
++  int diff_bb_cost = -1;
++  int diff_bb_cost_trans = -1;
++  if (res_ori)
++    {
++      diff_bb_cost = BB_VINFO_SCALAR_COST (bb_vinfo_ori)
++                     - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori)
++                     - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori);
++    }
++  if (res_trans)
++    {
++      diff_bb_cost_trans = BB_VINFO_SCALAR_COST (bb_vinfo_trans)
++                           - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans)
++                           - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans);
++    }
++  /* The original bb_vinfo is chosen when one of the following conditions
++     is satisfied:
++     1) The cost of the original version is better than that of the
++        transposed version.
++     2) The vector cost is similar to the scalar cost in the transposed
++        version.  */
++  if ((res_ori && res_trans && diff_bb_cost >= diff_bb_cost_trans)
++      || (res_trans && BB_VINFO_SCALAR_COST (bb_vinfo_trans)
++                       <= (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans)
++                           + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans))))
++    {
++      return false;
++    }
+   return true;
+ }
+ 
+@@ -5937,6 +7042,7 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
+ 		loop_p orig_loop)
+ {
+   bb_vec_info bb_vinfo;
++  bb_vec_info bb_vinfo_trans = NULL;
+   auto_vector_modes vector_modes;
+ 
+   /* Autodetect first vector size we try.  */
+@@ -5951,6 +7057,10 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
+     {
+       bool vectorized = false;
+       bool fatal = false;
++      bool res_bb_vinfo_ori = false;
++      bool res_bb_vinfo_trans = false;
++
++      /* Create a new bb_vinfo of the original version.  */
+       bb_vinfo = new _bb_vec_info (bbs, &shared);
+ 
+       bool first_time_p = shared.datarefs.is_empty ();
+@@ -5960,8 +7070,113 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
+       else
+ 	bb_vinfo->shared->check_datarefs ();
+       bb_vinfo->vector_mode = next_vector_mode;
++      bb_vinfo->transposed = false;
++      bb_vinfo->before_slp = false;
++
++      res_bb_vinfo_ori = vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal,
++                                                dataref_groups);
++      auto_vec<slp_instance> profitable_subgraphs;
++      auto_vec<slp_instance> profitable_subgraphs_trans;
++      for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
++        {
++          if (instance->subgraph_entries.is_empty ())
++            continue;
++
++          vect_location = instance->location ();
++          if (!unlimited_cost_model (NULL)
++              && !vect_bb_vectorization_profitable_p
++                    (bb_vinfo, instance->subgraph_entries, orig_loop))
++            {
++              if (dump_enabled_p ())
++                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++                                 "not vectorized: vectorization is not "
++                                 "profitable.\n");
++              continue;
++            }
++          if (res_bb_vinfo_ori)
++            {
++              if (!dbg_cnt (vect_slp))
++                continue;
++              profitable_subgraphs.safe_push (instance);
++            }
++        }
++
++      /* Analyze and create a new transposed bb_vinfo. 
*/ ++ if (may_new_transpose_bbvinfo (bb_vinfo, res_bb_vinfo_ori, orig_loop)) ++ { ++ bool fatal_trans = false; ++ bb_vinfo_trans ++ = new _bb_vec_info (bbs, &shared); ++ bool first_time_p = shared.datarefs.is_empty (); ++ BB_VINFO_DATAREFS (bb_vinfo_trans) = datarefs; ++ if (first_time_p) ++ { ++ bb_vinfo_trans->shared->save_datarefs (); ++ } ++ else ++ { ++ bb_vinfo_trans->shared->check_datarefs (); ++ } ++ bb_vinfo_trans->vector_mode = next_vector_mode; ++ bb_vinfo_trans->transposed = true; ++ bb_vinfo_trans->before_slp = false; ++ ++ res_bb_vinfo_trans ++ = vect_slp_analyze_bb_1 (bb_vinfo_trans, n_stmts, fatal_trans, ++ dataref_groups); ++ if (res_bb_vinfo_trans) ++ { ++ for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo_trans)) ++ { ++ if (instance->subgraph_entries.is_empty ()) ++ continue; ++ ++ vect_location = instance->location (); ++ if (!unlimited_cost_model (NULL) ++ && !vect_bb_vectorization_profitable_p ++ (bb_vinfo_trans, instance->subgraph_entries, orig_loop)) ++ { ++ if (dump_enabled_p ()) ++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, ++ "not vectorized: transpose vectorization is not " ++ "profitable.\n"); ++ res_bb_vinfo_trans = false; ++ continue; ++ } ++ if (res_bb_vinfo_trans) ++ { ++ if (!dbg_cnt (vect_slp)) ++ continue; ++ profitable_subgraphs_trans.safe_push (instance); ++ } ++ } ++ } ++ if (may_choose_transpose_bbvinfo (bb_vinfo_trans, ++ res_bb_vinfo_trans, ++ bb_vinfo, res_bb_vinfo_ori, ++ orig_loop)) ++ { ++ bb_vinfo = bb_vinfo_trans; ++ fatal = fatal_trans; ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Basic block part vectorized " ++ "using transposed version.\n"); ++ } ++ } ++ else ++ { ++ if (dump_enabled_p ()) ++ { ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Basic block part vectorized " ++ "\n"); ++ } ++ } ++ } + +- if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)) ++ if (res_bb_vinfo_ori || res_bb_vinfo_trans) + { + if (dump_enabled_p ()) + { +@@ -5972,90 +7187,129 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, + } + + bb_vinfo->shared->check_datarefs (); +- +- auto_vec<slp_instance> profitable_subgraphs; +- for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo)) ++ if (!res_bb_vinfo_trans) + { +- if (instance->subgraph_entries.is_empty ()) +- continue; +- +- vect_location = instance->location (); +- if (!unlimited_cost_model (NULL) +- && !vect_bb_vectorization_profitable_p +- (bb_vinfo, instance->subgraph_entries, orig_loop)) ++ /* When we're vectorizing an if-converted loop body make sure ++ we vectorized all if-converted code. */ ++ if (!profitable_subgraphs.is_empty () ++ && orig_loop) + { +- if (dump_enabled_p ()) +- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +- "not vectorized: vectorization is not " +- "profitable.\n"); +- continue; ++ gcc_assert (bb_vinfo->bbs.length () == 1); ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]); ++ !gsi_end_p (gsi); gsi_next (&gsi)) ++ { ++ /* The costing above left us with DCEable vectorized scalar ++ stmts having the visited flag set on profitable ++ subgraphs. Do the delayed clearing of the flag here. 
*/ ++ if (gimple_visited_p (gsi_stmt (gsi))) ++ { ++ gimple_set_visited (gsi_stmt (gsi), false); ++ continue; ++ } ++ if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED) ++ continue; ++ ++ if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi))) ++ if (gimple_assign_rhs_code (ass) == COND_EXPR) ++ { ++ if (!profitable_subgraphs.is_empty () ++ && dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "not profitable because of " ++ "unprofitable if-converted scalar " ++ "code\n"); ++ profitable_subgraphs.truncate (0); ++ } ++ } + } + +- if (!dbg_cnt (vect_slp)) +- continue; ++ /* Finally schedule the profitable subgraphs. */ ++ for (slp_instance instance : profitable_subgraphs) ++ { ++ if (!vectorized && dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Basic block will be vectorized " ++ "using SLP\n"); ++ vectorized = true; + +- profitable_subgraphs.safe_push (instance); +- } ++ vect_schedule_slp (bb_vinfo, instance->subgraph_entries); + +- /* When we're vectorizing an if-converted loop body make sure +- we vectorized all if-converted code. */ +- if (!profitable_subgraphs.is_empty () +- && orig_loop) ++ unsigned HOST_WIDE_INT bytes; ++ if (dump_enabled_p ()) ++ { ++ if (GET_MODE_SIZE ++ (bb_vinfo->vector_mode).is_constant (&bytes)) ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, ++ "basic block part vectorized using %wu " ++ "byte vectors\n", bytes); ++ else ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, ++ "basic block part vectorized using " ++ "variable length vectors\n"); ++ } ++ } ++ } ++ else + { +- gcc_assert (bb_vinfo->bbs.length () == 1); +- for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]); +- !gsi_end_p (gsi); gsi_next (&gsi)) ++ if (!profitable_subgraphs_trans.is_empty () ++ && orig_loop) + { +- /* The costing above left us with DCEable vectorized scalar +- stmts having the visited flag set on profitable +- subgraphs. Do the delayed clearing of the flag here. */ +- if (gimple_visited_p (gsi_stmt (gsi))) ++ gcc_assert (bb_vinfo->bbs.length () == 1); ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]); ++ !gsi_end_p (gsi); gsi_next (&gsi)) + { +- gimple_set_visited (gsi_stmt (gsi), false); +- continue; ++ /* The costing above left us with DCEable vectorized scalar ++ stmts having the visited flag set on profitable ++ subgraphs. Do the delayed clearing of the flag here. */ ++ if (gimple_visited_p (gsi_stmt (gsi))) ++ { ++ gimple_set_visited (gsi_stmt (gsi), false); ++ continue; ++ } ++ if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED) ++ continue; ++ ++ if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi))) ++ if (gimple_assign_rhs_code (ass) == COND_EXPR) ++ { ++ if (!profitable_subgraphs_trans.is_empty () ++ && dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "not profitable because of " ++ "unprofitable if-converted scalar " ++ "code\n"); ++ profitable_subgraphs_trans.truncate (0); ++ } + } +- if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED) +- continue; +- +- if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi))) +- if (gimple_assign_rhs_code (ass) == COND_EXPR) +- { +- if (!profitable_subgraphs.is_empty () +- && dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, +- "not profitable because of " +- "unprofitable if-converted scalar " +- "code\n"); +- profitable_subgraphs.truncate (0); +- } + } +- } + +- /* Finally schedule the profitable subgraphs. 
*/ +- for (slp_instance instance : profitable_subgraphs) +- { +- if (!vectorized && dump_enabled_p ()) +- dump_printf_loc (MSG_NOTE, vect_location, +- "Basic block will be vectorized " +- "using SLP\n"); +- vectorized = true; ++ /* Finally schedule the profitable subgraphs. */ ++ for (slp_instance instance : profitable_subgraphs_trans) ++ { ++ if (!vectorized && dump_enabled_p ()) ++ dump_printf_loc (MSG_NOTE, vect_location, ++ "Basic block will be vectorized " ++ "using SLP\n"); ++ vectorized = true; + +- vect_schedule_slp (bb_vinfo, instance->subgraph_entries); ++ vect_schedule_slp (bb_vinfo, instance->subgraph_entries); + +- unsigned HOST_WIDE_INT bytes; +- if (dump_enabled_p ()) +- { +- if (GET_MODE_SIZE +- (bb_vinfo->vector_mode).is_constant (&bytes)) +- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, +- "basic block part vectorized using %wu " +- "byte vectors\n", bytes); +- else +- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, +- "basic block part vectorized using " +- "variable length vectors\n"); ++ unsigned HOST_WIDE_INT bytes; ++ if (dump_enabled_p ()) ++ { ++ if (GET_MODE_SIZE ++ (bb_vinfo->vector_mode).is_constant (&bytes)) ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, ++ "basic block part vectorized using %wu " ++ "byte vectors\n", bytes); ++ else ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, ++ "basic block part vectorized using " ++ "variable length vectors\n"); ++ } + } + } ++ + } + else + { +@@ -6081,6 +7335,10 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, + } + + delete bb_vinfo; ++ if (bb_vinfo_trans) ++ { ++ bb_vinfo_trans = NULL; ++ } + + if (mode_i < vector_modes.length () + && VECTOR_MODE_P (autodetected_vector_mode) +@@ -7244,10 +8502,17 @@ vect_schedule_slp_node (vec_info *vinfo, + ready early, vectorized stores go before the last scalar + stmt which is where all uses are ready. 
*/ + stmt_vec_info last_stmt_info = NULL; +- if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) +- last_stmt_info = vect_find_first_scalar_stmt_in_slp (node); +- else /* DR_IS_WRITE */ +- last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); ++ ++ if (DR_GROUP_FIRST_ELEMENT (stmt_info) ++ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info))) ++ last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); ++ else ++ { ++ if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) ++ last_stmt_info = vect_find_first_scalar_stmt_in_slp (node); ++ else /* DR_IS_WRITE */ ++ last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); ++ } + si = gsi_for_stmt (last_stmt_info->stmt); + } + else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type +diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc +index 349200411..3099f6743 100644 +--- a/gcc/tree-vect-stmts.cc ++++ b/gcc/tree-vect-stmts.cc +@@ -1369,10 +1369,10 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies, + + static void + vect_init_vector_1 (vec_info *vinfo, stmt_vec_info stmt_vinfo, gimple *new_stmt, +- gimple_stmt_iterator *gsi) ++ gimple_stmt_iterator *gsi, bool transpose=false) + { + if (gsi) +- vect_finish_stmt_generation (vinfo, stmt_vinfo, new_stmt, gsi); ++ vect_finish_stmt_generation (vinfo, stmt_vinfo, new_stmt, gsi, transpose); + else + vinfo->insert_on_entry (stmt_vinfo, new_stmt); + +@@ -1393,7 +1393,7 @@ vect_init_vector_1 (vec_info *vinfo, stmt_vec_info stmt_vinfo, gimple *new_stmt, + + tree + vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type, +- gimple_stmt_iterator *gsi) ++ gimple_stmt_iterator *gsi, bool transpose) + { + gimple *init_stmt; + tree new_temp; +@@ -1418,7 +1418,7 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type, + new_temp = make_ssa_name (TREE_TYPE (type)); + init_stmt = gimple_build_assign (new_temp, COND_EXPR, + val, true_val, false_val); +- vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi); ++ vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi, transpose); + val = new_temp; + } + } +@@ -1437,7 +1437,7 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type, + { + init_stmt = gsi_stmt (gsi2); + gsi_remove (&gsi2, false); +- vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi); ++ vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi, transpose); + } + } + } +@@ -1446,7 +1446,7 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type, + + new_temp = vect_get_new_ssa_name (type, vect_simple_var, "cst_"); + init_stmt = gimple_build_assign (new_temp, val); +- vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi); ++ vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi, transpose); + return new_temp; + } + +@@ -1572,9 +1572,11 @@ vect_get_vec_defs (vec_info *vinfo, stmt_vec_info stmt_info, slp_tree slp_node, + statement and create and return a stmt_vec_info for it. 
*/
+ 
+ static void
+-vect_finish_stmt_generation_1 (vec_info *,
+-			       stmt_vec_info stmt_info, gimple *vec_stmt)
++vect_finish_stmt_generation_1 (vec_info *vinfo,
++			       stmt_vec_info stmt_info, gimple *vec_stmt, bool transpose=false)
+ {
++  if (transpose)
++    stmt_vec_info vec_stmt_info = vinfo->add_pattern_stmt (vec_stmt, NULL);
+   if (dump_enabled_p ())
+     dump_printf_loc (MSG_NOTE, vect_location, "add new stmt: %G", vec_stmt);
+ 
+@@ -1616,7 +1618,7 @@ vect_finish_replace_stmt (vec_info *vinfo,
+ void
+ vect_finish_stmt_generation (vec_info *vinfo,
+ 			     stmt_vec_info stmt_info, gimple *vec_stmt,
+-			     gimple_stmt_iterator *gsi)
++			     gimple_stmt_iterator *gsi, bool transpose)
+ {
+   gcc_assert (!stmt_info || gimple_code (stmt_info->stmt) != GIMPLE_LABEL);
+ 
+@@ -1648,7 +1650,7 @@ vect_finish_stmt_generation (vec_info *vinfo,
+ 	}
+     }
+   gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
+-  vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt);
++  vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt, transpose);
+ }
+ 
+ /* We want to vectorize a call to combined function CFN with function
+@@ -2159,6 +2161,173 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
+   return NULL_TREE;
+ }
+ 
++/* Check the successor BBs; a BB without a load is regarded as an empty BB.
++   Empty BBs are ignored in the DFS.  */
++
++static unsigned
++mem_refs_in_bb (basic_block bb, vec<gimple *> &stmts)
++{
++  unsigned num = 0;
++  for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
++       !gsi_end_p (gsi); gsi_next (&gsi))
++    {
++      gimple *stmt = gsi_stmt (gsi);
++      if (is_gimple_debug (stmt))
++	continue;
++      if (is_gimple_assign (stmt) && gimple_has_mem_ops (stmt)
++	  && !gimple_has_volatile_ops (stmt))
++	{
++	  if (gimple_assign_rhs_code (stmt) == MEM_REF
++	      || gimple_assign_rhs_code (stmt) == ARRAY_REF)
++	    {
++	      stmts.safe_push (stmt);
++	      num++;
++	    }
++	  else if (TREE_CODE (gimple_get_lhs (stmt)) == MEM_REF
++		   || TREE_CODE (gimple_get_lhs (stmt)) == ARRAY_REF)
++	    num++;
++	}
++    }
++  return num;
++}
++
++static bool
++check_same_base (vec<data_reference_p> *datarefs, data_reference_p dr)
++{
++  for (unsigned ui = 0; ui < datarefs->length (); ui++)
++    {
++      tree op1 = TREE_OPERAND (DR_BASE_OBJECT (dr), 0);
++      tree op2 = TREE_OPERAND (DR_BASE_OBJECT ((*datarefs)[ui]), 0);
++      if (TREE_CODE (op1) != TREE_CODE (op2))
++	continue;
++      if (TREE_CODE (op1) == ADDR_EXPR)
++	{
++	  op1 = TREE_OPERAND (op1, 0);
++	  op2 = TREE_OPERAND (op2, 0);
++	}
++      enum tree_code code = TREE_CODE (op1);
++      switch (code)
++	{
++	  case VAR_DECL:
++	    if (DECL_NAME (op1) == DECL_NAME (op2)
++		&& DR_IS_READ ((*datarefs)[ui]))
++	      return true;
++	    break;
++	  case SSA_NAME:
++	    if (SSA_NAME_VERSION (op1) == SSA_NAME_VERSION (op2)
++		&& DR_IS_READ ((*datarefs)[ui]))
++	      return true;
++	    break;
++	  default:
++	    break;
++	}
++    }
++  return false;
++}
++
++/* Iterate over all load STMTS; if one satisfies a vectorized stmt with the
++   same base, return.  Otherwise, set SUCCESS to false. 
*/
++
++static void
++check_vec_use (loop_vec_info loop_vinfo, vec<gimple *> &stmts,
++	       stmt_vec_info stmt_info, bool &success)
++{
++  if (stmt_info == NULL)
++    {
++      success = false;
++      return;
++    }
++  if (DR_IS_READ (stmt_info->dr_aux.dr))
++    {
++      success = false;
++      return;
++    }
++  unsigned ui = 0;
++  gimple *candidate = NULL;
++  FOR_EACH_VEC_ELT (stmts, ui, candidate)
++    {
++      if (TREE_CODE (TREE_TYPE (gimple_get_lhs (candidate))) != VECTOR_TYPE)
++	continue;
++
++      if (candidate->bb != candidate->bb->loop_father->header)
++	{
++	  success = false;
++	  return;
++	}
++      auto_vec<data_reference_p> datarefs;
++      tree res = find_data_references_in_bb (candidate->bb->loop_father,
++					     candidate->bb, &datarefs);
++      if (res == chrec_dont_know)
++	{
++	  success = false;
++	  return;
++	}
++      if (check_same_base (&datarefs, stmt_info->dr_aux.dr))
++	return;
++    }
++  success = false;
++}
++
++/* Depth-first search from the present BB.  If a successor has load STMTS,
++   stop further searching.  */
++
++static void
++dfs_check_bb (loop_vec_info loop_vinfo, basic_block bb, stmt_vec_info stmt_info,
++	      bool &success, vec<basic_block> &visited_bbs)
++{
++  if (bb == cfun->cfg->x_exit_block_ptr)
++    {
++      success = false;
++      return;
++    }
++  if (!success || visited_bbs.contains (bb) || bb == loop_vinfo->loop->latch)
++    return;
++
++  visited_bbs.safe_push (bb);
++  auto_vec<gimple *> stmts;
++  unsigned num = mem_refs_in_bb (bb, stmts);
++  /* Empty BB.  */
++  if (num == 0)
++    {
++      edge e;
++      edge_iterator ei;
++      FOR_EACH_EDGE (e, ei, bb->succs)
++	{
++	  dfs_check_bb (loop_vinfo, e->dest, stmt_info, success, visited_bbs);
++	  if (!success)
++	    return;
++	}
++      return;
++    }
++  /* Non-empty BB.  */
++  check_vec_use (loop_vinfo, stmts, stmt_info, success);
++}
++
++/* For a grouped store, check whether all successors of the present BB have
++   a vectorized load from the same base as the store.  If so, set
++   memory_access_type to VMAT_CONTIGUOUS_PERMUTE instead of
++   VMAT_LOAD_STORE_LANES.  */
++
++static bool
++conti_perm (stmt_vec_info stmt_vinfo, loop_vec_info loop_vinfo)
++{
++  gimple *stmt = stmt_vinfo->stmt;
++  if (gimple_code (stmt) != GIMPLE_ASSIGN)
++    return false;
++
++  if (DR_IS_READ (stmt_vinfo->dr_aux.dr))
++    return false;
++
++  basic_block bb = stmt->bb;
++  bool success = true;
++  auto_vec<basic_block> visited_bbs;
++  visited_bbs.safe_push (bb);
++  edge e;
++  edge_iterator ei;
++  FOR_EACH_EDGE (e, ei, bb->succs)
++    dfs_check_bb (loop_vinfo, e->dest, stmt_vinfo, success, visited_bbs);
++  return success;
++}
++
+ /* A subroutine of get_load_store_type, with a subset of the same
+    arguments.  Handle the case where STMT_INFO is part of a grouped load
+    or store.
+@@ -2373,6 +2542,20 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
+ 	  *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
+ 	  overrun_p = would_overrun_p;
+ 	}
++
++      if (*memory_access_type == VMAT_LOAD_STORE_LANES
++	  && TREE_CODE (loop_vinfo->num_iters) == INTEGER_CST
++	  && maybe_eq (tree_to_shwi (loop_vinfo->num_iters),
++		       loop_vinfo->vectorization_factor)
++	  && conti_perm (stmt_info, loop_vinfo)
++	  && (vls_type == VLS_LOAD
++	      ? vect_grouped_load_supported (vectype, single_element_p,
++					     group_size)
++	      : vect_grouped_store_supported (vectype, group_size)))
++	{
++	  *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
++	  overrun_p = would_overrun_p;
++	}
+     }
+ 
+   /* As a last resort, trying using a gather load or scatter store. 
+@@ -7456,6 +7639,154 @@ vectorizable_scan_store (vec_info *vinfo,
+   return true;
+ }
+ 
++/* Function vect_permute_store_chains
++
++   Call function vect_permute_store_chain ().
++   Given a chain of interleaved stores in DR_CHAIN, generate
++   interleave_high/low stmts to reorder the data correctly.
++   Return the final references for stores in RESULT_CHAIN.  */
++
++static void
++vect_permute_store_chains (vec_info *vinfo, vec<tree> dr_chain,
++			   unsigned int num_each, stmt_vec_info stmt_info,
++			   gimple_stmt_iterator *gsi, vec<tree> *result_chain,
++			   unsigned int group)
++{
++  unsigned int k = 0;
++  unsigned int t = 0;
++
++  /* Divide the vectors into GROUP parts, and permute every NUM_EACH
++     vectors together.  */
++  for (k = 0; k < group; k++)
++    {
++      auto_vec<tree> dr_chain_transposed (num_each);
++      auto_vec<tree> result_chain_transposed (num_each);
++      for (t = k; t < dr_chain.length (); t = t + group)
++	{
++	  dr_chain_transposed.quick_push (dr_chain[t]);
++	}
++      vect_permute_store_chain (vinfo, dr_chain_transposed, num_each,
++				stmt_info, gsi, &result_chain_transposed);
++      for (t = 0; t < num_each; t++)
++	{
++	  result_chain->quick_push (result_chain_transposed[t]);
++	}
++    }
++}
++
++/* Function transpose_oprnd_store
++
++   Calculate the transposed results from VEC_OPRNDS (VEC_STMT)
++   for vectorizable_store.  */
++
++static void
++transpose_oprnd_store (vec_info *vinfo, vec<tree> vec_oprnds,
++		       vec<tree> *result_chain, unsigned int vec_num,
++		       unsigned int const_nunits, unsigned int array_num,
++		       stmt_vec_info first_stmt_info,
++		       gimple_stmt_iterator *gsi)
++{
++  unsigned int group_for_transform = 0;
++  unsigned int num_each = 0;
++
++  /* Transpose back for vec_oprnds.  */
++  /* vec = {vec1, vec2, ...}  */
++  if (array_num < const_nunits
++      && const_nunits % array_num == 0)
++    {
++      vect_transpose_store_chain (vinfo, vec_oprnds,
++				  vec_num, array_num,
++				  first_stmt_info,
++				  gsi, result_chain);
++    }
++  /* vec1 = {vec_part1}, vec2 = {vec_part2}, ...  */
++  else if (array_num >= const_nunits
++	   && array_num % const_nunits == 0)
++    {
++      group_for_transform = array_num / const_nunits;
++      num_each = vec_oprnds.length () / group_for_transform;
++      vect_permute_store_chains (vinfo, vec_oprnds,
++				 num_each, first_stmt_info,
++				 gsi, result_chain,
++				 group_for_transform);
++    }
++  else
++    {
++      gcc_unreachable ();
++    }
++}
++
++static dr_vec_info *
++get_dr_info (stmt_vec_info stmt_info)
++{
++  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
++  if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
++    {
++      SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
++    }
++  return dr_info;
++}
++
++static unsigned
++dr_align_vect_store (vec_info *vinfo, dr_vec_info *cur_first_dr_info,
++		     tree vectype, unsigned HOST_WIDE_INT &align)
++{
++  unsigned misalign = 0;
++  align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info));
++  if (aligned_access_p (cur_first_dr_info, vectype))
++    {
++      return misalign;
++    }
++  else if (cur_first_dr_info->misalignment == -1)
++    {
++      align = dr_alignment (vect_dr_behavior (vinfo, cur_first_dr_info));
++    }
++  else
++    {
++      misalign = cur_first_dr_info->misalignment;
++    }
++  return misalign;
++}
++
++static void
++add_new_stmt_vect_store (vec_info *vinfo, tree vectype, tree dataref_ptr,
++			 tree dataref_offset, tree ref_type,
++			 dr_vec_info *cur_first_dr_info, tree vec_oprnd,
++			 gimple_stmt_iterator *gsi, stmt_vec_info stmt_info)
++{
++  /* Data align. 
*/
++  unsigned HOST_WIDE_INT align;
++  unsigned misalign = dr_align_vect_store (vinfo, cur_first_dr_info,
++					   vectype, align);
++
++  if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME)
++    {
++      set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign);
++    }
++
++  /* Get data_ref.  */
++  tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0);
++  tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr, offset);
++  if (aligned_access_p (cur_first_dr_info, vectype))
++    {
++      ;
++    }
++  else if (cur_first_dr_info->misalignment == -1)
++    {
++      TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref),
++						 align * BITS_PER_UNIT);
++    }
++  else
++    {
++      tree elem_type = TREE_TYPE (vectype);
++      TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref),
++						 TYPE_ALIGN (elem_type));
++    }
++  /* Add new stmt.  */
++  vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr));
++  gassign *new_stmt = gimple_build_assign (data_ref, vec_oprnd);
++  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi, true);
++}
+ 
+ /* Function vectorizable_store.
+ 
+@@ -8333,6 +8664,16 @@ vectorizable_store (vec_info *vinfo,
+ 						  &vec_offsets);
+ 	      vec_offset = vec_offsets[0];
+ 	    }
++	  /* If the stmt_info needs transpose recovery, dataref_ptr
++	     will be calculated later.  */
++	  else if (memory_access_type == VMAT_CONTIGUOUS
++		   && is_a <bb_vec_info> (vinfo)
++		   && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++		   && DR_GROUP_SLP_TRANSPOSE (
++			DR_GROUP_FIRST_ELEMENT (stmt_info)))
++	    {
++	      dataref_ptr = NULL_TREE;
++	    }
+ 	  else
+ 	    dataref_ptr
+ 	      = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
+@@ -8423,6 +8764,75 @@ vectorizable_store (vec_info *vinfo,
+ 	}
+       else
+ 	{
++	  /* group_size: the size of the group after transposing and merging.
++	     group_size_b: the size of the group before transposing and
++	     merging, and only group_size_b >= const_nunits is supported.
++	     array_num: the number of arrays.
++	     const_nunits: TYPE_VECTOR_SUBPARTS (vectype).
++	     ncontinues: group_size_b / const_nunits, it means the number of
++	     times an array is stored in memory.  */
++	  if (slp && is_a <bb_vec_info> (vinfo)
++	      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++	      && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
++	    {
++	      if (dump_enabled_p ())
++		{
++		  dump_printf_loc (MSG_NOTE, vect_location,
++				   "vectorizable_store for slp transpose.\n");
++		}
++	      /* Transpose back for grouped stores.  */
++	      vect_transform_back_slp_grouped_stores (bb_vinfo,
++						      first_stmt_info);
++
++	      result_chain.create (vec_oprnds.length ());
++	      unsigned int const_nunits = nunits.to_constant ();
++	      unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info);
++	      unsigned int array_num = group_size / group_size_b;
++	      transpose_oprnd_store (vinfo, vec_oprnds, &result_chain, vec_num,
++				     const_nunits, array_num,
++				     first_stmt_info, gsi);
++
++	      /* For every store group, not for every vec, because transposing
++		 and merging have changed the data reference access.  */
++	      gcc_assert (group_size_b >= const_nunits);
++	      unsigned int ncontinues = group_size_b / const_nunits;
++
++	      unsigned int k = 0;
++	      for (i = 0; i < array_num; i++)
++		{
++		  stmt_vec_info first_stmt_b;
++		  BB_VINFO_GROUPED_STORES (vinfo).iterate (i, &first_stmt_b);
++		  bool simd_lane_access_p
++		    = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_b) != 0;
++		  tree ref_type = get_group_alias_ptr_type (first_stmt_b);
++		  dataref_ptr = vect_create_data_ref_ptr (
++				  vinfo, first_stmt_b, aggr_type,
++				  simd_lane_access_p ? 
loop : NULL,
++				  offset, &dummy, gsi, &ptr_incr,
++				  simd_lane_access_p, bump);
++		  dr_vec_info *cur_first_dr_info = get_dr_info (first_stmt_b);
++		  for (unsigned int t = 0; t < ncontinues; t++)
++		    {
++		      vec_oprnd = result_chain[k];
++		      k++;
++		      if (t > 0)
++			{
++			  /* Bump the vector pointer.  */
++			  dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr,
++							 ptr_incr, gsi,
++							 first_stmt_b, bump);
++			}
++		      add_new_stmt_vect_store (vinfo, vectype, dataref_ptr,
++					       dataref_offset, ref_type,
++					       cur_first_dr_info, vec_oprnd,
++					       gsi, first_stmt_b);
++		    }
++		}
++	      oprnds.release ();
++	      result_chain.release ();
++	      vec_oprnds.release ();
++	      return true;
++	    }
+ 	  new_stmt = NULL;
+ 	  if (grouped_store)
+ 	    {
+@@ -8719,6 +9129,451 @@ hoist_defs_of_uses (stmt_vec_info stmt_info, class loop *loop)
+   return true;
+ }
+ 
++static tree
++calculate_new_type (tree vectype, unsigned int const_nunits,
++		    unsigned int group_size_b, unsigned int &nloads,
++		    unsigned int &ncontinues, tree &lvectype)
++{
++  tree ltype = TREE_TYPE (vectype);
++  /* nloads is the number of ARRAYs in a vector.
++     vectemp = {a[], b[], ...}  */
++  if (group_size_b < const_nunits)
++    {
++      tree ptype;
++      tree vtype
++	= vector_vector_composition_type (vectype,
++					  const_nunits / group_size_b,
++					  &ptype);
++      if (vtype != NULL_TREE)
++	{
++	  nloads = const_nunits / group_size_b;
++	  lvectype = vtype;
++	  ltype = ptype;
++	  ncontinues = 1;
++	}
++    }
++  /* ncontinues is the number of vectors from an ARRAY.
++     vectemp1 = {a[0], a[1], ...}
++     ...
++     vectempm = {a[k], a[k+1], ...}  */
++  else
++    {
++      nloads = 1;
++      ltype = vectype;
++      ncontinues = group_size_b / const_nunits;
++    }
++  ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
++  return ltype;
++}
++
++static void
++generate_old_load_permutations (slp_tree slp_node, unsigned int group_size,
++				vec<unsigned> &old_load_permutation)
++{
++  /* Generate the old load permutations from the slp_node.  */
++  unsigned i = 0;
++  unsigned k = 0;
++
++  /* If SLP_NODE has load_permutation, we copy it to old_load_permutation.
++     Otherwise, we generate a permutation sequentially.  */
++  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
++    {
++      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), i, k)
++	{
++	  old_load_permutation.safe_push (k);
++	}
++    }
++  else
++    {
++      for (unsigned i = 0; i < group_size; i++)
++	{
++	  old_load_permutation.safe_push (i);
++	}
++    }
++}
++
++static void
++generate_new_load_permutation_mapping (unsigned slp_node_length,
++				       vec<unsigned> &group_idx,
++				       const vec<unsigned> &load_permutation,
++				       unsigned int group_size_b,
++				       unsigned &new_group_size,
++				       vec<unsigned> &group_from)
++{
++  /* group_num_vec: only stores the group_loads IDs which are calculated
++     from load_permutation.  */
++  auto_vec<unsigned> group_num_vec;
++
++  /* Calculate which group_loads the stmts in SLP_NODE come from.  */
++  unsigned i = 0;
++  unsigned k = 0;
++  FOR_EACH_VEC_ELT (load_permutation, i, k)
++    {
++      unsigned int t0 = k / group_size_b;
++      if (!group_num_vec.contains (t0))
++	{
++	  group_num_vec.safe_push (t0);
++	}
++      group_from.safe_push (t0);
++    }
++  group_num_vec.qsort (cmp_for_group_num);
++  /* n_groups: the number of group_loads.  */
++  unsigned int n_groups = group_num_vec.length ();
++  new_group_size = n_groups * group_size_b;
++  for (i = 0; i < n_groups; i++)
++    {
++      group_idx.safe_push (group_num_vec[i] * group_size_b);
++    }
++  /* A new mapping from group_ind_vec to group_from. 
++     For example:
++       Origin:        group_from = {1,1,3,3,5,5,7,7};
++       After mapping: group_from = {0,0,1,1,2,2,3,3};
++     (see the standalone sketch after the patch trailer).  */
++  auto_vec<unsigned> group_ind_vec (n_groups);
++  for (k = 0; k < n_groups; k++)
++    {
++      group_ind_vec.safe_push (k);
++    }
++  for (i = 0; i < slp_node_length; i++)
++    {
++      for (k = 0; k < n_groups; k++)
++	{
++	  if (group_from[i] == group_num_vec[k])
++	    {
++	      group_from[i] = group_ind_vec[k];
++	      break;
++	    }
++	}
++    }
++}
++
++static void
++generate_new_load_permutation (vec<unsigned> &new_load_permutation,
++			       const vec<unsigned> &old_load_permutation,
++			       slp_tree slp_node, bool &this_load_permuted,
++			       const vec<unsigned> &group_from,
++			       unsigned int group_size_b)
++{
++  unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length ();
++  /* Generate the new load permutation from the new mapping.  */
++  new_load_permutation.create (slp_node_length);
++  unsigned i = 0;
++  unsigned k = 0;
++  FOR_EACH_VEC_ELT (old_load_permutation, i, k)
++    {
++      /* t1 is the new permutation of k in the old permutation.
++	 t1 = base_address + offset:
++	 base_address = group_from[i] * group_size_b;
++	 offset = k % group_size_b.  */
++      unsigned int t1
++	= group_from[i] * group_size_b + k % group_size_b;
++      new_load_permutation.safe_push (t1);
++      if (t1 != k)
++	{
++	  this_load_permuted = true;
++	}
++    }
++}
++
++static bool
++is_slp_perm (bool slp_perm, bool this_load_permuted, poly_uint64 nunits,
++	     unsigned int group_size, stmt_vec_info first_stmt_info)
++{
++  /* Calculate the unrolling factor based on the smallest type.  */
++  poly_uint64 unrolling_factor
++    = exact_div (common_multiple (nunits, group_size), group_size);
++  /* The load requires permutation when unrolling exposes
++     a gap either because the group is larger than the SLP
++     group-size or because there is a gap between the groups.  */
++  if (!slp_perm && !this_load_permuted
++      && (known_eq (unrolling_factor, 1U)
++	  || (group_size == DR_GROUP_SIZE (first_stmt_info)
++	      && DR_GROUP_GAP (first_stmt_info) == 0)))
++    {
++      return false;
++    }
++  else
++    {
++      return true;
++    }
++}
++
++static void
++generate_load_permutation (slp_tree slp_node, unsigned &new_group_size,
++			   unsigned int group_size, unsigned int group_size_b,
++			   bool &this_load_permuted, vec<unsigned> &group_idx,
++			   vec<unsigned> &new_load_permutation)
++{
++  /* Generate the old load permutations from SLP_NODE.  */
++  vec<unsigned> old_load_permutation;
++  old_load_permutation.create (group_size);
++  generate_old_load_permutations (slp_node, group_size, old_load_permutation);
++
++  /* Calculate which group_loads the stmts in SLP_NODE come from.  */
++  unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length ();
++  /* group_from: stores the group_loads ID for every stmt in SLP_NODE.  */
++  vec<unsigned> group_from;
++  group_from.create (slp_node_length);
++  generate_new_load_permutation_mapping (slp_node_length, group_idx,
++					 old_load_permutation,
++					 group_size_b, new_group_size,
++					 group_from);
++
++  /* Generate the new load permutation from the new mapping and calculate
++     the this_load_permuted flag.  If this_load_permuted is true, we need
++     to execute the SLP permutation using the new load permutation. 
*/ ++ generate_new_load_permutation (new_load_permutation, old_load_permutation, ++ slp_node, this_load_permuted, group_from, ++ group_size_b); ++ old_load_permutation.release (); ++ group_from.release (); ++} ++ ++static unsigned int ++dr_align_vect_load (vec_info *vinfo, dr_vec_info *cur_first_dr_info, ++ tree vectype, unsigned HOST_WIDE_INT &align, ++ enum dr_alignment_support alignment_support_scheme) ++{ ++ unsigned int misalign = 0; ++ ++ align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info)); ++ if (alignment_support_scheme == dr_aligned) ++ { ++ gcc_assert (aligned_access_p (cur_first_dr_info, vectype)); ++ } ++ else if (cur_first_dr_info->misalignment == -1) ++ { ++ align = dr_alignment (vect_dr_behavior (vinfo, cur_first_dr_info)); ++ } ++ else ++ { ++ misalign = cur_first_dr_info->misalignment; ++ } ++ return misalign; ++} ++ ++static stmt_vec_info ++add_new_stmt_vect_load (vec_info *vinfo, tree vectype, tree dataref_ptr, ++ tree dataref_offset, tree ref_type, tree ltype, ++ gassign *(&new_stmt), dr_vec_info *cur_first_dr_info, ++ gimple_stmt_iterator *gsi, stmt_vec_info stmt_info) ++{ ++ /* Data align. */ ++ int malign = dr_misalignment (cur_first_dr_info, vectype); ++ enum dr_alignment_support alignment_support_scheme ++ = vect_supportable_dr_alignment (vinfo, cur_first_dr_info, ++ vectype, malign); ++ unsigned HOST_WIDE_INT align; ++ unsigned int misalign = dr_align_vect_load (vinfo, cur_first_dr_info, ++ vectype, align, ++ alignment_support_scheme); ++ if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME) ++ { ++ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign); ++ } ++ ++ /* Get data_ref. */ ++ tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0); ++ tree data_ref = fold_build2 (MEM_REF, ltype, dataref_ptr, offset); ++ if (alignment_support_scheme == dr_aligned) ++ { ++ ; ++ } ++ else if (cur_first_dr_info->misalignment == -1) ++ { ++ TREE_TYPE (data_ref) ++ = build_aligned_type (TREE_TYPE (data_ref), align * BITS_PER_UNIT); ++ } ++ else ++ { ++ tree elem_type = TREE_TYPE (vectype); ++ TREE_TYPE (data_ref) ++ = build_aligned_type (TREE_TYPE (data_ref), TYPE_ALIGN (elem_type)); ++ } ++ ++ /* Add new stmt. 
*/
++  vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr));
++  new_stmt = gimple_build_assign (make_ssa_name (ltype), data_ref);
++  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi, true);
++  stmt_vec_info vec_stmt_info = vinfo->lookup_stmt (new_stmt);
++  return vec_stmt_info;
++}
++
++static void
++push_new_stmt_to_dr_chain (bool slp_perm, stmt_vec_info new_stmt_info,
++			   vec<tree> dr_chain, slp_tree slp_node)
++{
++  if (slp_perm)
++    dr_chain.quick_push (gimple_assign_lhs (new_stmt_info->stmt));
++  else
++    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info->stmt);
++}
++
++static stmt_vec_info
++get_first_stmt_info_before_transpose (stmt_vec_info first_stmt_info,
++				      unsigned int group_el,
++				      unsigned int group_size)
++{
++  stmt_vec_info last_stmt_info = first_stmt_info;
++  unsigned int count = 0;
++  gcc_assert (group_el < group_size);
++  while (count < group_el)
++    {
++      last_stmt_info = DR_GROUP_NEXT_ELEMENT (last_stmt_info);
++      count++;
++    }
++  return last_stmt_info;
++}
++
++static stmt_vec_info
++add_new_stmt_for_nloads_greater_than_one (vec_info *vinfo, tree lvectype,
++					  tree vectype,
++					  vec<constructor_elt, va_gc> *v,
++					  stmt_vec_info stmt_info,
++					  gimple_stmt_iterator *gsi)
++{
++  tree vec_inv = build_constructor (lvectype, v);
++  tree new_temp = vect_init_vector (vinfo, stmt_info, vec_inv, lvectype, gsi, true);
++  stmt_vec_info new_stmt_info = vinfo->lookup_def (new_temp);
++  if (lvectype != vectype)
++    {
++      gassign *new_stmt = gimple_build_assign (make_ssa_name (vectype),
++					       VIEW_CONVERT_EXPR,
++					       build1 (VIEW_CONVERT_EXPR,
++						       vectype, new_temp));
++      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi, true);
++      new_stmt_info = vinfo->lookup_stmt (new_stmt);
++    }
++  return new_stmt_info;
++}
++
++/* Function new_vect_stmt_for_nloads.
++
++   Create a new VEC_STMT when nloads Arrays are merged into a vector.
++
++   ncopies is the number of vectors that need to be loaded from memory.
++   nloads is the number of ARRAYs in a vector.
++   vectemp = {a[], b[], ...}  */
++
++static void
++new_vect_stmt_for_nloads (vec_info *vinfo, unsigned int ncopies,
++			  unsigned int nloads, const vec<unsigned> &group_idx,
++			  stmt_vec_info stmt_info, offset_info *offset_info,
++			  vectype_info *vectype_info,
++			  vect_memory_access_type memory_access_type,
++			  bool slp_perm, vec<tree> dr_chain, slp_tree slp_node,
++			  gimple_stmt_iterator *gsi)
++{
++  vec<constructor_elt, va_gc> *v = NULL;
++  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
++  unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
++  stmt_vec_info first_stmt_info_b = NULL;
++  stmt_vec_info new_stmt_info = NULL;
++  tree dataref_ptr = NULL_TREE;
++  tree dummy;
++  gimple *ptr_incr = NULL;
++  unsigned int n = 0;
++  for (unsigned int i = 0; i < ncopies; i++)
++    {
++      vec_alloc (v, nloads);
++      for (unsigned int t = 0; t < nloads; t++)
++	{
++	  first_stmt_info_b = get_first_stmt_info_before_transpose (
++				first_stmt_info, group_idx[n++], group_size);
++	  dr_vec_info *cur_first_dr_info = get_dr_info (first_stmt_info_b);
++	  tree bump = vect_get_data_ptr_increment (vinfo, cur_first_dr_info,
++						   vectype_info->ltype,
++						   memory_access_type);
++	  bool simd_lane_access_p
++	    = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0;
++
++	  /* Create dataref_ptr, which points to init_address. 
*/
++	  dataref_ptr = vect_create_data_ref_ptr (
++			  vinfo, first_stmt_info_b, vectype_info->ltype, NULL,
++			  offset_info->offset, &dummy, gsi, &ptr_incr,
++			  simd_lane_access_p, bump);
++
++	  gassign *new_stmt = NULL;
++	  new_stmt_info = add_new_stmt_vect_load (vinfo, vectype_info->vectype, dataref_ptr,
++						  offset_info->dataref_offset,
++						  vectype_info->ref_type, vectype_info->ltype,
++						  new_stmt, cur_first_dr_info, gsi,
++						  first_stmt_info_b);
++
++	  CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_assign_lhs (new_stmt));
++	}
++      new_stmt_info = add_new_stmt_for_nloads_greater_than_one (
++			vinfo, vectype_info->lvectype,
++			vectype_info->vectype, v,
++			first_stmt_info_b, gsi);
++      push_new_stmt_to_dr_chain (slp_perm, new_stmt_info,
++				 dr_chain, slp_node);
++    }
++}
++
++/* Function new_vect_stmt_for_ncontinues.
++
++   Create new VEC_STMTs when an Array is divided into several vectors.
++
++   n_groups is the number of ARRAYs.
++   ncontinues is the number of vectors from an ARRAY.
++   vectemp1 = {a[0], a[1], ...}
++   ...
++   vectempm = {a[k], a[k+1], ...}  */
++
++static void
++new_vect_stmt_for_ncontinues (vec_info *vinfo, unsigned int ncontinues,
++			      const vec<unsigned> &group_idx,
++			      stmt_vec_info stmt_info,
++			      offset_info *offset_info,
++			      vectype_info *vectype_info,
++			      vect_memory_access_type memory_access_type,
++			      bool slp_perm, vec<tree> &dr_chain,
++			      slp_tree slp_node,
++			      gimple_stmt_iterator *gsi)
++{
++  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
++  unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
++  stmt_vec_info new_stmt_info = NULL;
++  tree dataref_ptr = NULL_TREE;
++  tree dummy;
++  gimple *ptr_incr = NULL;
++  unsigned int n_groups = group_idx.length ();
++  for (unsigned int i = 0; i < n_groups; i++)
++    {
++      stmt_vec_info first_stmt_info_b = get_first_stmt_info_before_transpose (
++					  first_stmt_info, group_idx[i], group_size);
++      dr_vec_info *cur_first_dr_info = get_dr_info (first_stmt_info_b);
++      tree bump = vect_get_data_ptr_increment (vinfo, cur_first_dr_info,
++					       vectype_info->ltype, memory_access_type);
++      bool simd_lane_access_p
++	= STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0;
++      for (unsigned int k = 0; k < ncontinues; k++)
++	{
++	  /* Create dataref_ptr, which points to init_address.  */
++	  if (k == 0)
++	    {
++	      dataref_ptr = vect_create_data_ref_ptr (
++			      vinfo, first_stmt_info_b, vectype_info->ltype, NULL,
++			      offset_info->offset, &dummy, gsi, &ptr_incr,
++			      simd_lane_access_p, bump);
++	    }
++	  else
++	    {
++	      dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
++					     gsi, first_stmt_info_b, bump);
++	    }
++	  gassign *new_stmt = NULL;
++	  new_stmt_info = add_new_stmt_vect_load (vinfo, vectype_info->vectype, dataref_ptr,
++						  offset_info->dataref_offset,
++						  vectype_info->ref_type, vectype_info->ltype,
++						  new_stmt, cur_first_dr_info, gsi,
++						  first_stmt_info_b);
++	  push_new_stmt_to_dr_chain (slp_perm, new_stmt_info,
++				     dr_chain, slp_node);
++	}
++    }
++}
++
+ /* vectorizable_load.
+ 
+    Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure)
+@@ -9338,6 +10193,8 @@ vectorizable_load (vec_info *vinfo,
+ 	    if (bb_vinfo)
+ 	      first_stmt_info_for_drptr
+ 		= vect_find_first_scalar_stmt_in_slp (slp_node);
++	    // first_stmt_info_for_drptr = SLP_TREE_SCALAR_STMTS (slp_node)[0];
++
+ 
+ 	  /* Check if the chain of loads is already vectorized. 
*/
+   if (STMT_VINFO_VEC_STMTS (first_stmt_info).exists ()
+@@ -9601,6 +10458,9 @@ vectorizable_load (vec_info *vinfo,
+     }
+   tree vec_mask = NULL_TREE;
+   poly_uint64 group_elt = 0;
++  unsigned new_group_size = 0;
++  vec<unsigned> new_load_permutation;
++
+   for (j = 0; j < ncopies; j++)
+     {
+       /* 1. Create the vector or array pointer update chain.  */
+@@ -9621,6 +10481,15 @@ vectorizable_load (vec_info *vinfo,
+ 	  dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr));
+ 	  dataref_offset = build_int_cst (ref_type, 0);
+ 	}
++      /* If the stmt_info needs transpose recovery, dataref_ptr
++	 will be calculated later.  */
++      else if (slp && is_a <bb_vec_info> (vinfo)
++	       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++	       && DR_GROUP_SLP_TRANSPOSE (
++		    DR_GROUP_FIRST_ELEMENT (stmt_info)))
++	{
++	  dataref_ptr = NULL_TREE;
++	}
+       else if (diff_first_stmt_info)
+ 	{
+ 	  dataref_ptr
+@@ -9731,6 +10600,63 @@ vectorizable_load (vec_info *vinfo,
+ 	  /* Record that VEC_ARRAY is now dead.  */
+ 	  vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
+ 	}
++      else if (slp && is_a <bb_vec_info> (vinfo)
++	       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++	       && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
++	{
++	  if (dump_enabled_p ())
++	    {
++	      dump_printf_loc (MSG_NOTE, vect_location,
++			       "vectorizable_load for slp transpose.\n");
++	    }
++	  /* group_size: the size of the group after merging.
++	     group_size_b: the size of the group before merging.
++	     const_nunits: TYPE_VECTOR_SUBPARTS (vectype), it is the number of
++	     elements in a vector.
++	     nloads: const_nunits / group_size_b or 1, it means the number
++	     of ARRAYs in a vector.
++	     ncontinues: group_size_b / const_nunits or 1, it means the number
++	     of vectors from an ARRAY (see the regime sketch after the patch
++	     trailer).  */
++	  unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info);
++	  unsigned int const_nunits = nunits.to_constant ();
++	  unsigned int nloads = const_nunits;
++	  unsigned int ncontinues = group_size_b;
++	  tree lvectype = vectype;
++	  tree ltype = calculate_new_type (vectype, const_nunits,
++					   group_size_b, nloads,
++					   ncontinues, lvectype);
++	  bool this_load_permuted = false;
++	  auto_vec<unsigned> group_idx;
++	  generate_load_permutation (slp_node, new_group_size, group_size,
++				     group_size_b, this_load_permuted,
++				     group_idx, new_load_permutation);
++	  slp_perm = is_slp_perm (slp_perm, this_load_permuted, nunits,
++				  group_size, first_stmt_info);
++
++	  /* ncopies: the number of vectors that need to be loaded from
++	     memory.  */
++	  unsigned int ncopies = new_group_size / const_nunits;
++	  offset_info offset_info = {offset, NULL_TREE, dataref_offset};
++	  vectype_info vectype_info = {vectype, ltype, lvectype, ref_type};
++	  if (slp_perm)
++	    {
++	      dr_chain.create (ncopies);
++	    }
++	  if (nloads > 1 && ncontinues == 1)
++	    {
++	      new_vect_stmt_for_nloads (vinfo, ncopies, nloads, group_idx,
++					stmt_info, &offset_info, &vectype_info,
++					memory_access_type, slp_perm, dr_chain,
++					slp_node, gsi);
++	    }
++	  else
++	    {
++	      new_vect_stmt_for_ncontinues (vinfo, ncontinues, group_idx,
++					    stmt_info, &offset_info,
++					    &vectype_info, memory_access_type,
++					    slp_perm, dr_chain, slp_node, gsi);
++	    }
++	}
+       else
+ 	{
+ 	  for (i = 0; i < vec_num; i++)
+@@ -10177,7 +11103,32 @@ vectorizable_load (vec_info *vinfo,
+       if (slp && !slp_perm)
+ 	continue;
+ 
+-      if (slp_perm)
++      /* Use the new load permutation to generate vector permute statements
++	 from a list of loads in DR_CHAIN. 
*/
++      if (slp && slp_perm && is_a <bb_vec_info> (vinfo)
++	  && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++	  && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
++	{
++	  unsigned n_perms;
++	  stmt_vec_info stmt_info_ = SLP_TREE_SCALAR_STMTS (slp_node)[0];
++	  unsigned int old_size = DR_GROUP_SIZE (stmt_info);
++	  DR_GROUP_SIZE (stmt_info_) = new_group_size;
++	  vec<unsigned> old_load_permutation
++	    = SLP_TREE_LOAD_PERMUTATION (slp_node);
++	  SLP_TREE_LOAD_PERMUTATION (slp_node) = new_load_permutation;
++	  bool perm_load_success = vect_transform_slp_perm_load (
++				     vinfo, slp_node, dr_chain, gsi, vf,
++				     false, &n_perms);
++	  DR_GROUP_SIZE (stmt_info_) = old_size;
++	  SLP_TREE_LOAD_PERMUTATION (slp_node) = old_load_permutation;
++	  new_load_permutation.release ();
++	  if (!perm_load_success)
++	    {
++	      dr_chain.release ();
++	      return false;
++	    }
++	}
++      else if (slp_perm)
+ 	{
+ 	  unsigned n_perms;
+ 	  /* For SLP we know we've seen all possible uses of dr_chain so
+diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
+index 642eb0aeb..e13bc6c99 100644
+--- a/gcc/tree-vectorizer.h
++++ b/gcc/tree-vectorizer.h
+@@ -412,6 +412,21 @@ public:
+   vec<ddr_p> ddrs;
+ };
+ 
++/* Information about the offset in vectorizable_load.  */
++struct offset_info {
++  tree offset;
++  tree byte_offset;
++  tree dataref_offset;
++};
++
++/* Information about the vectype in vectorizable_load.  */
++struct vectype_info {
++  tree vectype;
++  tree ltype;
++  tree lvectype;
++  tree ref_type;
++};
++
+ /* Vectorizer state common between loop and basic-block vectorization.  */
+ class vec_info {
+ public:
+@@ -455,6 +470,14 @@ public:
+      stmt in the chain.  */
+   auto_vec<stmt_vec_info> grouped_stores;
+ 
++  /* All interleaving chains of loads, represented by the first
++     stmt in the chain.  */
++  auto_vec<stmt_vec_info> grouped_loads;
++
++  /* All interleaving chains of stores (before transposing), represented by
++     all stmts in the chain.  */
++  auto_vec<vec<stmt_vec_info> > scalar_stores;
++
+   /* The set of vector modes used in the vectorized region.  */
+   mode_set used_vector_modes;
+ 
+@@ -899,6 +922,8 @@ public:
+ #define LOOP_VINFO_CHECK_NONZERO(L)        (L)->check_nonzero
+ #define LOOP_VINFO_LOWER_BOUNDS(L)         (L)->lower_bounds
+ #define LOOP_VINFO_GROUPED_STORES(L)       (L)->grouped_stores
++#define LOOP_VINFO_GROUPED_LOADS(L)        (L)->grouped_loads
++#define LOOP_VINFO_SCALAR_STORES(L)        (L)->scalar_stores
+ #define LOOP_VINFO_SLP_INSTANCES(L)        (L)->slp_instances
+ #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
+ #define LOOP_VINFO_REDUCTIONS(L)           (L)->reductions
+@@ -982,6 +1007,25 @@ public:
+   vec<basic_block> bbs;
+ 
+   vec<slp_root> roots;
++
++  /* True if bb_vinfo can go on to vect_analyze_slp.  */
++  bool before_slp;
++
++  /* True if bb_vinfo is a transposed version.  */
++  bool transposed;
++
++  /* The number of transposed groups.  */
++  int transposed_group;
++
++  /* The cost of the scalar iterations.  */
++  int scalar_cost;
++
++  /* The cost of the vector prologue and epilogue, including peeled
++     iterations and set-up code.  */
++  int vec_outside_cost;
++
++  /* The cost of the vector loop body. 
*/
++  int vec_inside_cost;
+ } *bb_vec_info;
+ 
+ #define BB_VINFO_BB(B)               (B)->bb
+@@ -989,6 +1033,14 @@ public:
+ #define BB_VINFO_SLP_INSTANCES(B)    (B)->slp_instances
+ #define BB_VINFO_DATAREFS(B)         (B)->shared->datarefs
+ #define BB_VINFO_DDRS(B)             (B)->shared->ddrs
++#define BB_VINFO_GROUPED_LOADS(B)    (B)->grouped_loads
++#define BB_VINFO_SCALAR_STORES(B)    (B)->scalar_stores
++#define BB_VINFO_VEC_OUTSIDE_COST(B) (B)->vec_outside_cost
++#define BB_VINFO_VEC_INSIDE_COST(B)  (B)->vec_inside_cost
++#define BB_VINFO_SCALAR_COST(B)      (B)->scalar_cost
++#define BB_VINFO_SLP_TRANSPOSED(B)   (B)->transposed
++#define BB_VINFO_BEFORE_SLP(B)       (B)->before_slp
++#define BB_VINFO_TRANS_GROUPS(B)     (B)->transposed_group
+ 
+ /*-----------------------------------------------------------------*/
+ /* Info on vectorized defs.                                        */
+@@ -1219,6 +1271,17 @@ public:
+   stmt_vec_info next_element;
+   /* The size of the group.  */
+   unsigned int size;
++
++  /* The size of the group before transposing.  */
++  unsigned int size_before_transpose;
++
++  /* If true, the stmt_info is SLP transposed.  */
++  bool slp_transpose;
++
++  /* The store group number, used to rebuild the interleaving chain during
++     the transpose phase.  Value -1 means the group cannot be transposed.  */
++  int group_number;
++
+   /* For stores, number of stores from this group seen. We vectorize the last
+      one.  */
+   unsigned int store_count;
+@@ -1226,6 +1289,9 @@ public:
+      is 1.  */
+   unsigned int gap;
+ 
++  /* The gap before transposing.  */
++  unsigned int gap_before_transpose;
++
+   /* The minimum negative dependence distance this stmt participates in
+      or zero if none.  */
+   unsigned int min_neg_dist;
+@@ -1427,6 +1493,12 @@ struct gather_scatter_info {
+ #define STMT_VINFO_SLP_VECT_ONLY(S)        (S)->slp_vect_only_p
+ #define STMT_VINFO_SLP_VECT_ONLY_PATTERN(S) (S)->slp_vect_pattern_only_p
+ 
++#define DR_GROUP_SLP_TRANSPOSE(S) \
++  (gcc_checking_assert ((S)->dr_aux.dr), (S)->slp_transpose)
++#define DR_GROUP_SIZE_TRANS(S) \
++  (gcc_checking_assert ((S)->dr_aux.dr), (S)->size_before_transpose)
++#define DR_GROUP_NUMBER(S) \
++  (gcc_checking_assert ((S)->dr_aux.dr), (S)->group_number)
+ #define DR_GROUP_FIRST_ELEMENT(S) \
+   (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element)
+ #define DR_GROUP_NEXT_ELEMENT(S) \
+@@ -1437,6 +1509,8 @@ struct gather_scatter_info {
+   (gcc_checking_assert ((S)->dr_aux.dr), (S)->store_count)
+ #define DR_GROUP_GAP(S) \
+   (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap)
++#define DR_GROUP_GAP_TRANS(S) \
++  (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap_before_transpose)
+ 
+ #define REDUC_GROUP_FIRST_ELEMENT(S) \
+   (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
+@@ -2033,6 +2107,17 @@ vect_get_scalar_dr_size (dr_vec_info *dr_info)
+   return tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_info->dr))));
+ }
+ 
++/* Compare two unsigned ints A and B, sorting them in ascending order.  */
++
++static inline int
++cmp_for_group_num (const void *a_, const void *b_)
++{
++  unsigned int a = *(unsigned int *)const_cast<void *>(a_);
++  unsigned int b = *(unsigned int *)const_cast<void *>(b_);
++  return a < b ? -1 : 1;
++}
++
+ /* Return true if LOOP_VINFO requires a runtime check for whether the
+    vector loop is profitable. 
*/
+ 
+@@ -2152,7 +2237,7 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
+ 
+ extern void vect_finish_replace_stmt (vec_info *, stmt_vec_info, gimple *);
+ extern void vect_finish_stmt_generation (vec_info *, stmt_vec_info, gimple *,
+-					 gimple_stmt_iterator *);
++					 gimple_stmt_iterator *, bool transpose=false);
+ extern opt_result vect_mark_stmts_to_be_vectorized (loop_vec_info, bool *);
+ extern tree vect_get_store_rhs (stmt_vec_info);
+ void vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info, unsigned,
+@@ -2168,7 +2253,7 @@ void vect_get_vec_defs (vec_info *, stmt_vec_info, slp_tree, unsigned,
+ 			tree = NULL, vec<tree> * = NULL, tree = NULL,
+ 			tree = NULL, vec<tree> * = NULL, tree = NULL);
+ extern tree vect_init_vector (vec_info *, stmt_vec_info, tree, tree,
+-			      gimple_stmt_iterator *);
++			      gimple_stmt_iterator *, bool transpose=false);
+ extern tree vect_get_slp_vect_def (slp_tree, unsigned);
+ extern bool vect_transform_stmt (vec_info *, stmt_vec_info,
+ 				 gimple_stmt_iterator *,
+@@ -2235,6 +2320,9 @@ extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
+ extern void vect_permute_store_chain (vec_info *, vec<tree> &,
+ 				      unsigned int, stmt_vec_info,
+ 				      gimple_stmt_iterator *, vec<tree> *);
++extern void vect_transpose_store_chain (vec_info *, vec<tree>, unsigned int,
++					unsigned int, stmt_vec_info,
++					gimple_stmt_iterator *, vec<tree> *);
+ extern tree vect_setup_realignment (vec_info *,
+ 				    stmt_vec_info, gimple_stmt_iterator *,
+ 				    tree *, enum dr_alignment_support, tree,
+@@ -2262,7 +2350,8 @@ extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
+ 				  enum tree_code);
+ extern bool needs_fold_left_reduction_p (tree, code_helper);
+ /* Drive for loop analysis stage.  */
+-extern opt_loop_vec_info vect_analyze_loop (class loop *, vec_info_shared *);
++extern opt_loop_vec_info vect_analyze_loop (class loop *, vec_info_shared *,
++					    bool result_only_p = false);
+ extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
+ extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
+ 					 tree *, bool);
+@@ -2331,6 +2420,7 @@ extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, const vec<tree>
+ 					  gimple_stmt_iterator *, poly_uint64,
+ 					  bool, unsigned *,
+ 					  unsigned * = nullptr, bool = false);
++extern void vect_transform_back_slp_grouped_stores (bb_vec_info, stmt_vec_info);
+ extern bool vect_slp_analyze_operations (vec_info *);
+ extern void vect_schedule_slp (vec_info *, const vec<slp_instance> &);
+ extern opt_result vect_analyze_slp (vec_info *, unsigned);
+-- 
+2.33.0
+
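Editor's sketches follow the trailer, where patch tools ignore them.

A minimal standalone sketch of the cost comparison implemented by may_new_transpose_bbvinfo and may_choose_transpose_bbvinfo above. The struct bb_cost and both function names are hypothetical stand-ins for the BB_VINFO_SCALAR_COST / BB_VINFO_VEC_INSIDE_COST / BB_VINFO_VEC_OUTSIDE_COST fields; the threshold of 4 and the benefit formulas are taken from the patch, the sample numbers are invented.

#include <stdbool.h>
#include <stdio.h>

struct bb_cost
{
  int scalar;       /* BB_VINFO_SCALAR_COST */
  int vec_inside;   /* BB_VINFO_VEC_INSIDE_COST */
  int vec_outside;  /* BB_VINFO_VEC_OUTSIDE_COST */
};

/* Retry with the transposed analysis when the original failed, or when
   its vector cost is not far enough below the scalar cost (threshold 4).  */
static bool
worth_trying_transposed (bool ori_ok, struct bb_cost ori)
{
  if (!ori_ok)
    return true;
  return ori.scalar < 4 * (ori.vec_inside + ori.vec_outside);
}

/* Pick the transposed version only if it vectorizes, beats the scalar
   cost, and its benefit exceeds the original version's benefit.  */
static bool
choose_transposed (bool ori_ok, struct bb_cost ori,
                   bool trans_ok, struct bb_cost trans)
{
  if (!trans_ok)
    return false;
  int benefit_trans = trans.scalar - trans.vec_inside - trans.vec_outside;
  if (benefit_trans <= 0)
    return false;
  if (ori_ok)
    {
      int benefit_ori = ori.scalar - ori.vec_inside - ori.vec_outside;
      if (benefit_ori >= benefit_trans)
        return false;
    }
  return true;
}

int
main (void)
{
  struct bb_cost ori = {40, 8, 4};    /* benefit 28; 40 < 4 * 12, so retry */
  struct bb_cost trans = {40, 5, 2};  /* benefit 33, so it wins */
  if (worth_trying_transposed (true, ori))
    printf ("%s version chosen\n",
            choose_transposed (true, ori, true, trans)
            ? "transposed" : "original");
  return 0;
}

The asymmetry is deliberate: the transposed analysis is attempted only when the original result is weak, and it is chosen only when it strictly beats both the scalar code and the original vectorization.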
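A minimal sketch of the load-permutation remapping performed by generate_new_load_permutation_mapping and generate_new_load_permutation above, using plain C arrays in place of GCC's vec<unsigned>; the array sizes and the sample permutation are invented for illustration.

#include <stdio.h>

#define SLP_NODE_LENGTH 8
#define GROUP_SIZE_B    2   /* group size before transposing/merging */

int
main (void)
{
  /* Old load permutation: lanes of the merged group, here coming from
     the original load groups 1, 3, 5 and 7.  */
  unsigned old_perm[SLP_NODE_LENGTH] = {2, 3, 6, 7, 10, 11, 14, 15};
  unsigned group_from[SLP_NODE_LENGTH];
  unsigned group_num[SLP_NODE_LENGTH];  /* distinct group IDs, in order */
  unsigned n_groups = 0;

  /* Which original load group does each lane come from?
     (The patch additionally qsorts the distinct IDs; here they are
     already ascending.)  */
  for (unsigned i = 0; i < SLP_NODE_LENGTH; i++)
    {
      unsigned g = old_perm[i] / GROUP_SIZE_B;
      unsigned k;
      for (k = 0; k < n_groups; k++)
        if (group_num[k] == g)
          break;
      if (k == n_groups)
        group_num[n_groups++] = g;
      group_from[i] = g;
    }

  /* Compress group IDs to 0..n_groups-1, as in the patch's
     "{1,1,3,3,5,5,7,7} -> {0,0,1,1,2,2,3,3}" example.  */
  for (unsigned i = 0; i < SLP_NODE_LENGTH; i++)
    for (unsigned k = 0; k < n_groups; k++)
      if (group_from[i] == group_num[k])
        {
          group_from[i] = k;
          break;
        }

  /* New permutation: t1 = group_from[i] * GROUP_SIZE_B
                           + old lane % GROUP_SIZE_B.  */
  for (unsigned i = 0; i < SLP_NODE_LENGTH; i++)
    printf ("%u ", group_from[i] * GROUP_SIZE_B + old_perm[i] % GROUP_SIZE_B);
  printf ("\n");  /* prints: 0 1 2 3 4 5 6 7 */
  return 0;
}

With group_size_b = 2 the sample lanes come from four groups; after compressing the group IDs the new permutation is the identity over new_group_size = 4 * 2 = 8 lanes, so the merged group can be loaded contiguously.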
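A small sketch of the two regimes that calculate_new_type above selects between for transposed loads; the function name regimes and the printed values are illustrative only, and the divisibility preconditions mirror the checks in transpose_oprnd_store.

#include <stdio.h>

static void
regimes (unsigned const_nunits, unsigned group_size_b)
{
  if (group_size_b < const_nunits && const_nunits % group_size_b == 0)
    /* Several short arrays are glued into one vector:
       vectemp = {a[], b[], ...}.  */
    printf ("nloads = %u, ncontinues = 1\n", const_nunits / group_size_b);
  else if (group_size_b % const_nunits == 0)
    /* One long array is split over several vectors:
       vectemp1 = {a[0], ...}, ..., vectempm = {a[k], ...}.  */
    printf ("nloads = 1, ncontinues = %u\n", group_size_b / const_nunits);
}

int
main (void)
{
  regimes (16, 4);   /* four 4-element arrays per 16-lane vector */
  regimes (4, 16);   /* one 16-element array spans four 4-lane vectors */
  return 0;
}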