Diffstat (limited to '0099-Enable-Transposed-SLP.patch')
-rw-r--r-- 0099-Enable-Transposed-SLP.patch | 5624
1 file changed, 5624 insertions, 0 deletions
diff --git a/0099-Enable-Transposed-SLP.patch b/0099-Enable-Transposed-SLP.patch
new file mode 100644
index 0000000..b4e8b24
--- /dev/null
+++ b/0099-Enable-Transposed-SLP.patch
@@ -0,0 +1,5624 @@
+From 0dd3b8532f35486bd5db2c71342c8dfed4c0893a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?=E9=83=91=E6=99=A8=E5=8D=89?= <zhengchenhui1@huawei.com>
+Date: Thu, 25 Jul 2024 17:25:23 +0800
+Subject: [PATCH] Enable Transposed SLP.
+
+---
+ gcc/common.opt | 4 +
+ gcc/testsuite/gcc.dg/vect/transpose-1.c | 53 +
+ gcc/testsuite/gcc.dg/vect/transpose-2.c | 50 +
+ gcc/testsuite/gcc.dg/vect/transpose-3.c | 54 +
+ gcc/testsuite/gcc.dg/vect/transpose-4.c | 53 +
+ gcc/testsuite/gcc.dg/vect/transpose-5.c | 74 ++
+ gcc/testsuite/gcc.dg/vect/transpose-6.c | 67 +
+ gcc/testsuite/gcc.dg/vect/transpose-7.c | 53 +
+ gcc/testsuite/gcc.dg/vect/transpose-8.c | 53 +
+ gcc/testsuite/gcc.dg/vect/vect.exp | 7 +
+ gcc/tree-loop-distribution.cc | 1464 ++++++++++++++++++++-
+ gcc/tree-vect-data-refs.cc | 237 ++++
+ gcc/tree-vect-loop.cc | 42 +-
+ gcc/tree-vect-patterns.cc | 4 +-
+ gcc/tree-vect-slp.cc | 1553 ++++++++++++++++++++---
+ gcc/tree-vect-stmts.cc | 973 +++++++++++++-
+ gcc/tree-vectorizer.h | 96 +-
+ 17 files changed, 4648 insertions(+), 189 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-1.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-2.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-3.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-4.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-5.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-6.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-7.c
+ create mode 100644 gcc/testsuite/gcc.dg/vect/transpose-8.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index b18f0b944..5958c4e0b 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -3221,6 +3221,10 @@ ftree-slp-vectorize
+ Common Var(flag_tree_slp_vectorize) Optimization EnabledBy(ftree-vectorize)
+ Enable basic block vectorization (SLP) on trees.
+
++ftree-slp-transpose-vectorize
++Common Var(flag_tree_slp_transpose_vectorize) Optimization Init(0)
++Enable basic block vectorization (SLP) for transposed stores and loads on trees.
++
+ fvect-cost-model=
+ Common Joined RejectNegative Enum(vect_cost_model) Var(flag_vect_cost_model) Init(VECT_COST_MODEL_DEFAULT) Optimization
+ -fvect-cost-model=[unlimited|dynamic|cheap|very-cheap] Specifies the cost model for vectorization.
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-1.c b/gcc/testsuite/gcc.dg/vect/transpose-1.c
+new file mode 100644
+index 000000000..8237a8b9e
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-1.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++
++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++ int i = 0;
++ int sum = 0;
++ unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
++ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++ {
++ c0[i] = pix1[0] - pix2[0];
++ c1[i] = pix1[1] - pix2[1];
++ c2[i] = pix1[2] - pix2[2];
++ c3[i] = pix1[3] - pix2[3];
++ c4[i] = pix1[4] - pix2[4];
++ c5[i] = pix1[5] - pix2[5];
++ c6[i] = pix1[6] - pix2[6];
++ c7[i] = pix1[7] - pix2[7];
++ }
++ for (int i = 0; i < N; i++)
++ {
++ sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
++ }
++ return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++ unsigned char input1[M];
++ unsigned char input2[M];
++ int i1 = 16;
++ int i2 = 8;
++ check_vect ();
++ for (int i = 0; i < M; i++)
++ {
++ input1[i] = i * 2;
++ input2[i] = i;
++ }
++ int sum = foo (input1, i1, input2, i2);
++ if (sum != 1264)
++ {
++ abort ();
++ }
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-2.c b/gcc/testsuite/gcc.dg/vect/transpose-2.c
+new file mode 100644
+index 000000000..fdf4dbd96
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-2.c
+@@ -0,0 +1,50 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-loop-vectorize -fno-tree-dse" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 8
++#define M 256
++
++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++ int i = 0;
++ int sum = 0;
++ unsigned short c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
++ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++ {
++ c0[i] = pix1[0] - pix2[0];
++ c1[i] = pix1[1] - pix2[1];
++ c2[i] = pix1[2] - pix2[2];
++ c3[i] = pix1[3] - pix2[3];
++ }
++ for (int i = 0; i < N; i++)
++ {
++ sum += c0[i] + c1[i] + c2[i] + c3[i];
++ }
++ return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++ unsigned char input1[M];
++ unsigned char input2[M];
++ int i1 = 5;
++ int i2 = 4;
++ check_vect ();
++ for (int i = 0; i < M; i++)
++ {
++ input1[i] = i * 4;
++ input2[i] = i * 2;
++ }
++ int sum = foo (input1, i1, input2, i2);
++ if (sum != 1440)
++ {
++ abort ();
++ }
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-3.c b/gcc/testsuite/gcc.dg/vect/transpose-3.c
+new file mode 100644
+index 000000000..e492e3717
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-3.c
+@@ -0,0 +1,54 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-loop-vectorize -fno-tree-dse -fno-tree-fre" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++
++int foo (unsigned short *pix1, int i_pix1, unsigned short *pix2, int i_pix2)
++{
++ int i = 0;
++ int sum = 0;
++ unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
++ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++ {
++ c0[i] = pix1[0] - pix2[0];
++ c1[i] = pix1[1] - pix2[1];
++ c2[i] = pix1[2] - pix2[2];
++ c3[i] = pix1[3] - pix2[3];
++ c4[i] = pix1[4] - pix2[4];
++ c5[i] = pix1[5] - pix2[5];
++ c6[i] = pix1[6] - pix2[6];
++ c7[i] = pix1[7] - pix2[7];
++ }
++ for (int i = 0; i < N; i++)
++ {
++ sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
++ }
++ return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++ unsigned short input1[M];
++ unsigned short input2[M];
++ int i1 = 8;
++ int i2 = 4;
++ check_vect ();
++ for (int i = 0; i < M; i++)
++ {
++ input1[i] = i * 4;
++ input2[i] = i;
++ }
++ int sum = foo (input1, i1, input2, i2);
++ if (sum != 1680)
++ {
++ abort ();
++ }
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-4.c b/gcc/testsuite/gcc.dg/vect/transpose-4.c
+new file mode 100644
+index 000000000..0b4adea9b
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-4.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++
++int foo (unsigned *pix1, int i_pix1, unsigned *pix2, int i_pix2)
++{
++ int i = 0;
++ int sum = 0;
++ unsigned c0[N], c1[N], c2[N], c3[N], c4[N], c5[N], c6[N], c7[N];
++ for (i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++ {
++ c0[i] = pix1[0] - pix2[0];
++ c1[i] = pix1[1] - pix2[1];
++ c2[i] = pix1[2] - pix2[2];
++ c3[i] = pix1[3] - pix2[3];
++ c4[i] = pix1[4] - pix2[4];
++ c5[i] = pix1[5] - pix2[5];
++ c6[i] = pix1[6] - pix2[6];
++ c7[i] = pix1[7] - pix2[7];
++ }
++ for (int i = 0; i < N; i++)
++ {
++ sum += c0[i] + c1[i] + c2[i] + c3[i] + c4[i] + c5[i] + c6[i] + c7[i];
++ }
++ return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++ unsigned input1[M];
++ unsigned input2[M];
++ int i1 = 12;
++ int i2 = 6;
++ check_vect ();
++ for (int i = 0; i < M; i++)
++ {
++ input1[i] = i * 7;
++ input2[i] = i * 3;
++ }
++ int sum = foo (input1, i1, input2, i2);
++ if (sum != 3616)
++ {
++ abort ();
++ }
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-5.c b/gcc/testsuite/gcc.dg/vect/transpose-5.c
+new file mode 100644
+index 000000000..040dedf1b
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-5.c
+@@ -0,0 +1,74 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-dse -fno-tree-fre" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include <math.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++#define eps 1e-8
++
++double foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++ unsigned a0[N];
++ unsigned a1[N];
++ unsigned a2[N];
++ unsigned a3[N];
++
++ int b0[N];
++ int b1[N];
++ int b2[N];
++ int b3[N];
++
++ for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++ {
++ a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] + pix2[4]) << 16);
++ a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] + pix2[5]) << 16);
++ a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] + pix2[6]) << 16);
++ a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] + pix2[7]) << 16);
++ }
++
++ for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++ {
++ b0[i] = (pix1[0] - pix2[0]) + (pix1[4] + pix2[4]);
++ b1[i] = (pix1[1] - pix2[1]) + (pix1[5] + pix2[5]);
++ b2[i] = (pix1[2] - pix2[2]) + (pix1[6] + pix2[6]);
++ b3[i] = (pix1[3] - pix2[3]) + (pix1[7] + pix2[7]);
++ }
++
++ double sum = 0;
++ for (int i = 0; i < N; i++)
++ {
++ sum += a0[i] + a1[i] + a2[i] + a3[i] + b0[i] + b1[i] + b2[i] + b3[i];
++ }
++ return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++ unsigned char input1[M];
++ unsigned char input2[M];
++ int i1 = 8;
++ int i2 = 3;
++ unsigned char m = 2;
++ unsigned short n = 12;
++ float t = 3.0;
++ double k = 4.2;
++ check_vect ();
++ for (int i = 0; i < M; i++)
++ {
++ input1[i] = i * 6;
++ input2[i] = i * 3;
++ }
++ double sum = foo (input1, i1, input2, i2);
++ if (fabs (sum - 78648144) > eps)
++ {
++ abort ();
++ }
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
++/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-6.c b/gcc/testsuite/gcc.dg/vect/transpose-6.c
+new file mode 100644
+index 000000000..3e134ac02
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-6.c
+@@ -0,0 +1,67 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-require-effective-target vect_int } */
++/* { dg-require-effective-target vect_float } */
++#include <stdio.h>
++#include <stdlib.h>
++#include <math.h>
++#include "tree-vect.h"
++
++#define N 4
++#define M 256
++#define eps 1e-8
++
++float foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++ unsigned a0[N];
++ unsigned a1[N];
++ unsigned a2[N];
++ unsigned a3[N];
++
++ float c0[N];
++ float c1[N];
++ float c2[N];
++ float c3[N];
++
++ for (int i = 0; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++ {
++ a0[i] = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
++ a1[i] = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
++ a2[i] = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
++ a3[i] = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
++
++ c0[i] = (pix1[0] * pix2[0]) + (pix1[4] * pix2[4]);
++ c1[i] = (pix1[1] * pix2[1]) + (pix1[5] * pix2[5]);
++ c2[i] = (pix1[2] * pix2[2]) + (pix1[6] * pix2[6]);
++ c3[i] = (pix1[3] * pix2[3]) + (pix1[7] * pix2[7]);
++ }
++
++ float sum = 0;
++ for (int i = 0; i < N; i++)
++ {
++ sum += a0[i] + a1[i] + a2[i] + a3[i] + c0[i] + c1[i] + c2[i] + c3[i];
++ }
++ return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++ unsigned char input1[M];
++ unsigned char input2[M];
++ int i1 = 18;
++ int i2 = 6;
++ check_vect ();
++ for (int i = 0; i < M; i++)
++ {
++ input1[i] = i * 4;
++ input2[i] = i * 2;
++ }
++ float sum = foo (input1, i1, input2, i2);
++ if (fabs (sum - 106041168) > eps)
++ {
++ abort ();
++ }
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
++/* { dg-final { scan-tree-dump-times "vectorizable_store for slp transpose" 2 "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-7.c b/gcc/testsuite/gcc.dg/vect/transpose-7.c
+new file mode 100644
+index 000000000..8ba1b1b6d
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-7.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-loop-vectorize -fno-tree-dse" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 16
++#define M 256
++
++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++ int i = 0;
++ int sum = 0;
++ unsigned char c0[N], c1[N];
++ for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2)
++ {
++ c0[i] = pix1[0] - pix2[0];
++ c1[i] = pix1[1] - pix2[1];
++ }
++ for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++ {
++ c0[i] = pix1[0] - pix2[0];
++ c1[i] = pix1[1] - pix2[1];
++ }
++ for (int i = 0; i < N; i++)
++ {
++ sum += c0[i] + c1[i];
++ }
++ return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++ unsigned char input1[M];
++ unsigned char input2[M];
++ int i1 = 6;
++ int i2 = 4;
++ check_vect ();
++ for (int i = 0; i < M; i++)
++ {
++ input1[i] = i * 5;
++ input2[i] = i * 2;
++ }
++ int sum = foo (input1, i1, input2, i2);
++ if (sum != 3280)
++ {
++ abort ();
++ }
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/transpose-8.c b/gcc/testsuite/gcc.dg/vect/transpose-8.c
+new file mode 100644
+index 000000000..a154f012a
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/vect/transpose-8.c
+@@ -0,0 +1,53 @@
++/* { dg-do compile { target { aarch64*-*-linux* } } } */
++/* { dg-additional-options "-fno-tree-loop-vectorize" } */
++/* { dg-require-effective-target vect_int } */
++#include <stdio.h>
++#include <stdlib.h>
++#include "tree-vect.h"
++
++#define N 32
++#define M 256
++
++int foo (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2)
++{
++ int i = 0;
++ int sum = 0;
++ unsigned char c0[N], c1[N];
++ for (int i = 0; i < N/2; i++, pix1 += i_pix1, pix2 += i_pix2)
++ {
++ c0[i] = pix1[0] - pix2[0];
++ c1[i] = pix1[1] - pix2[1];
++ }
++ for (int i = N/2; i < N; i++, pix1 += i_pix1, pix2 += i_pix2)
++ {
++ c0[i] = pix1[0] - pix2[0];
++ c1[i] = pix1[1] - pix2[1];
++ }
++ for (int i = 0; i < N; i++)
++ {
++ sum += c0[i] + c1[i];
++ }
++ return sum;
++}
++
++int main (int argc, const char* argv[])
++{
++ unsigned char input1[M];
++ unsigned char input2[M];
++ int i1 = 6;
++ int i2 = 4;
++ check_vect ();
++ for (int i = 0; i < M; i++)
++ {
++ input1[i] = i * 5;
++ input2[i] = i * 2;
++ }
++ int sum = foo (input1, i1, input2, i2);
++ if (sum != 7584)
++ {
++ abort ();
++ }
++ return 0;
++}
++
++/* { dg-final { scan-tree-dump "vectorized using transposed version" "slp1" } } */
+diff --git a/gcc/testsuite/gcc.dg/vect/vect.exp b/gcc/testsuite/gcc.dg/vect/vect.exp
+index dcaef1e0a..ae5212411 100644
+--- a/gcc/testsuite/gcc.dg/vect/vect.exp
++++ b/gcc/testsuite/gcc.dg/vect/vect.exp
+@@ -117,6 +117,13 @@ et-dg-runtest dg-runtest [lsort \
+ [glob -nocomplain $srcdir/$subdir/no-vfa-*.\[cS\]]] \
+ "" $DEFAULT_VECTCFLAGS
+
++# -ftree-slp-transpose-vectorize SLP tests
++set VECT_SLP_CFLAGS $SAVED_VECT_SLP_CFLAGS
++lappend VECT_SLP_CFLAGS "-ftree-slp-transpose-vectorize"
++et-dg-runtest dg-runtest [lsort \
++ [glob -nocomplain $srcdir/$subdir/transpose-*.\[cS\]]] \
++ "" "-ftree-slp-transpose-vectorize -fdump-tree-slp-details -O3"
++
+ # -ffast-math tests
+ set DEFAULT_VECTCFLAGS $SAVED_DEFAULT_VECTCFLAGS
+ lappend DEFAULT_VECTCFLAGS "-ffast-math"
+diff --git a/gcc/tree-loop-distribution.cc b/gcc/tree-loop-distribution.cc
+index 606eb05e6..8d118e987 100644
+--- a/gcc/tree-loop-distribution.cc
++++ b/gcc/tree-loop-distribution.cc
+@@ -36,6 +36,47 @@ along with GCC; see the file COPYING3. If not see
+ | D(I) = A(I-1)*E
+ |ENDDO
+
++ If an unvectorizable loop has grouped loads, and calculations from grouped
++ loads are isomorphic, build temp arrays using stmts where isomorphic
++ calculations end. After distribution, the partition built from temp
++ arrays can be vectorized in pass SLP after loop unrolling. For example,
++
++ |DO I = 1, N
++ | A = FOO (ARG_1);
++ | B = FOO (ARG_2);
++ | C = BAR_0 (A);
++ | D = BAR_1 (B);
++ |ENDDO
++
++ is transformed to
++
++ |DO I = 1, N
++ | J = FOO (ARG_1);
++ | K = FOO (ARG_2);
++ | X[I] = J;
++ | Y[I] = K;
++ | A = X[I];
++ | B = Y[I];
++ | C = BAR_0 (A);
++ | D = BAR_1 (B);
++ |ENDDO
++
++ and is then distributed to
++
++ |DO I = 1, N
++ | J = FOO (ARG_1);
++ | K = FOO (ARG_2);
++ | X[I] = J;
++ | Y[I] = K;
++ |ENDDO
++
++ |DO I = 1, N
++ | A = X[I];
++ | B = Y[I];
++ | C = BAR_0 (A);
++ | D = BAR_1 (B);
++ |ENDDO
++
+ Loop distribution is the dual of loop fusion. It separates statements
+ of a loop (or loop nest) into multiple loops (or loop nests) with the
+ same loop header. The major goal is to separate statements which may
+@@ -44,7 +85,9 @@ along with GCC; see the file COPYING3. If not see
+
+ 1) Seed partitions with specific type statements. For now we support
+ two types seed statements: statement defining variable used outside
+- of loop; statement storing to memory.
++ of loop; statement storing to memory. Moreover, for unvectorizable
++ loops, we try to find isomorphic stmts from grouped loads and build
++ temp arrays as new seed statements.
+ 2) Build reduced dependence graph (RDG) for loop to be distributed.
+ The vertices (RDG:V) model all statements in the loop and the edges
+ (RDG:E) model flow and control dependencies between statements.
+@@ -90,6 +133,8 @@ along with GCC; see the file COPYING3. If not see
+ data reuse. */
+
+ #include "config.h"
++#define INCLUDE_MAP
++#define INCLUDE_ALGORITHM
+ #include "system.h"
+ #include "coretypes.h"
+ #include "backend.h"
+@@ -115,6 +160,7 @@ along with GCC; see the file COPYING3. If not see
+ #include "tree-vectorizer.h"
+ #include "tree-eh.h"
+ #include "gimple-fold.h"
++#include "optabs-tree.h"
+ #include "tree-affine.h"
+ #include "intl.h"
+ #include "rtl.h"
+@@ -188,6 +234,52 @@ struct rdg_vertex
+ #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
+ #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
+
++/* Results of isomorphic group analysis. */
++#define UNINITIALIZED (0)
++#define ISOMORPHIC (1)
++#define HETEROGENEOUS (1 << 1)
++#define UNCERTAIN (1 << 2)
++
++/* Information of a stmt while analyzing isomorphic use in group. */
++
++typedef struct _group_info
++{
++ gimple *stmt;
++
++ /* True if stmt can be a cut point. */
++ bool cut_point;
++
++ /* For a use_stmt with two rhs operands, one of which is the lhs of
++ stmt. If the other is not known to be isomorphic, mark it uncertain. */
++ bool uncertain;
++
++ /* True if the search for isomorphic stmts reached a heterogeneous
++ group or a MEM stmt. */
++ bool done;
++
++ _group_info ()
++ {
++ stmt = NULL;
++ cut_point = false;
++ uncertain = false;
++ done = false;
++ }
++} *group_info;
++
++/* PAIR of cut points and corresponding profit. */
++typedef std::pair<vec<gimple *> *, int> stmts_profit;
++
++/* MAP of vector factor VF and corresponding stmts_profit PAIR. */
++typedef std::map<unsigned, stmts_profit> vf_stmts_profit_map;
++
++/* PAIR of group_num and iteration_num. We consider rhs operands from the
++ same group and iteration isomorphic. */
++typedef std::pair<unsigned, unsigned> group_iteration;
++
++/* An isomorphic stmt is determined by the lhs of its use_stmt, group_num
++ and the iteration_num when we insert this stmt into this map. */
++typedef std::map<tree, group_iteration> isomer_stmt_lhs;
++
+ /* Data dependence type. */
+
+ enum rdg_dep_type
+@@ -600,13 +692,14 @@ class loop_distribution
+ /* Returns true when PARTITION1 and PARTITION2 access the same memory
+ object in RDG. */
+ bool share_memory_accesses (struct graph *rdg,
+- partition *partition1, partition *partition2);
++ partition *partition1, partition *partition2,
++ hash_set<tree> *excluded_arrays);
+
+ /* For each seed statement in STARTING_STMTS, this function builds
+ partition for it by adding depended statements according to RDG.
+ All partitions are recorded in PARTITIONS. */
+ void rdg_build_partitions (struct graph *rdg,
+- vec<gimple *> starting_stmts,
++ vec<gimple *> *starting_stmts,
+ vec<partition *> *partitions);
+
+ /* Compute partition dependence created by the data references in DRS1
+@@ -643,15 +736,50 @@ class loop_distribution
+
+ /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
+ ALIAS_DDRS contains ddrs which need runtime alias check. */
+- void finalize_partitions (class loop *loop, vec<struct partition *>
+- *partitions, vec<ddr_p> *alias_ddrs);
++ void finalize_partitions (class loop *loop,
++ vec<struct partition *> *partitions,
++ vec<ddr_p> *alias_ddrs, bitmap producers);
++
++ /* Analyze loop form and if it's vectorizable to decide if we need to
++ insert temp arrays to distribute it. */
++ bool may_insert_temp_arrays (loop_p loop, struct graph *&rdg,
++ control_dependences *cd);
++
++ /* Reset gimple_uid of GIMPLE_DEBUG and GIMPLE_LABEL to -1. */
++ void reset_gimple_uid (loop_p loop);
++
++ bool check_loop_vectorizable (loop_p loop);
++
++ inline void rebuild_rdg (loop_p loop, struct graph *&rdg,
++ control_dependences *cd);
++
++ /* If loop is not distributed, remove inserted temp arrays. */
++ void remove_insertion (loop_p loop, struct graph *flow_only_rdg,
++ bitmap producers, struct partition *partition);
++
++ /* Insert temp arrays if isomorphic computation exists. Temp arrays will be
++ regarded as SEED_STMTS for building partitions in succeeding processes. */
++ bool insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts,
++ hash_set<tree> *tmp_array_vars, bitmap producers);
++
++ void build_producers (loop_p loop, bitmap producers,
++ vec<gimple *> &transformed);
++
++ void do_insertion (loop_p loop, struct graph *flow_only_rdg, tree iv,
++ bitmap cut_points, hash_set <tree> *tmp_array_vars,
++ bitmap producers);
++
++ /* Fuse PARTITIONS built from inserted temp arrays into one partition,
++ fuse the rest into another. */
++ void merge_remaining_partitions (vec<struct partition *> *partitions,
++ bitmap producers);
+
+ /* Distributes the code from LOOP in such a way that producer statements
+ are placed before consumer statements. Tries to separate only the
+ statements from STMTS into separate loops. Returns the number of
+ distributed loops. Set NB_CALLS to number of generated builtin calls.
+ Set *DESTROY_P to whether LOOP needs to be destroyed. */
+- int distribute_loop (class loop *loop, const vec<gimple *> &stmts,
++ int distribute_loop (class loop *loop, vec<gimple *> &stmts,
+ control_dependences *cd, int *nb_calls, bool *destroy_p,
+ bool only_patterns_p);
+
+@@ -1893,7 +2021,8 @@ loop_distribution::classify_partition (loop_p loop,
+
+ bool
+ loop_distribution::share_memory_accesses (struct graph *rdg,
+- partition *partition1, partition *partition2)
++ partition *partition1, partition *partition2,
++ hash_set <tree> *excluded_arrays)
+ {
+ unsigned i, j;
+ bitmap_iterator bi, bj;
+@@ -1927,7 +2056,10 @@ loop_distribution::share_memory_accesses (struct graph *rdg,
+ if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
+ && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
+ && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
+- && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
++ && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0)
++ /* An exception, if PARTITION1 and PARTITION2 contain the
++ temp array we inserted, do not merge them. */
++ && !excluded_arrays->contains (DR_REF (dr1)))
+ return true;
+ }
+ }
+@@ -1941,14 +2073,14 @@ loop_distribution::share_memory_accesses (struct graph *rdg,
+
+ void
+ loop_distribution::rdg_build_partitions (struct graph *rdg,
+- vec<gimple *> starting_stmts,
++ vec<gimple *> *starting_stmts,
+ vec<partition *> *partitions)
+ {
+ auto_bitmap processed;
+ int i;
+ gimple *stmt;
+
+- FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
++ FOR_EACH_VEC_ELT (*starting_stmts, i, stmt)
+ {
+ int v = rdg_vertex_for_stmt (rdg, stmt);
+
+@@ -2912,13 +3044,47 @@ fuse_memset_builtins (vec<struct partition *> *partitions)
+ }
+ }
+
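++/* Fuse the PARTITIONS that contain stmts from PRODUCERS (the inserted temp
++ arrays) into one partition, and fuse all remaining partitions into
++ another. */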
++void
++loop_distribution::merge_remaining_partitions
++ (vec<struct partition *> *partitions,
++ bitmap producers)
++{
++ struct partition *partition = NULL;
++ struct partition *p1 = NULL, *p2 = NULL;
++ for (unsigned i = 0; partitions->iterate (i, &partition); i++)
++ {
++ if (bitmap_intersect_p (producers, partition->stmts))
++ {
++ if (p1 == NULL)
++ {
++ p1 = partition;
++ continue;
++ }
++ partition_merge_into (NULL, p1, partition, FUSE_FINALIZE);
++ }
++ else
++ {
++ if (p2 == NULL)
++ {
++ p2 = partition;
++ continue;
++ }
++ partition_merge_into (NULL, p2, partition, FUSE_FINALIZE);
++ }
++ partitions->unordered_remove (i);
++ partition_free (partition);
++ i--;
++ }
++}
++
+ void
+ loop_distribution::finalize_partitions (class loop *loop,
+ vec<struct partition *> *partitions,
+- vec<ddr_p> *alias_ddrs)
++ vec<ddr_p> *alias_ddrs,
++ bitmap producers)
+ {
+ unsigned i;
+- struct partition *partition, *a;
++ struct partition *partition;
+
+ if (partitions->length () == 1
+ || alias_ddrs->length () > 0)
+@@ -2950,13 +3116,7 @@ loop_distribution::finalize_partitions (class loop *loop,
+ || (loop->inner == NULL
+ && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
+ {
+- a = (*partitions)[0];
+- for (i = 1; partitions->iterate (i, &partition); ++i)
+- {
+- partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
+- partition_free (partition);
+- }
+- partitions->truncate (1);
++ merge_remaining_partitions (partitions, producers);
+ }
+
+ /* Fuse memset builtins if possible. */
+@@ -2964,6 +3124,1216 @@ loop_distribution::finalize_partitions (class loop *loop,
+ fuse_memset_builtins (partitions);
+ }
+
++/* Gimple uids of GIMPLE_DEBUG and GIMPLE_LABEL were changed during
++ vect_analyze_loop; reset them to -1. */
++
++void
++loop_distribution::reset_gimple_uid (loop_p loop)
++{
++ basic_block *bbs = get_loop_body_in_custom_order (loop, this,
++ bb_top_order_cmp_r);
++ for (int i = 0; i < int (loop->num_nodes); i++)
++ {
++ basic_block bb = bbs[i];
++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
++ gsi_next (&gsi))
++ {
++ gimple *stmt = gsi_stmt (gsi);
++ if (is_gimple_debug (stmt) || gimple_code (stmt) == GIMPLE_LABEL)
++ gimple_set_uid (stmt, -1);
++ }
++ }
++ free (bbs);
++}
++
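++/* Return true if LOOP cannot be vectorized as-is but does contain grouped
++ loads, i.e., it is a candidate for temp array insertion. */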
++bool
++loop_distribution::check_loop_vectorizable (loop_p loop)
++{
++ vec_info_shared shared;
++ vect_analyze_loop (loop, &shared, true);
++ loop_vec_info vinfo = loop_vec_info_for_loop (loop);
++ reset_gimple_uid (loop);
++ if (vinfo == NULL)
++ {
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ fprintf (dump_file,
++ "Loop %d no temp array insertion: bad data access pattern,"
++ " unable to generate loop_vinfo.\n", loop->num);
++ return false;
++ }
++ if (vinfo->vectorizable)
++ {
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ fprintf (dump_file, "Loop %d no temp array insertion: original loop"
++ " can be vectorized without distribution.\n",
++ loop->num);
++ delete vinfo;
++ loop->aux = NULL;
++ return false;
++ }
++ if (vinfo->grouped_loads.length () == 0)
++ {
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ fprintf (dump_file, "Loop %d no temp array insertion: original loop"
++ " has no grouped loads.\n" , loop->num);
++ delete vinfo;
++ loop->aux = NULL;
++ return false;
++ }
++ return true;
++}
++
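++/* Free the current RDG of LOOP and build a fresh one. */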
++inline void
++loop_distribution::rebuild_rdg (loop_p loop, struct graph *&rdg,
++ control_dependences *cd)
++{
++ free_rdg (rdg);
++ rdg = build_rdg (loop, cd);
++ gcc_checking_assert (rdg != NULL);
++}
++
++bool
++loop_distribution::may_insert_temp_arrays (loop_p loop, struct graph *&rdg,
++ control_dependences *cd)
++{
++ if (!(flag_tree_slp_transpose_vectorize && flag_tree_loop_vectorize))
++ return false;
++
++ /* Only loops with two basic blocks HEADER and LATCH are supported. HEADER
++ is the main body of a LOOP and LATCH is the basic block that controls the
++ LOOP execution. The size of a temp array is determined by the number of
++ loop iterations, so it must be a constant. */
++ tree loop_extent = number_of_latch_executions (loop);
++ if (loop->inner != NULL || loop->num_nodes > 2
++ || TREE_CODE (loop_extent) != INTEGER_CST)
++ {
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ fprintf (dump_file, "Loop %d: no temp array insertion: bad loop"
++ " form.\n", loop->num);
++ return false;
++ }
++
++ if (loop->dont_vectorize)
++ {
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ fprintf (dump_file, "Loop %d: no temp array insertion: this loop"
++ " should never be vectorized.\n",
++ loop->num);
++ return false;
++ }
++
++ /* Do not distribute a LOOP that is able to be vectorized without
++ distribution. */
++ if (!check_loop_vectorizable (loop))
++ {
++ rebuild_rdg (loop, rdg, cd);
++ return false;
++ }
++
++ rebuild_rdg (loop, rdg, cd);
++ return true;
++}
++
++/* Return the max grouped load length if every group's length satisfies
++ len = 2 ^ n. Otherwise, return 0. */
++
++static unsigned
++get_max_vf (loop_vec_info vinfo)
++{
++ unsigned size = 0;
++ unsigned max = 0;
++ stmt_vec_info stmt_info;
++ unsigned i = 0;
++ FOR_EACH_VEC_ELT (vinfo->grouped_loads, i, stmt_info)
++ {
++ size = stmt_info->size;
++ if (!pow2p_hwi (size))
++ return 0;
++ max = size > max ? size : max;
++ }
++ return max;
++}
++
++/* Convert grouped_loads from a linked list to vectors of length vf. Init
++ the group_info of each stmt in the same group and put them into a vector,
++ and these vectors make up WORKLISTS. We will re-analyze a group if it is
++ uncertain, so we regard WORKLISTS as a circular queue. */
++
++static unsigned
++build_queue (loop_vec_info vinfo, unsigned vf,
++ vec<vec<group_info> *> &worklists)
++{
++ stmt_vec_info stmt_info;
++ unsigned i = 0;
++ group_info ginfo = NULL;
++ vec<group_info> *worklist = NULL;
++ FOR_EACH_VEC_ELT (vinfo->grouped_loads, i, stmt_info)
++ {
++ unsigned group_size = stmt_info->size;
++ stmt_vec_info c_stmt_info = stmt_info;
++ bool succ = true;
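++ /* Split the group into consecutive chunks of VF stmts; a tail of
++ fewer than VF stmts is not queued. */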
++ while (group_size >= vf)
++ {
++ vec_alloc (worklist, vf);
++ for (unsigned j = 0; j < vf; ++j)
++ {
++ if (c_stmt_info == NULL)
++ {
++ succ = false;
++ break;
++ }
++ ginfo = new _group_info ();
++ ginfo->stmt = c_stmt_info->stmt;
++ worklist->safe_push (ginfo);
++ c_stmt_info = c_stmt_info->next_element;
++ }
++ if (!succ)
++ {
++ unsigned k = 0;
++ ginfo = NULL;
++ FOR_EACH_VEC_ELT (*worklist, k, ginfo)
++ delete ginfo;
++ vec_free (worklist);
++ break;
++ }
++ worklists.safe_push (worklist);
++ group_size -= vf;
++ }
++ }
++ return worklists.length ();
++}
++
++static bool
++check_same_oprand_type (tree op1, tree op2)
++{
++ tree type1 = TREE_TYPE (op1);
++ tree type2 = TREE_TYPE (op2);
++ if (TREE_CODE (type1) != INTEGER_TYPE && TREE_CODE (type1) != REAL_TYPE)
++ return false;
++
++ return (TREE_CODE (type1) == TREE_CODE (type2)
++ && TYPE_UNSIGNED (type1) == TYPE_UNSIGNED (type2)
++ && TYPE_PRECISION (type1) == TYPE_PRECISION (type2));
++}
++
++static bool
++bit_field_p (gimple *stmt)
++{
++ unsigned i = 0;
++ auto_vec<data_reference_p, 2> datarefs_vec;
++ data_reference_p dr;
++ if (!find_data_references_in_stmt (NULL, stmt, &datarefs_vec))
++ return true;
++
++ FOR_EACH_VEC_ELT (datarefs_vec, i, dr)
++ {
++ if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
++ && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
++ return true;
++ }
++ return false;
++}
++
++static inline bool
++shift_operation (enum tree_code op)
++{
++ return op == LSHIFT_EXPR || op == RSHIFT_EXPR || op == LROTATE_EXPR
++ || op == RROTATE_EXPR;
++}
++
++/* Return relationship between USE_STMT and the first use_stmt of the group.
++ RHS1 is the lhs of stmt recorded in group_info. If another rhs of use_stmt
++ is not a constant, return UNCERTAIN and re-check it later. */
++
++static unsigned
++check_isomorphic (gimple *use_stmt, gimple *first,
++ tree rhs1, vec<tree> &hetero_lhs)
++{
++ /* Check same operation. */
++ enum tree_code rhs_code_first = gimple_assign_rhs_code (first);
++ enum tree_code rhs_code_current = gimple_assign_rhs_code (use_stmt);
++ if (rhs_code_first != rhs_code_current)
++ return HETEROGENEOUS;
++
++ /* For shift operations, operands should be equal. */
++ if (shift_operation (rhs_code_current))
++ {
++ tree shift_op_first = gimple_assign_rhs2 (first);
++ tree shift_op_current = gimple_assign_rhs2 (use_stmt);
++ if (!operand_equal_p (shift_op_first, shift_op_current, 0)
++ || !TREE_CONSTANT (shift_op_first))
++ return HETEROGENEOUS;
++
++ return ISOMORPHIC;
++ }
++ /* Type conversion expr or assignment. */
++ if (gimple_num_ops (first) == 2)
++ return (rhs_code_first == NOP_EXPR || rhs_code_first == CONVERT_EXPR
++ || rhs_code_first == SSA_NAME) ? ISOMORPHIC : HETEROGENEOUS;
++
++ /* We find USE_STMT from the lhs of a stmt, denote it as rhs1 of USE_STMT
++ and the other one as rhs2. Check if the define-stmt of the current rhs2
++ is isomorphic with the define-stmt of rhs2 in the first USE_STMT of this
++ group. */
++ tree rhs2_first = gimple_assign_rhs1 (use_stmt) == rhs1
++ ? gimple_assign_rhs2 (first) : gimple_assign_rhs1 (first);
++ tree rhs2_curr = gimple_assign_rhs1 (use_stmt) == rhs1
++ ? gimple_assign_rhs2 (use_stmt) : gimple_assign_rhs1 (use_stmt);
++
++ if (check_same_oprand_type (rhs2_first, rhs2_curr))
++ {
++ if (TREE_CONSTANT (rhs2_curr))
++ return ISOMORPHIC;
++ else if (hetero_lhs.contains (rhs2_curr))
++ return HETEROGENEOUS;
++
++ /* Provisionally set the stmt as uncertain and analyze the whole group
++ in function CHECK_UNCERTAIN later if all use_stmts are uncertain. */
++ return UNCERTAIN;
++ }
++ return HETEROGENEOUS;
++}
++
++static bool
++unsupported_operations (gimple *stmt)
++{
++ enum tree_code code = gimple_assign_rhs_code (stmt);
++ return code == COND_EXPR;
++}
++
++/* Check if the single use_stmt of STMT is isomorphic with the use_stmt of
++ the first member of the current group. */
++
++static unsigned
++check_use_stmt (group_info elmt, gimple *&first,
++ vec<gimple *> &tmp_stmts, vec<tree> &hetero_lhs)
++{
++ if (gimple_code (elmt->stmt) != GIMPLE_ASSIGN)
++ return HETEROGENEOUS;
++ use_operand_p dummy;
++ tree lhs = gimple_assign_lhs (elmt->stmt);
++ gimple *use_stmt = NULL;
++ single_imm_use (lhs, &dummy, &use_stmt);
++ /* STMTs with three rhs operands are not supported, e.g., GIMPLE_COND. */
++ if (use_stmt == NULL || gimple_code (use_stmt) != GIMPLE_ASSIGN
++ || unsupported_operations (use_stmt) || bit_field_p (use_stmt))
++ return HETEROGENEOUS;
++ tmp_stmts.safe_push (use_stmt);
++ if (first == NULL)
++ {
++ first = use_stmt;
++ return UNINITIALIZED;
++ }
++ /* Check if the current use_stmt and the first member's use_stmt in the
++ group are of the same type. */
++ tree first_lhs = gimple_assign_lhs (first);
++ tree curr_lhs = gimple_assign_lhs (use_stmt);
++ if (!check_same_oprand_type (first_lhs, curr_lhs))
++ return HETEROGENEOUS;
++ return check_isomorphic (use_stmt, first, lhs, hetero_lhs);
++}
++
++/* Replace stmt field in group with stmts in TMP_STMTS, and insert their
++ lhs_info to ISOMER_LHS. */
++
++static void
++update_isomer_lhs (vec<group_info> *group, unsigned group_num,
++ unsigned iteration, isomer_stmt_lhs &isomer_lhs,
++ vec<gimple *> &tmp_stmts, int &profit,
++ vec<unsigned> &merged_groups)
++{
++ group_info elmt = NULL;
++ /* Do not insert temp arrays if the isomorphic stmts from a grouped load
++ have only casting operations. Once an isomorphic calculation has three
++ operands, such as a plus operation, this group can be regarded as a
++ cut point. */
++ bool operated = (gimple_num_ops (tmp_stmts[0]) == 3);
++ /* Do not insert temp arrays if the search for isomorphic stmts reaches
++ MEM stmts. */
++ bool has_vdef = gimple_vdef (tmp_stmts[0]) != NULL;
++ bool merge = false;
++ for (unsigned i = 0; i < group->length (); i++)
++ {
++ elmt = (*group)[i];
++ elmt->stmt = has_vdef ? NULL : tmp_stmts[i];
++ elmt->cut_point = has_vdef ? false : (elmt->cut_point || operated);
++ elmt->uncertain = false;
++ elmt->done = has_vdef;
++ tree lhs = gimple_assign_lhs (tmp_stmts[i]);
++ if (isomer_lhs.find (lhs) != isomer_lhs.end ())
++ {
++ merge = true;
++ continue;
++ }
++ isomer_lhs[lhs] = std::make_pair (group_num, iteration);
++ }
++ if (merge)
++ {
++ merged_groups.safe_push (group_num);
++ profit = 0;
++ return;
++ }
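++ /* Vectorizing the group turns its TMP_STMTS.length () scalar stmts into
++ roughly one vector stmt, saving length () - 1 scalar stmts. */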
++ enum vect_cost_for_stmt kind = scalar_stmt;
++ int scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
++ profit = (tmp_stmts.length () - 1) * scalar_cost;
++}
++
++/* Try to find each rhs2 in ISOMER_LHS; if all were found and their group_num
++ and iteration are the same, GROUP is isomorphic. */
++
++static unsigned
++check_isomorphic_rhs (vec<group_info> *group, vec<gimple *> &tmp_stmts,
++ isomer_stmt_lhs &isomer_lhs)
++{
++ group_info elmt = NULL;
++ gimple *stmt = NULL;
++ unsigned j = 0;
++ unsigned group_num = -1u;
++ unsigned iteration = -1u;
++ tree rhs1 = NULL;
++ tree rhs2 = NULL;
++ unsigned status = UNINITIALIZED;
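++ /* STATUS accumulates result bits; a mix of ISOMORPHIC and UNCERTAIN
++ matches no single case in the caller's switch (check_uncertain) and is
++ treated as HETEROGENEOUS. */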
++ FOR_EACH_VEC_ELT (*group, j, elmt)
++ {
++ rhs1 = gimple_assign_lhs (elmt->stmt);
++ stmt = tmp_stmts[j];
++ rhs2 = (rhs1 == gimple_assign_rhs1 (stmt))
++ ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
++ isomer_stmt_lhs::iterator iter = isomer_lhs.find (rhs2);
++ if (iter != isomer_lhs.end ())
++ {
++ if (group_num == -1u)
++ {
++ group_num = iter->second.first;
++ iteration = iter->second.second;
++ status |= ISOMORPHIC;
++ continue;
++ }
++ if (iter->second.first == group_num
++ && iter->second.second == iteration)
++ {
++ status |= ISOMORPHIC;
++ continue;
++ }
++ return HETEROGENEOUS;
++ }
++ else
++ status |= UNCERTAIN;
++ }
++ return status;
++}
++
++/* Update group_info for uncertain groups. */
++
++static void
++update_uncertain_stmts (vec<group_info> *group, unsigned group_num,
++ unsigned iteration, vec<gimple *> &tmp_stmts)
++{
++ unsigned j = 0;
++ group_info elmt = NULL;
++ FOR_EACH_VEC_ELT (*group, j, elmt)
++ {
++ elmt->uncertain = true;
++ elmt->done = false;
++ }
++}
++
++/* Push stmts in TMP_STMTS into HETERO_LHS. */
++
++static void
++set_hetero (vec<group_info> *group, vec<tree> &hetero_lhs,
++ vec<gimple *> &tmp_stmts)
++{
++ group_info elmt = NULL;
++ unsigned i = 0;
++ for (i = 0; i < group->length (); i++)
++ {
++ elmt = (*group)[i];
++ elmt->uncertain = false;
++ elmt->done = true;
++ }
++ gimple *stmt = NULL;
++ FOR_EACH_VEC_ELT (tmp_stmts, i, stmt)
++ if (stmt != NULL)
++ hetero_lhs.safe_push (gimple_assign_lhs (stmt));
++}
++
++/* Given an uncertain group, TMP_STMTS are use_stmts of stmts in GROUP.
++ Rhs1 is the lhs of stmt in GROUP, rhs2 is the other rhs of USE_STMT.
++
++ Try to find rhs2 in ISOMER_LHS, if all found rhs2 have same group_num
++ and iteration, this uncertain group is isomorphic.
++
++ If no rhs matched, this GROUP remains uncertain and update group_info.
++
++ Otherwise, this GROUP is heterogeneous and return true to end analysis
++ for this group. */
++
++static bool
++check_uncertain (vec<group_info> *group, unsigned group_num,
++ unsigned iteration, int &profit,
++ vec<gimple *> &tmp_stmts, isomer_stmt_lhs &isomer_lhs,
++ vec<tree> &hetero_lhs, vec<unsigned> &merged_groups)
++{
++ unsigned status = check_isomorphic_rhs (group, tmp_stmts, isomer_lhs);
++ bool done = false;
++ switch (status)
++ {
++ case UNCERTAIN:
++ update_uncertain_stmts (group, group_num, iteration, tmp_stmts);
++ break;
++ case ISOMORPHIC:
++ update_isomer_lhs (group, group_num, iteration, isomer_lhs,
++ tmp_stmts, profit, merged_groups);
++ break;
++ default:
++ set_hetero (group, hetero_lhs, tmp_stmts);
++ done = true;
++ }
++ return done;
++}
++
++/* Return false if the analysis of this group is not finished, i.e., it is
++ still isomorphic or uncertain. Calculate the profit if vectorized. */
++
++static bool
++check_group (vec<group_info> *group, unsigned group_num, unsigned iteration,
++ int &profit, vec<unsigned> &merged_groups,
++ isomer_stmt_lhs &isomer_lhs, vec<tree> &hetero_lhs)
++{
++ unsigned j = 0;
++ group_info elmt = NULL;
++ gimple *first = NULL;
++ unsigned res = 0;
++ /* Record single use stmts in TMP_STMTS and decide whether to replace the
++ stmts in ginfo in succeeding processes. */
++ auto_vec<gimple *, 12> tmp_stmts;
++ FOR_EACH_VEC_ELT (*group, j, elmt)
++ {
++ if (merged_groups.contains (group_num))
++ return true;
++ res |= check_use_stmt (elmt, first, tmp_stmts, hetero_lhs);
++ }
++
++ /* Update each group member according to RES. */
++ switch (res)
++ {
++ case ISOMORPHIC:
++ update_isomer_lhs (group, group_num, iteration, isomer_lhs,
++ tmp_stmts, profit, merged_groups);
++ return false;
++ case UNCERTAIN:
++ return check_uncertain (group, group_num, iteration, profit,
++ tmp_stmts, isomer_lhs, hetero_lhs,
++ merged_groups);
++ default:
++ set_hetero (group, hetero_lhs, tmp_stmts);
++ return true;
++ }
++}
++
++/* Return true if all analyses are done except for uncertain groups. */
++
++static bool
++end_of_search (vec<vec<group_info> *> &circular_queue,
++ vec<unsigned> &merged_groups)
++{
++ unsigned i = 0;
++ vec<group_info> *group = NULL;
++ group_info elmt = NULL;
++ FOR_EACH_VEC_ELT (circular_queue, i, group)
++ {
++ if (merged_groups.contains (i))
++ continue;
++ elmt = (*group)[0];
++ /* If there is any isomorphic use_stmt left, continue the analysis of
++ isomorphic use_stmts. */
++ if (!elmt->done && !elmt->uncertain)
++ return false;
++ }
++ return true;
++}
++
++/* Push valid stmts to STMTS as cut points. */
++
++static bool
++check_any_cutpoints (vec<vec<group_info> *> &circular_queue,
++ vec<gimple *> *&stmts, vec<unsigned> &merged_groups)
++{
++ unsigned front = 0;
++ vec<group_info> *group = NULL;
++ group_info elmt = NULL;
++ unsigned max = circular_queue.length () * circular_queue[0]->length ();
++ vec_alloc (stmts, max);
++ while (front < circular_queue.length ())
++ {
++ unsigned i = 0;
++ if (merged_groups.contains (front))
++ {
++ front++;
++ continue;
++ }
++ group = circular_queue[front++];
++ FOR_EACH_VEC_ELT (*group, i, elmt)
++ if (elmt->stmt != NULL && elmt->done && elmt->cut_point)
++ stmts->safe_push (elmt->stmt);
++ }
++ return stmts->length () != 0;
++}
++
++/* Grouped loads are isomorphic. Make a pair of group number and iteration
++ and map each load stmt to it. We set iteration 0 here. */
++
++static void
++init_isomer_lhs (vec<vec<group_info> *> &groups, isomer_stmt_lhs &isomer_lhs)
++{
++ vec<group_info> *group = NULL;
++ group_info elmt = NULL;
++ unsigned i = 0;
++ FOR_EACH_VEC_ELT (groups, i, group)
++ {
++ unsigned j = 0;
++ FOR_EACH_VEC_ELT (*group, j, elmt)
++ isomer_lhs[gimple_assign_lhs (elmt->stmt)] = std::make_pair (i, 0);
++ }
++}
++
++/* This is not a strict analysis of load/store profit. Assume scalar and
++ vector loads/stores have the same cost. The result PROFIT equals the
++ profit from vectorizing the scalar loads/stores minus the cost of the
++ vectorized loads/stores. */
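++
++/* For illustration (an assumed example, not part of the original analysis):
++ with SCALAR_MEM_OPS = 16, VF = 4 and NEW_MEM_OPS = 4, and assuming the
++ default cost hook returns 1 for both scalar_load and scalar_store, this
++ computes PROFIT = (16 - 16/4) * 1 - (4/4) * 1 - (4/4) * 1 = 10. */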
++
++static int
++load_store_profit (unsigned scalar_mem_ops, unsigned vf, unsigned new_mem_ops)
++{
++ int profit = 0;
++ enum vect_cost_for_stmt kind = scalar_load;
++ int scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
++ profit += (scalar_mem_ops - (scalar_mem_ops / vf)) * scalar_cost;
++ profit -= new_mem_ops / vf * scalar_cost;
++ kind = scalar_store;
++ scalar_cost = builtin_vectorization_cost (kind, NULL_TREE, 0);
++ profit -= new_mem_ops / vf * scalar_cost;
++ return profit;
++}
++
++/* Breadth first search the graph consisting of define-use chains, starting
++ from the circular queue initialized by function BUILD_QUEUE. Find the
++ single use of each stmt in a group and check if they are isomorphic.
++ Isomorphic is defined as same rhs type, same operator, and isomorphic
++ calculation of each rhs starting from a load. If another rhs is uncertain
++ to be isomorphic, put it at the end of the circular queue and re-analyze
++ it during the next iteration. If a group shares the same use_stmt with
++ another group, skip one of them in succeeding processes as merged.
++ Iterate the circular queue until all remaining groups are heterogeneous
++ or the search reaches MEM stmts. If all other groups have finished the
++ analysis and the remaining groups are uncertain, return false to avoid
++ an endless loop. */
++
++bool
++bfs_find_isomer_stmts (vec<vec<group_info> *> &circular_queue,
++ stmts_profit &profit_pair, unsigned vf,
++ bool &reach_vdef)
++{
++ isomer_stmt_lhs isomer_lhs;
++ auto_vec<tree> hetero_lhs;
++ auto_vec<unsigned> merged_groups;
++ vec<group_info> *group = NULL;
++ /* True if analysis finishes. */
++ bool done = false;
++ int profit_sum = 0;
++ vec<gimple *> *stmts = NULL;
++ init_isomer_lhs (circular_queue, isomer_lhs);
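++ /* Iteration 0 was assigned to the loads themselves by init_isomer_lhs,
++ so the use_stmt analysis starts at iteration 1. */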
++ for (unsigned i = 1; !done; ++i)
++ {
++ unsigned front = 0;
++ /* Re-initialize DONE to TRUE while a new iteration begins. */
++ done = true;
++ while (front < circular_queue.length ())
++ {
++ int profit = 0;
++ group = circular_queue[front];
++ done &= check_group (group, front, i, profit, merged_groups,
++ isomer_lhs, hetero_lhs);
++ profit_sum += profit;
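++ /* A cleared stmt together with a non-zero profit means this group's
++ search reached a MEM stmt (see update_isomer_lhs); give up. */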
++ if (profit != 0 && (*group)[0]->stmt == NULL)
++ {
++ reach_vdef = true;
++ return false;
++ }
++ ++front;
++ }
++ /* Uncertain result, return. */
++ if (!done && end_of_search (circular_queue, merged_groups))
++ return false;
++ }
++ if (check_any_cutpoints (circular_queue, stmts, merged_groups))
++ {
++ profit_pair.first = stmts;
++ unsigned loads = circular_queue.length () * circular_queue[0]->length ();
++ profit_pair.second = profit_sum + load_store_profit (loads, vf,
++ stmts->length ());
++ if (profit_pair.second > 0)
++ return true;
++ }
++ return false;
++}
++
++/* Free the memory allocated for each ginfo. */
++
++static void
++free_ginfos (vec<vec<group_info> *> &worklists)
++{
++ vec<group_info> *worklist;
++ unsigned i = 0;
++ while (i < worklists.length ())
++ {
++ worklist = worklists[i++];
++ group_info ginfo;
++ unsigned j = 0;
++ FOR_EACH_VEC_ELT (*worklist, j, ginfo)
++ delete ginfo;
++ vec_free (worklist);
++ }
++}
++
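++/* Release the candidate stmt vectors recorded in CANDI_STMTS. */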
++static void
++release_tmp_stmts (vf_stmts_profit_map &candi_stmts)
++{
++ vf_stmts_profit_map::iterator iter;
++ for (iter = candi_stmts.begin (); iter != candi_stmts.end (); ++iter)
++ iter->second.first->release ();
++}
++
++/* Choose the group of stmts with maximum profit. */
++
++static bool
++decide_stmts_by_profit (vf_stmts_profit_map &candi_stmts, vec<gimple *> &stmts)
++{
++ vf_stmts_profit_map::iterator iter;
++ int profit = 0;
++ int max = 0;
++ vec<gimple *> *tmp = NULL;
++ for (iter = candi_stmts.begin (); iter != candi_stmts.end (); ++iter)
++ {
++ profit = iter->second.second;
++ if (profit > max)
++ {
++ tmp = iter->second.first;
++ max = profit;
++ }
++ }
++ if (max == 0)
++ {
++ release_tmp_stmts (candi_stmts);
++ return false;
++ }
++ unsigned i = 0;
++ gimple *stmt = NULL;
++ FOR_EACH_VEC_ELT (*tmp, i, stmt)
++ stmts.safe_push (stmt);
++ release_tmp_stmts (candi_stmts);
++ return stmts.length () != 0;
++}
++
++/* Find isomorphic stmts from grouped loads with vector factor VF.
++
++ Given source code as follows, ignoring casts.
++
++ a0 = (a[0] + b[0]) + ((a[4] - b[4]) << 16);
++ a1 = (a[1] + b[1]) + ((a[5] - b[5]) << 16);
++ a2 = (a[2] + b[2]) + ((a[6] - b[6]) << 16);
++ a3 = (a[3] + b[3]) + ((a[7] - b[7]) << 16);
++
++ We get grouped loads in VINFO as
++
++ GROUP_1 GROUP_2
++ _1 = *a _11 = *b
++ _2 = *(a + 1) _12 = *(b + 1)
++ _3 = *(a + 2) _13 = *(b + 2)
++ _4 = *(a + 3) _14 = *(b + 3)
++ _5 = *(a + 4) _15 = *(b + 4)
++ _6 = *(a + 5) _16 = *(b + 5)
++ _7 = *(a + 6) _17 = *(b + 6)
++ _8 = *(a + 7) _18 = *(b + 7)
++
++ First we try VF = 8 and get two worklists
++
++ WORKLIST_1 WORKLIST_2
++ _1 = *a _11 = *b
++ _2 = *(a + 1) _12 = *(b + 1)
++ _3 = *(a + 2) _13 = *(b + 2)
++ _4 = *(a + 3) _14 = *(b + 3)
++ _5 = *(a + 4) _15 = *(b + 4)
++ _6 = *(a + 5) _16 = *(b + 5)
++ _7 = *(a + 6) _17 = *(b + 6)
++ _8 = *(a + 7) _18 = *(b + 7)
++
++ We find _111 = _1 + _11 and _115 = _5 - _15 are not isomorphic,
++ so we try VF = VF / 2.
++
++ GROUP_1 GROUP_2
++ _1 = *a _5 = *(a + 4)
++ _2 = *(a + 1) _6 = *(a + 5)
++ _3 = *(a + 2) _7 = *(a + 6)
++ _4 = *(a + 3) _8 = *(a + 7)
++
++ GROUP_3 GROUP_4
++ _11 = *b _15 = *(b + 4)
++ _12 = *(b + 1) _16 = *(b + 5)
++ _13 = *(b + 2) _17 = *(b + 6)
++ _14 = *(b + 3) _18 = *(b + 7)
++
++ We first analyze group_1, and find all operations are isomorphic, then
++ replace stmts in group_1 with their use_stmts. Group_2 as well.
++
++ GROUP_1 GROUP_2
++ _111 = _1 + _11 _115 = _5 - _15
++ _112 = _2 + _12 _116 = _6 - _16
++ _113 = _3 + _13 _117 = _7 - _17
++ _114 = _4 + _14 _118 = _8 - _18
++
++ When analyzing group_3 and group_4, we find their use_stmts are the same
++ as those of group_1 and group_2. So group_3 is regarded as merged into
++ group_1 and group_4 as merged into group_2. In later procedures, we will
++ skip group_3 and group_4.
++
++ We repeat such processing until the operations are not isomorphic or the
++ search reaches MEM stmts. In the given case, the search ends at a0, a1,
++ a2 and a3. */
++
++static bool
++find_isomorphic_stmts (loop_vec_info vinfo, vec<gimple *> &stmts)
++{
++ unsigned vf = get_max_vf (vinfo);
++ if (vf == 0)
++ return false;
++ auto_vec<vec<group_info> *> circular_queue;
++ /* Cut points and the corresponding vectorization profit for the
++ current vector factor. */
++ stmts_profit profit_map;
++ /* Map from vector factor to candidate cut points and profit. */
++ vf_stmts_profit_map candi_stmts;
++ bool reach_vdef = false;
++ while (vf > 2)
++ {
++ if (build_queue (vinfo, vf, circular_queue) == 0)
++ return false;
++ if (!bfs_find_isomer_stmts (circular_queue, profit_map, vf, reach_vdef))
++ {
++ if (reach_vdef)
++ {
++ release_tmp_stmts (candi_stmts);
++ free_ginfos (circular_queue);
++ circular_queue.release ();
++ return false;
++ }
++ vf /= 2;
++ free_ginfos (circular_queue);
++ circular_queue.release ();
++ continue;
++ }
++ candi_stmts[vf] = profit_map;
++ free_ginfos (circular_queue);
++ vf /= 2;
++ circular_queue.release ();
++ }
++ return decide_stmts_by_profit (candi_stmts, stmts);
++}
++
++/* Get iv from SEED_STMTS and make sure each seed_stmt has only one iv as index
++ and all indices are the same. */
++
++static tree
++find_index (vec<gimple *> seed_stmts)
++{
++ if (seed_stmts.length () == 0)
++ return NULL;
++ bool found_index = false;
++ tree index = NULL;
++ unsigned ui = 0;
++ for (ui = 0; ui < seed_stmts.length (); ui++)
++ {
++ if (!gimple_vdef (seed_stmts[ui]))
++ return NULL;
++ tree lhs = gimple_assign_lhs (seed_stmts[ui]);
++ unsigned num_index = 0;
++ while (TREE_CODE (lhs) == ARRAY_REF)
++ {
++ if (TREE_CODE (TREE_OPERAND (lhs, 1)) == SSA_NAME)
++ {
++ num_index++;
++ if (num_index > 1)
++ return NULL;
++ if (index == NULL)
++ {
++ index = TREE_OPERAND (lhs, 1);
++ found_index = true;
++ }
++ else if (index != TREE_OPERAND (lhs, 1))
++ return NULL;
++ }
++ lhs = TREE_OPERAND (lhs, 0);
++ }
++ if (!found_index)
++ return NULL;
++ }
++ return index;
++}
++
++/* Check if the expression of phi is an increment by a const. */
++
++static void
++check_phi_inc (struct vertex *v_phi, struct graph *rdg, bool &found_inc)
++{
++ struct graph_edge *e_phi;
++ for (e_phi = v_phi->succ; e_phi; e_phi = e_phi->succ_next)
++ {
++ struct vertex *v_inc = &(rdg->vertices[e_phi->dest]);
++ if (!is_gimple_assign (RDGV_STMT (v_inc))
++ || gimple_expr_code (RDGV_STMT (v_inc)) != PLUS_EXPR)
++ continue;
++ tree rhs1 = gimple_assign_rhs1 (RDGV_STMT (v_inc));
++ tree rhs2 = gimple_assign_rhs2 (RDGV_STMT (v_inc));
++ if (!(integer_onep (rhs1) || integer_onep (rhs2)))
++ continue;
++ struct graph_edge *e_inc;
++ /* Find a cycle with only two vertices, inc and phi: inc <--> phi. */
++ bool found_cycle = false;
++ for (e_inc = v_inc->succ; e_inc; e_inc = e_inc->succ_next)
++ {
++ if (e_inc->dest == e_phi->src)
++ {
++ found_cycle = true;
++ break;
++ }
++ }
++ if (!found_cycle)
++ continue;
++ found_inc = true;
++ }
++}
++
++/* Check if phi satisfies a form like PHI <0, i>. */
++
++static inline bool
++iv_check_phi_stmt (gimple *phi_stmt)
++{
++ return gimple_phi_num_args (phi_stmt) == 2
++ && (integer_zerop (gimple_phi_arg_def (phi_stmt, 0))
++ || integer_zerop (gimple_phi_arg_def (phi_stmt, 1)));
++}
++
++/* Make sure the iteration variable is a phi. */
++
++static tree
++get_iv_from_seed (struct graph *flow_only_rdg, vec<gimple *> seed_stmts)
++{
++ tree index = find_index (seed_stmts);
++ if (index == NULL)
++ return NULL;
++ for (int i = 0; i < flow_only_rdg->n_vertices; i++)
++ {
++ struct vertex *v = &(flow_only_rdg->vertices[i]);
++ if (RDGV_STMT (v) != seed_stmts[0])
++ continue;
++ struct graph_edge *e;
++ bool found_phi = false;
++ for (e = v->pred; e; e = e->pred_next)
++ {
++ struct vertex *v_phi = &(flow_only_rdg->vertices[e->src]);
++ gimple *phi_stmt = RDGV_STMT (v_phi);
++ if (gimple_code (phi_stmt) != GIMPLE_PHI
++ || gimple_phi_result (phi_stmt) != index)
++ continue;
++ if (!iv_check_phi_stmt (phi_stmt))
++ return NULL;
++ /* Find the inc expr in a succ of phi. */
++ bool found_inc = false;
++ check_phi_inc (v_phi, flow_only_rdg, found_inc);
++ if (!found_inc)
++ return NULL;
++ found_phi = true;
++ break;
++ }
++ if (!found_phi)
++ return NULL;
++ break;
++ }
++ return index;
++}
++
++/* Do not distribute the loop if vertices in ROOT_MAP have an antidependence
++ within FLOW_ONLY_RDG. */
++
++static bool
++check_no_dependency (struct graph *flow_only_rdg, bitmap root_map)
++{
++ bitmap_iterator bi;
++ unsigned ui;
++ auto_vec<unsigned, 16> visited_nodes;
++ auto_bitmap visited_map;
++ EXECUTE_IF_SET_IN_BITMAP (root_map, 0, ui, bi)
++ visited_nodes.safe_push (ui);
++ for (ui = 0; ui < visited_nodes.length (); ui++)
++ {
++ struct vertex *v = &(flow_only_rdg->vertices[visited_nodes[ui]]);
++ struct graph_edge *e;
++ for (e = v->succ; e; e = e->succ_next)
++ {
++ if (bitmap_bit_p (root_map, e->dest))
++ return false;
++ if (bitmap_bit_p (visited_map, e->dest))
++ continue;
++ visited_nodes.safe_push (e->dest);
++ bitmap_set_bit (visited_map, e->dest);
++ }
++ }
++ return true;
++}
++
++/* Find isomorphic stmts from GROUPED_LOADS in VINFO and make sure
++ there is no dependency among the stmts we found. */
++
++static unsigned
++get_cut_points (struct graph *flow_only_rdg, bitmap cut_points,
++ loop_vec_info vinfo)
++{
++ unsigned n_stmts = 0;
++
++ /* STMTS that may be CUT_POINTS. */
++ auto_vec<gimple *> stmts;
++ if (!find_isomorphic_stmts (vinfo, stmts))
++ {
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ fprintf (dump_file, "No temp array insertion: no isomorphic stmts"
++ " were found.\n");
++ return 0;
++ }
++
++ for (int i = 0; i < flow_only_rdg->n_vertices; i++)
++ {
++ if (stmts.contains (RDG_STMT (flow_only_rdg, i)))
++ bitmap_set_bit (cut_points, i);
++ }
++ n_stmts = bitmap_count_bits (cut_points);
++
++ bool succ = check_no_dependency (flow_only_rdg, cut_points);
++ if (!succ)
++ {
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ fprintf (dump_file, "No temp array inserted: data dependency"
++ " among isomorphic stmts.\n");
++ return 0;
++ }
++ return n_stmts;
++}
++
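++/* Duplicate the lhs of V's stmt, then insert after GSI a store of the new
++ ssa name into a fresh temp array of ARRAY_EXTENT elements indexed by IV,
++ followed by a load of the original lhs back from the array. Record the
++ array refs in TMP_ARRAY_VARS and the store in TRANSFORMED. */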
++static void
++build_temp_array (struct vertex *v, gimple_stmt_iterator &gsi,
++ poly_uint64 array_extent, tree iv,
++ hash_set<tree> *tmp_array_vars, vec<gimple *> *transformed)
++{
++ gimple *stmt = RDGV_STMT (v);
++ tree lhs = gimple_assign_lhs (stmt);
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ {
++ fprintf (dump_file, "original stmt:\t");
++ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS|TDF_MEMSYMS);
++ }
++ tree var_ssa = duplicate_ssa_name (lhs, stmt);
++ gimple_assign_set_lhs (stmt, var_ssa);
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ {
++ fprintf (dump_file, "changed to:\t");
++ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS | TDF_MEMSYMS);
++ }
++ gimple_set_uid (gsi_stmt (gsi), -1);
++ tree vect_elt_type = TREE_TYPE (lhs);
++ tree array_type = build_array_type_nelts (vect_elt_type, array_extent);
++ tree array = create_tmp_var (array_type);
++ tree array_ssa = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL);
++ tmp_array_vars->add (array_ssa);
++ gimple *store = gimple_build_assign (array_ssa, var_ssa);
++ tree new_vdef = make_ssa_name (gimple_vop (cfun), store);
++ gsi_insert_after (&gsi, store, GSI_NEW_STMT);
++ gimple_set_vdef (store, new_vdef);
++ transformed->safe_push (store);
++ gimple_set_uid (gsi_stmt (gsi), -1);
++ tree array_ssa2 = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL);
++ tmp_array_vars->add (array_ssa2);
++ gimple *load = gimple_build_assign (lhs, array_ssa2);
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ {
++ fprintf (dump_file, "insert stmt:\t");
++ print_gimple_stmt (dump_file, store, 0, TDF_VOPS|TDF_MEMSYMS);
++ fprintf (dump_file, " and stmt:\t");
++ print_gimple_stmt (dump_file, load, 0, TDF_VOPS|TDF_MEMSYMS);
++ }
++ gimple_set_vuse (load, new_vdef);
++ gsi_insert_after (&gsi, load, GSI_NEW_STMT);
++ gimple_set_uid (gsi_stmt (gsi), -1);
++}
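++
++/* As a sketch with invented SSA names, build_temp_array rewrites
++
++     c0_5 = _1 - _2;
++
++ into
++
++     c0_9 = _1 - _2;
++     X[i_7] = c0_9;
++     c0_5 = X[i_7];
++
++ so the store defines the temp array and the load re-establishes the
++ original SSA name, leaving existing uses of c0_5 untouched. */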
++
++/* Set bitmap PRODUCERS based on vec TRANSFORMED. */
++
++void
++loop_distribution::build_producers (loop_p loop, bitmap producers,
++ vec<gimple *> &transformed)
++{
++ auto_vec<gimple *, 10> stmts;
++ stmts_from_loop (loop, &stmts);
++ int i = 0;
++ gimple *stmt = NULL;
++
++ FOR_EACH_VEC_ELT (stmts, i, stmt)
++ gimple_set_uid (stmt, i);
++ i = 0;
++ FOR_EACH_VEC_ELT (transformed, i, stmt)
++ bitmap_set_bit (producers, stmt->uid);
++}
++
++/* Transform stmt
++
++ A = FOO (ARG_1);
++
++ to
++
++ STMT_1: A1 = FOO (ARG_1);
++ STMT_2: X[I] = A1;
++ STMT_3: A = X[I];
++
++ Producer is STMT_2, which defines the temp array, and consumer is
++ STMT_3, which uses the temp array. */
++
++void
++loop_distribution::do_insertion (loop_p loop, struct graph *flow_only_rdg,
++ tree iv, bitmap cut_points,
++ hash_set<tree> *tmp_array_vars,
++ bitmap producers)
++{
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ fprintf (dump_file, "=== do insertion ===\n");
++
++ auto_vec<gimple *> transformed;
++
++ /* Number of times the loop executes, used as the temp array extent. */
++ poly_uint64 array_extent
++ = tree_to_poly_uint64 (number_of_latch_executions (loop)) + 1;
++
++ basic_block *bbs = get_loop_body_in_custom_order (loop, this,
++ bb_top_order_cmp_r);
++
++ for (int i = 0; i < int (loop->num_nodes); i++)
++ {
++ basic_block bb = bbs[i];
++
++ /* Find all cut points in bb and transform them. */
++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
++ gsi_next (&gsi))
++ {
++ unsigned j = gimple_uid (gsi_stmt (gsi));
++ if (bitmap_bit_p (cut_points, j))
++ {
++ struct vertex *v = &(flow_only_rdg->vertices[j]);
++ build_temp_array (v, gsi, array_extent, iv, tmp_array_vars,
++ &transformed);
++ }
++ }
++ }
++ build_producers (loop, producers, transformed);
++ update_ssa (TODO_update_ssa);
++ free (bbs);
++}
++
++/* After temp array insertion, given stmts
++ STMT_1: M = FOO (ARG_1);
++ STMT_2: X[I] = M;
++ STMT_3: A = X[I];
++ STMT_2 is the producer, STMT_1 is its prev and STMT_3 is its next.
++ Replace M with A, and remove STMT_2 and STMT_3. */
++
++static void
++reset_gimple_assign (struct graph *flow_only_rdg, struct partition *partition,
++ gimple_stmt_iterator &gsi, int j)
++{
++ struct vertex *v = &(flow_only_rdg->vertices[j]);
++ gimple *stmt = RDGV_STMT (v);
++ gimple *prev = stmt->prev;
++ gimple *next = stmt->next;
++ tree n_lhs = gimple_assign_lhs (next);
++ gimple_assign_set_lhs (prev, n_lhs);
++ unlink_stmt_vdef (stmt);
++ if (partition)
++ bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi)));
++ gsi_remove (&gsi, true);
++ release_defs (stmt);
++ if (partition)
++ bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi)));
++ gsi_remove (&gsi, true);
++}
++
++void
++loop_distribution::remove_insertion (loop_p loop, struct graph *flow_only_rdg,
++ bitmap producers, struct partition *partition)
++{
++ basic_block *bbs = get_loop_body_in_custom_order (loop, this,
++ bb_top_order_cmp_r);
++ for (int i = 0; i < int (loop->num_nodes); i++)
++ {
++ basic_block bb = bbs[i];
++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
++ gsi_next (&gsi))
++ {
++ unsigned j = gimple_uid (gsi_stmt (gsi));
++ if (bitmap_bit_p (producers, j))
++ reset_gimple_assign (flow_only_rdg, partition, gsi, j);
++ }
++ }
++ update_ssa (TODO_update_ssa);
++ free (bbs);
++}
++
++/* Insert temp arrays if isomorphic computation exists. Temp arrays will be
++ regarded as SEED_STMTS for building partitions in subsequent processing. */
++
++bool
++loop_distribution::insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts,
++ hash_set<tree> *tmp_array_vars, bitmap producers)
++{
++ struct graph *flow_only_rdg = build_rdg (loop, NULL);
++ gcc_checking_assert (flow_only_rdg != NULL);
++ tree iv = get_iv_from_seed (flow_only_rdg, seed_stmts);
++ if (iv == NULL)
++ {
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ fprintf (dump_file, "Loop %d no temp array insertion: failed to get"
++ " iteration variable.\n", loop->num);
++ free_rdg (flow_only_rdg);
++ return false;
++ }
++ auto_bitmap cut_points;
++ loop_vec_info vinfo = loop_vec_info_for_loop (loop);
++ unsigned n_cut_points = get_cut_points (flow_only_rdg, cut_points, vinfo);
++ delete vinfo;
++ loop->aux = NULL;
++ if (n_cut_points == 0)
++ {
++ if (dump_file && (dump_flags & TDF_DETAILS))
++ fprintf (dump_file, "Loop %d no temp array insertion: no cut points"
++ " found.\n", loop->num);
++ free_rdg (flow_only_rdg);
++ return false;
++ }
++ do_insertion (loop, flow_only_rdg, iv, cut_points, tmp_array_vars, producers);
++ if (dump_enabled_p ())
++ {
++ dump_user_location_t loc = find_loop_location (loop);
++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion done:"
++ " %d temp arrays inserted in Loop %d.\n",
++ n_cut_points, loop->num);
++ }
++ free_rdg (flow_only_rdg);
++ return true;
++}
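++
++/* A rough source-level sketch of the overall effect, with placeholder
++ names f, g, X, a, x and y: a loop such as
++
++     for (i = 0; i < n; i++)
++       a[i] = f (x[i]) + g (y[i]);
++
++ becomes, after temp array insertion,
++
++     for (i = 0; i < n; i++)
++       {
++         X[i] = f (x[i]);
++         a[i] = X[i] + g (y[i]);
++       }
++
++ so that distribution can later split the X[i] producers into a loop of
++ their own. */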
++
++static bool find_seed_stmts_for_distribution (class loop *, vec<gimple *> *);
++
+ /* Distributes the code from LOOP in such a way that producer statements
+ are placed before consumer statements. Tries to separate only the
+ statements from STMTS into separate loops. Returns the number of
+@@ -2972,7 +4342,7 @@ loop_distribution::finalize_partitions (class loop *loop,
+
+ int
+ loop_distribution::distribute_loop (class loop *loop,
+- const vec<gimple *> &stmts,
++ vec<gimple *> &stmts,
+ control_dependences *cd, int *nb_calls, bool *destroy_p,
+ bool only_patterns_p)
+ {
+@@ -3021,6 +4391,33 @@ loop_distribution::distribute_loop (class loop *loop,
+ return 0;
+ }
+
++ /* Try to distribute LOOP if LOOP is simple enough and unable to be
++ vectorized. If LOOP has grouped loads, recursively find isomorphic
++ stmts and insert temp arrays, rebuild the RDG and call
++ find_seed_stmts_for_distribution to replace STMTS. */
++
++ hash_set<tree> tmp_array_vars;
++
++ /* STMTs that define those inserted TMP_ARRAYs. */
++ auto_bitmap producers;
++
++ /* New SEED_STMTS after insertion. */
++ auto_vec<gimple *> work_list;
++ bool insert_success = false;
++ if (may_insert_temp_arrays (loop, rdg, cd))
++ {
++ if (insert_temp_arrays (loop, stmts, &tmp_array_vars, producers))
++ {
++ if (find_seed_stmts_for_distribution (loop, &work_list))
++ {
++ insert_success = true;
++ }
++ else
++ remove_insertion (loop, rdg, producers, NULL);
++ rebuild_rdg (loop, rdg, cd);
++ }
++ }
++
+ data_reference_p dref;
+ for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
+ dref->aux = (void *) (uintptr_t) i;
+@@ -3029,7 +4426,10 @@ loop_distribution::distribute_loop (class loop *loop,
+ dump_rdg (dump_file, rdg);
+
+ auto_vec<struct partition *, 3> partitions;
+- rdg_build_partitions (rdg, stmts, &partitions);
++ if (work_list.length () > stmts.length ())
++ rdg_build_partitions (rdg, &work_list, &partitions);
++ else
++ rdg_build_partitions (rdg, &stmts, &partitions);
+
+ auto_vec<ddr_p> alias_ddrs;
+
+@@ -3101,7 +4501,7 @@ loop_distribution::distribute_loop (class loop *loop,
+ for (int j = i + 1;
+ partitions.iterate (j, &partition); ++j)
+ {
+- if (share_memory_accesses (rdg, into, partition))
++ if (share_memory_accesses (rdg, into, partition, &tmp_array_vars))
+ {
+ partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
+ partitions.unordered_remove (j);
+@@ -3151,7 +4551,7 @@ loop_distribution::distribute_loop (class loop *loop,
+ }
+ }
+
+- finalize_partitions (loop, &partitions, &alias_ddrs);
++ finalize_partitions (loop, &partitions, &alias_ddrs, producers);
+
+ /* If there is a reduction in all partitions make sure the last one
+ is not classified for builtin code generation. */
+@@ -3169,6 +4569,24 @@ loop_distribution::distribute_loop (class loop *loop,
+ }
+
+ nbp = partitions.length ();
++
++ /* If we have inserted TMP_ARRAYs but only one partition is left in
++ the succeeding processes, revert those inserted TMP_ARRAYs back to
++ the original version. */
++
++ if (nbp == 1 && insert_success)
++ {
++ struct partition *partition = NULL;
++ partitions.iterate (0, &partition);
++ remove_insertion (loop, rdg, producers, partition);
++ if (dump_enabled_p ())
++ {
++ dump_user_location_t loc = find_loop_location (loop);
++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion removed:"
++ " unable to distribute loop %d.\n", loop->num);
++ }
++ }
++
+ if (nbp == 0
+ || (nbp == 1 && !partition_builtin_p (partitions[0]))
+ || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
+diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
+index 04e68f621..aae7f62f3 100644
+--- a/gcc/tree-vect-data-refs.cc
++++ b/gcc/tree-vect-data-refs.cc
+@@ -2791,6 +2791,9 @@ vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
+ DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
+
+ DR_GROUP_SIZE (stmt_info) = groupsize;
++
++ DR_GROUP_SLP_TRANSPOSE (stmt_info) = false;
++
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+@@ -2820,6 +2823,20 @@ vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
+ DR_GROUP_GAP (stmt_info));
+ }
+
++ /* SLP: create an SLP data structure for every interleaving group of
++ loads for further analysis in vect_analyse_slp. */
++ if (DR_IS_READ (dr) && !slp_impossible)
++ {
++ if (loop_vinfo)
++ {
++ LOOP_VINFO_GROUPED_LOADS (loop_vinfo).safe_push (stmt_info);
++ }
++ if (bb_vinfo)
++ {
++ BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (stmt_info);
++ }
++ }
++
+ /* SLP: create an SLP data structure for every interleaving group of
+ stores for further analysis in vect_analyse_slp. */
+ if (DR_IS_WRITE (dr) && !slp_impossible)
+@@ -5636,6 +5653,226 @@ vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
+ }
+ }
+
++/* Encoding the PERM_MASK_FIRST. */
++
++static void
++vect_indices_encoding_first (tree vectype, unsigned int array_num,
++ tree &perm_mask_high_first,
++ tree &perm_mask_low_first)
++{
++ unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
++ vec_perm_builder sel (nelt, nelt, 1);
++ sel.quick_grow (nelt);
++ unsigned int group_num = nelt / array_num;
++ unsigned int index = 0;
++ unsigned int array = 0;
++ unsigned int group = 0;
++
++ /* The encoding has 1 pattern in the first stage. */
++ for (array = 0; array < array_num / 2; array++)
++ {
++ for (group = 0; group < group_num * 2; group++)
++ {
++ sel[index++] = array + array_num * group;
++ }
++ }
++ vec_perm_indices indices (sel, 2, nelt);
++ perm_mask_high_first = vect_gen_perm_mask_checked (vectype, indices);
++
++ index = 0;
++ for (array = array_num / 2; array < array_num; array++)
++ {
++ for (group = 0; group < group_num * 2; group++)
++ {
++ sel[index++] = array + array_num * group;
++ }
++ }
++ indices.new_vector (sel, 2, nelt);
++ perm_mask_low_first = vect_gen_perm_mask_checked (vectype, indices);
++}
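++
++/* A worked example: assuming nelt = 8 and array_num = 4 (so group_num = 2),
++ the loops above should produce
++
++     perm_mask_high_first = {0, 4, 8, 12, 1, 5, 9, 13}
++     perm_mask_low_first  = {2, 6, 10, 14, 3, 7, 11, 15}
++
++ where indices >= nelt select elements from the second input vector. */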
++
++/* Encoding the PERM_MASK. */
++
++static void
++vect_indices_encoding (tree vectype, unsigned int array_num,
++ tree &perm_mask_high, tree &perm_mask_low)
++{
++ unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
++ vec_perm_builder sel (nelt, nelt, 1);
++ sel.quick_grow (nelt);
++ unsigned int group_num = nelt / array_num;
++ unsigned int index = 0;
++ unsigned int array = 0;
++ unsigned int group = 0;
++
++ /* The encoding has 2 patterns in the following stages. */
++ for (array = 0; array < array_num / 2; array++)
++ {
++ for (group = 0; group < group_num; group++)
++ {
++ sel[index++] = group + group_num * array;
++ }
++ for (group = 0; group < group_num; group++)
++ {
++ sel[index++] = nelt + group + group_num * array;
++ }
++ }
++ vec_perm_indices indices (sel, 2, nelt);
++ perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
++
++ index = 0;
++ for (array = array_num / 2; array < array_num; array++)
++ {
++ for (group = 0; group < group_num; group++)
++ {
++ sel[index++] = group + group_num * array;
++ }
++ for (group = 0; group < group_num; group++)
++ {
++ sel[index++] = nelt + group + group_num * array;
++ }
++ }
++ indices.new_vector (sel, 2, nelt);
++ perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
++}
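++
++/* A worked example: assuming nelt = 8 and array_num = 4 (so group_num = 2),
++ the loops above should produce
++
++     perm_mask_high = {0, 1, 8, 9, 2, 3, 10, 11}
++     perm_mask_low  = {4, 5, 12, 13, 6, 7, 14, 15}
++
++ i.e. runs of group_num elements are interleaved between the two input
++ vectors. */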
++
++/* Function vect_transpose_store_chain.
++
++ Given a chain of interleaved stores in DR_CHAIN, of LENGTH that must be a
++ power of 2, and ARRAY_NUM. Generate interleave_high/low stmts to reorder
++ the data correctly for the stores. Return the final references for stores
++ in RESULT_CHAIN. This function is similar to vect_permute_store_chain ();
++ we interleave the contents of the vectors in their order.
++
++ E.g., LENGTH is 4, the scalar type is short (i.e., VF is 8) and ARRAY_NUM
++ is 4. That is, the input is 4 vectors each containing 8 elements.
++ And 2 (VF / ARRAY_NUM) of the 8 elements come from the same array. We
++ interleave the contents of the four vectors in their order. We assign a
++ number to each element; the input sequence is:
++
++ 1st vec: 0 1 2 3 4 5 6 7
++ 2nd vec: 8 9 10 11 12 13 14 15
++ 3rd vec: 16 17 18 19 20 21 22 23
++ 4th vec: 24 25 26 27 28 29 30 31
++
++ The output sequence should be:
++
++ 1st vec: 0 4 8 12 16 20 24 28
++ 2nd vec: 1 5 9 13 17 21 25 29
++ 3rd vec: 2 6 10 14 18 22 26 30
++ 4th vec: 3 7 11 15 19 23 27 31
++
++ In our example,
++ we get 2 (VF / ARRAY_NUM) elements together in every vector:
++
++ I1: 0 4 1 5 2 6 3 7
++ I2: 8 12 9 13 10 14 11 15
++ I3: 16 20 17 21 18 22 19 23
++ I4: 24 28 25 29 26 30 27 31
++
++ Then, we use interleave_high/low instructions to create such output.
++ Every 2 (VF / ARRAY_NUM) elements are regarded as a whole. The permutation
++ is done in log2 (LENGTH) stages.
++
++ I1: interleave_high (1st vec, 3rd vec)
++ I2: interleave_low (1st vec, 3rd vec)
++ I3: interleave_high (2nd vec, 4th vec)
++ I4: interleave_low (2nd vec, 4th vec)
++
++ The first stage of the sequence should be:
++
++ I1: 0 4 16 20 1 5 17 21
++ I2: 2 6 18 22 3 7 19 23
++ I3: 8 12 24 28 9 13 25 29
++ I4: 10 14 26 30 11 15 27 31
++
++ The sequence after the following stage, i.e. the final result, is:
++
++ I1: 0 4 8 12 16 20 24 28
++ I2: 1 5 9 13 17 21 25 29
++ I3: 2 6 10 14 18 22 26 30
++ I4: 3 7 11 15 19 23 27 31. */
++
++void
++vect_transpose_store_chain (vec_info *vinfo, vec<tree> dr_chain,
++ unsigned int length, unsigned int array_num,
++ stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
++ vec<tree> *result_chain)
++{
++ gimple *perm_stmt = NULL;
++ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
++ tree perm_mask_low_first = NULL;
++ tree perm_mask_high_first = NULL;
++ tree perm_mask_low = NULL;
++ tree perm_mask_high = NULL;
++ unsigned int log_length = exact_log2 (length);
++
++ /* Only a power-of-2 LENGTH is supported. */
++ gcc_assert (pow2p_hwi (length));
++
++ /* The encoding has 2 types, one for the grouped pattern in the first stage,
++ another for the interleaved patterns in the following stages. */
++ gcc_assert (array_num != 0);
++
++ /* Create grouped stmt (in the first stage):
++ group = nelt / array_num;
++ high_first = VEC_PERM_EXPR <vect1, vect2,
++ {0, array_num, 2*array_num, ..., (2*group-1)*array_num,
++ 1, 1+array_num, 1+2*array_num, ..., 1+(2*group-1)*array_num,
++ ...,
++ array_num/2-1, (array_num/2-1)+array_num, ...,
++ (array_num/2-1)+(2*group-1)*array_num}>
++ low_first = VEC_PERM_EXPR <vect1, vect2,
++ {array_num/2, array_num/2+array_num, array_num/2+2*array_num,
++ ..., array_num/2+(2*group-1)*array_num,
++ array_num/2+1, array_num/2+1+array_num,
++ ..., array_num/2+1+(2*group-1)*array_num,
++ ...,
++ array_num-1, array_num-1+array_num,
++ ..., array_num-1+(2*group-1)*array_num}> */
++ vect_indices_encoding_first (vectype, array_num, perm_mask_high_first,
++ perm_mask_low_first);
++
++ /* Create interleaving stmt (in the following stages):
++ high = VEC_PERM_EXPR <vect1, vect2, {0, 1, ..., group-1,
++ nelt, nelt+1, ..., nelt+group-1,
++ group, group+1, ..., 2*group-1,
++ nelt+group, nelt+group+1, ..., nelt+2*group-1,
++ ...}>
++ low = VEC_PERM_EXPR <vect1, vect2,
++ {nelt/2, nelt/2+1, ..., nelt/2+group-1,
++ nelt*3/2, nelt*3/2+1, ..., nelt*3/2+group-1,
++ nelt/2+group, nelt/2+group+1, ..., nelt/2+2*group-1,
++ nelt*3/2+group, nelt*3/2+group+1, ..., nelt*3/2+2*group-1,
++ ...}> */
++ vect_indices_encoding (vectype, array_num, perm_mask_high, perm_mask_low);
++
++ for (unsigned int perm_time = 0; perm_time < log_length; perm_time++)
++ {
++ for (unsigned int index = 0; index < length / 2; index++)
++ {
++ tree vect1 = dr_chain[index];
++ tree vect2 = dr_chain[index + length / 2];
++
++ tree high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
++ perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, vect2,
++ perm_time == 0 ? perm_mask_high_first
++ : perm_mask_high);
++ vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
++ (*result_chain)[2 * index] = high;
++
++ tree low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
++ perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, vect2,
++ perm_time == 0 ? perm_mask_low_first
++ : perm_mask_low);
++ vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
++ (*result_chain)[2 * index + 1] = low;
++ }
++ memcpy (dr_chain.address (), result_chain->address (),
++ length * sizeof (tree));
++ }
++}
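++
++/* For example, with LENGTH = 4 the loop above runs log2 (4) = 2 stages:
++ stage 0 applies the *_first masks and stage 1 the interleave masks,
++ which reproduces the I1..I4 sequences shown in the function comment. */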
++
+ /* Function vect_setup_realignment
+
+ This function is called when vectorizing an unaligned load using
+diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
+index 3435f9378..f296e9415 100644
+--- a/gcc/tree-vect-loop.cc
++++ b/gcc/tree-vect-loop.cc
+@@ -2856,7 +2856,7 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
+ loop_vec_info main_loop_vinfo,
+ const vector_modes &vector_modes, unsigned &mode_i,
+ machine_mode &autodetected_vector_mode,
+- bool &fatal)
++ bool &fatal, bool result_only_p)
+ {
+ loop_vec_info loop_vinfo
+ = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
+@@ -2865,6 +2865,8 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
+ loop_vinfo->vector_mode = vector_mode;
+ unsigned int suggested_unroll_factor = 1;
+
++ /* Loop_vinfo for loop-distribution pass. */
++ opt_loop_vec_info fail_loop_vinfo = opt_loop_vec_info::success (NULL);
+ /* Run the main analysis. */
+ opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
+ &suggested_unroll_factor);
+@@ -2933,7 +2935,21 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
+
+ if (!res)
+ {
+- delete loop_vinfo;
++ /* If the current analysis shows LOOP is unable to be vectorized,
++ loop_vinfo will be deleted. If LOOP is under ldist analysis, back it
++ up before it is deleted, and return it if all modes are analyzed and
++ it still fails to vectorize. */
++ if (result_only_p && (mode_i == vector_modes.length ()
++ || autodetected_vector_mode == VOIDmode))
++ {
++ fail_loop_vinfo = opt_loop_vec_info::success (loop_vinfo);
++ loop->aux = (loop_vec_info) fail_loop_vinfo;
++ }
++ else
++ {
++ delete loop_vinfo;
++ }
+ if (fatal)
+ gcc_checking_assert (main_loop_vinfo == NULL);
+ return opt_loop_vec_info::propagate_failure (res);
+@@ -2946,9 +2962,11 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
+
+ Apply a set of analyses on LOOP, and create a loop_vec_info struct
+ for it. The different analyses will record information in the
+- loop_vec_info struct. */
++ loop_vec_info struct. When RESULT_ONLY_P is true, quit the analysis
++ if the loop is vectorizable; otherwise, do not delete the vinfo. */
+ opt_loop_vec_info
+-vect_analyze_loop (class loop *loop, vec_info_shared *shared)
++vect_analyze_loop (class loop *loop, vec_info_shared *shared,
++ bool result_only_p)
+ {
+ DUMP_VECT_SCOPE ("analyze_loop_nest");
+
+@@ -2996,6 +3014,12 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
+ && !unlimited_cost_model (loop));
+ machine_mode autodetected_vector_mode = VOIDmode;
+ opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
++ /* Loop_vinfo for loop-distribution pass. */
++ opt_loop_vec_info fail_loop_vinfo = opt_loop_vec_info::success (NULL);
++ if (result_only_p)
++ {
++ vect_slp_init ();
++ }
+ unsigned int mode_i = 0;
+ unsigned HOST_WIDE_INT simdlen = loop->simdlen;
+
+@@ -3019,10 +3043,16 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
+ opt_loop_vec_info loop_vinfo
+ = vect_analyze_loop_1 (loop, shared, &loop_form_info,
+ NULL, vector_modes, mode_i,
+- autodetected_vector_mode, fatal);
++ autodetected_vector_mode, fatal, result_only_p);
+ if (fatal)
+ break;
+
++ if (result_only_p && (mode_i == vector_modes.length ()
++ || autodetected_vector_mode == VOIDmode))
++ {
++ return loop_vinfo;
++ }
++
+ if (loop_vinfo)
+ {
+ /* Analysis has been successful so update the VF value. The
+@@ -3132,7 +3162,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
+ = vect_analyze_loop_1 (loop, shared, &loop_form_info,
+ first_loop_vinfo,
+ vector_modes, mode_i,
+- autodetected_vector_mode, fatal);
++ autodetected_vector_mode, fatal, result_only_p);
+ if (fatal)
+ break;
+
+diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
+index e1bcab0f7..c0c15773d 100644
+--- a/gcc/tree-vect-patterns.cc
++++ b/gcc/tree-vect-patterns.cc
+@@ -5632,8 +5632,8 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
+ internal functions. */
+ { vect_recog_gather_scatter_pattern, "gather_scatter" },
+ { vect_recog_mask_conversion_pattern, "mask_conversion" },
+- { vect_recog_widen_plus_pattern, "widen_plus" },
+- { vect_recog_widen_minus_pattern, "widen_minus" },
++ // { vect_recog_widen_plus_pattern, "widen_plus" },
++ // { vect_recog_widen_minus_pattern, "widen_minus" },
+ };
+
+ const unsigned int NUM_PATTERNS = ARRAY_SIZE (vect_vect_recog_func_ptrs);
+diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
+index af477c31a..6cbf8085f 100644
+--- a/gcc/tree-vect-slp.cc
++++ b/gcc/tree-vect-slp.cc
+@@ -49,6 +49,8 @@ along with GCC; see the file COPYING3. If not see
+ #include "tree-eh.h"
+ #include "tree-cfg.h"
+ #include "alloc-pool.h"
++#include "print-tree.h"
++#include "gimple-pretty-print.h"
+
+ static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
+ slp_tree, stmt_vector_for_cost *);
+@@ -994,6 +996,21 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
+ }
+
+ gcc_assert (vectype);
++ if (!STMT_VINFO_VECTYPE (stmt_info))
++ STMT_VINFO_VECTYPE (stmt_info) = vectype;
++ if (dump_file)
++ {
++ fprintf (dump_file, "vect_build_slp_tree_1: %p\n", stmt_info);
++ print_gimple_stmt (dump_file, stmt, 0);
++ fprintf (dump_file, "vect_build_slp_tree_1: vectype=");
++ if (vectype)
++ print_generic_expr (dump_file, vectype);
++ fprintf (dump_file, "\n");
++ fprintf (dump_file, "internal vectype=");
++ if (STMT_VINFO_VECTYPE (stmt_info))
++ print_generic_expr (dump_file, STMT_VINFO_VECTYPE (stmt_info));
++ fprintf (dump_file, "\n");
++ }
+
+ gcall *call_stmt = dyn_cast <gcall *> (stmt);
+ if (call_stmt)
+@@ -1575,10 +1592,10 @@ vect_build_slp_tree (vec_info *vinfo,
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP discovery for node %p succeeded\n", res);
+ gcc_assert (res_ == res);
+- res->max_nunits = this_max_nunits;
++ res_->max_nunits = this_max_nunits;
+ vect_update_max_nunits (max_nunits, this_max_nunits);
+ /* Keep a reference for the bst_map use. */
+- SLP_TREE_REF_COUNT (res)++;
++ SLP_TREE_REF_COUNT (res_)++;
+ }
+ return res_;
+ }
+@@ -3190,8 +3207,10 @@ vect_build_slp_instance (vec_info *vinfo,
+
+ /* For basic block SLP, try to break the group up into multiples of
+ a vector size. */
++ bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
+ if (is_a <bb_vec_info> (vinfo)
+- && (i > 1 && i < group_size))
++ && (i > 1 && i < group_size)
++ && !bb_vinfo->transposed)
+ {
+ tree scalar_type
+ = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
+@@ -3301,84 +3320,1034 @@ vect_analyze_slp_instance (vec_info *vinfo,
+ scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
+ while (next_info)
+ {
+- scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
+- next_info = DR_GROUP_NEXT_ELEMENT (next_info);
++ scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
++ next_info = DR_GROUP_NEXT_ELEMENT (next_info);
++ }
++ }
++ else if (kind == slp_inst_kind_reduc_chain)
++ {
++ /* Collect the reduction stmts and store them in scalar_stmts. */
++ scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
++ while (next_info)
++ {
++ scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
++ next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
++ }
++ /* Mark the first element of the reduction chain as reduction to properly
++ transform the node. In the reduction analysis phase only the last
++ element of the chain is marked as reduction. */
++ STMT_VINFO_DEF_TYPE (stmt_info)
++ = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
++ STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
++ = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
++ }
++ else if (kind == slp_inst_kind_ctor)
++ {
++ tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
++ tree val;
++ scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
++ FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
++ {
++ stmt_vec_info def_info = vinfo->lookup_def (val);
++ def_info = vect_stmt_to_vectorize (def_info);
++ scalar_stmts.quick_push (def_info);
++ }
++ if (dump_enabled_p ())
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "Analyzing vectorizable constructor: %G\n",
++ stmt_info->stmt);
++ }
++ else if (kind == slp_inst_kind_reduc_group)
++ {
++ /* Collect reduction statements. */
++ const vec<stmt_vec_info> &reductions
++ = as_a <loop_vec_info> (vinfo)->reductions;
++ scalar_stmts.create (reductions.length ());
++ for (i = 0; reductions.iterate (i, &next_info); i++)
++ if ((STMT_VINFO_RELEVANT_P (next_info)
++ || STMT_VINFO_LIVE_P (next_info))
++ /* ??? Make sure we didn't skip a conversion around a reduction
++ path. In that case we'd have to reverse engineer that conversion
++ stmt following the chain using reduc_idx and from the PHI
++ using reduc_def. */
++ && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
++ scalar_stmts.quick_push (next_info);
++ /* If less than two were relevant/live there's nothing to SLP. */
++ if (scalar_stmts.length () < 2)
++ return false;
++ }
++ else
++ gcc_unreachable ();
++
++ vec<stmt_vec_info> roots = vNULL;
++ if (kind == slp_inst_kind_ctor)
++ {
++ roots.create (1);
++ roots.quick_push (stmt_info);
++ }
++ /* Build the tree for the SLP instance. */
++ bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
++ roots,
++ max_tree_size, limit, bst_map,
++ kind == slp_inst_kind_store
++ ? stmt_info : NULL);
++ if (!res)
++ roots.release ();
++
++ /* ??? If this is slp_inst_kind_store and the above succeeded here's
++ where we should do store group splitting. */
++
++ return res;
++}
++
++/* Return true if STORE_ELEM is a constant assignment. */
++
++static inline bool
++is_const_assign (stmt_vec_info store_elem)
++{
++ if (store_elem == NULL)
++ {
++ gcc_unreachable ();
++ }
++ gimple *stmt = store_elem->stmt;
++ gimple_rhs_class rhs_class = gimple_assign_rhs_class (stmt);
++ return rhs_class == GIMPLE_SINGLE_RHS
++ && TREE_CONSTANT (gimple_assign_rhs1 (stmt));
++}
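++
++/* E.g. a store such as a[i] = 5 is a const assign (a GIMPLE_SINGLE_RHS
++ whose rhs1 is constant), whereas a[i] = b[i] - c[i] is not; the names
++ are illustrative only. */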
++
++/* Push inits to INNERMOST_INITS and check const assign. */
++
++static bool
++record_innermost (vec<tree> &innermost_inits,
++ vec<tree> &innermost_offsets,
++ stmt_vec_info stmt_vinfo)
++{
++ if (!stmt_vinfo)
++ {
++ return false;
++ }
++ stmt_vec_info next_info = stmt_vinfo;
++ while (next_info)
++ {
++ /* No need to vectorize constant assign in a transposed version. */
++ if (is_const_assign (next_info))
++ {
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "no need to vectorize, store is const assign: %G",
++ next_info->stmt);
++ }
++ return false;
++ }
++ innermost_inits.safe_push (STMT_VINFO_DR_INIT (next_info));
++ innermost_offsets.safe_push (STMT_VINFO_DR_OFFSET (next_info));
++ next_info = DR_GROUP_NEXT_ELEMENT (next_info);
++ }
++ return true;
++}
++
++/* Compare inits to INNERMOST_INITS; return FALSE if the inits do not match
++ the first grouped store. And check for const assigns meanwhile. */
++
++static bool
++compare_innermost (const vec<tree> &innermost_inits,
++ const vec<tree> &innermost_offsets,
++ stmt_vec_info stmt_vinfo)
++{
++ if (!stmt_vinfo || innermost_inits.length () != stmt_vinfo->size)
++ {
++ return false;
++ }
++ stmt_vec_info next_info = stmt_vinfo;
++ unsigned int i = 0;
++ while (next_info)
++ {
++ if (is_const_assign (next_info))
++ {
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "no need to vectorize, store is const "
++ "assign: %G", next_info->stmt);
++ }
++ return false;
++ }
++ if (innermost_inits[i] != STMT_VINFO_DR_INIT (next_info)
++ || innermost_offsets[i] != STMT_VINFO_DR_OFFSET (next_info))
++ {
++ return false;
++ }
++ next_info = DR_GROUP_NEXT_ELEMENT (next_info);
++ i++;
++ }
++ return true;
++}
++
++/* Return true if GRP1 and GRP2 are in the same basic block. */
++
++static bool
++check_same_bb (stmt_vec_info grp1, stmt_vec_info grp2)
++{
++ return grp1->stmt->bb->index == grp2->stmt->bb->index;
++}
++
++/* Check if grouped stores are of the same type.
++ Input: T1/T2 = TREE_TYPE (gimple_assign_lhs (first_element->stmt)).
++ Output: 0 if same, 1 or -1 otherwise. */
++
++static int
++tree_type_cmp (const tree t1, const tree t2)
++{
++ gcc_checking_assert (t1 != NULL && t2 != NULL);
++ if (t1 != t2)
++ {
++ if (TREE_CODE (t1) != TREE_CODE (t2))
++ {
++ return TREE_CODE (t1) > TREE_CODE (t2) ? 1 : -1;
++ }
++ if (TYPE_UNSIGNED (t1) != TYPE_UNSIGNED (t2))
++ {
++ return TYPE_UNSIGNED (t1) > TYPE_UNSIGNED (t2) ? 1 : -1;
++ }
++ if (TYPE_PRECISION (t1) != TYPE_PRECISION (t2))
++ {
++ return TYPE_PRECISION (t1) > TYPE_PRECISION (t2) ? 1 : -1;
++ }
++ }
++ return 0;
++}
++
++/* Check if 2 grouped stores are of the same type, so that we can analyze
++ them in a transpose group. */
++static int
++check_same_store_type (stmt_vec_info grp1, stmt_vec_info grp2)
++{
++ if (grp1 == grp2)
++ {
++ return 0;
++ }
++ if (grp1->size != grp2->size)
++ {
++ return grp1->size > grp2->size ? 1 : -1;
++ }
++ tree lhs1 = gimple_assign_lhs (grp1->stmt);
++ tree lhs2 = gimple_assign_lhs (grp2->stmt);
++ if (TREE_CODE (lhs1) != TREE_CODE (lhs2))
++ {
++ return TREE_CODE (lhs1) > TREE_CODE (lhs2) ? 1 : -1;
++ }
++ tree grp_type1 = TREE_TYPE (gimple_assign_lhs (grp1->stmt));
++ tree grp_type2 = TREE_TYPE (gimple_assign_lhs (grp2->stmt));
++ int cmp = tree_type_cmp (grp_type1, grp_type2);
++ return cmp;
++}
++
++/* Sort grouped stores according to group_size and store_type.
++ Output: 0 if same, 1 if GRP1 > GRP2, -1 otherwise. */
++
++static int
++grouped_store_cmp (const void *grp1_, const void *grp2_)
++{
++ stmt_vec_info grp1 = *(stmt_vec_info *)const_cast<void *>(grp1_);
++ stmt_vec_info grp2 = *(stmt_vec_info *)const_cast<void *>(grp2_);
++ return check_same_store_type (grp1, grp2);
++}
++
++/* Transposing is based on permutation in registers. Permutation requires
++ the vector length to be a power of 2 and to satisfy the vector mode. */
++
++static inline bool
++check_filling_reg (stmt_vec_info current_element)
++{
++ if (current_element->size == 0)
++ {
++ return false;
++ }
++ /* If the gimple STMT was already vectorized in the vect pass, transpose
++ analysis cannot be conducted; skip it. */
++ bool lhs_vectorized
++ = TREE_CODE (TREE_TYPE (gimple_get_lhs (current_element->stmt)))
++ == VECTOR_TYPE;
++ bool rhs_vectorized
++ = TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (current_element->stmt)))
++ == VECTOR_TYPE;
++ if (lhs_vectorized || rhs_vectorized)
++ {
++ return false;
++ }
++ unsigned int store_precision
++ = TYPE_PRECISION (TREE_TYPE (gimple_get_lhs (current_element->stmt)));
++ auto_vector_modes vector_modes;
++ targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
++ unsigned min_mode_size = -1u;
++ for (unsigned i = 0; i < vector_modes.length (); i++)
++ {
++ unsigned mode_bit_size = (GET_MODE_BITSIZE (vector_modes[i])).coeffs[0];
++ min_mode_size = mode_bit_size < min_mode_size
++ ? mode_bit_size : min_mode_size;
++ }
++ return store_precision != 0
++ && pow2p_hwi (current_element->size)
++ && (current_element->size * store_precision % min_mode_size == 0);
++}
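++
++/* As a numeric sketch, assuming the smallest autovectorizable mode is 64
++ bits (e.g. V8QI) and the group stores unsigned chars (precision 8), a
++ group of size 8 fills a register exactly: 8 is a power of two and
++ 8 * 8 % 64 == 0.  A group of size 4 would be rejected, since
++ 4 * 8 % 64 != 0. */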
++
++/* Check if the previous groups are suitable for transposing; if not, set
++ their group numbers to -1, decrement GRP_NUM, and clear CURRENT_GROUPS.
++ Otherwise, just clear CURRENT_GROUPS. */
++
++static void
++check_and_clear_groups (vec<stmt_vec_info> &current_groups,
++ unsigned int &grp_num)
++{
++ stmt_vec_info first_element;
++ if (current_groups.length () == 1
++ || (current_groups.length () != 0
++ && !pow2p_hwi (current_groups.length ())))
++ {
++ while (current_groups.length () != 0)
++ {
++ first_element = current_groups.pop ();
++ first_element->group_number = -1;
++ }
++ grp_num--;
++ }
++ else
++ {
++ while (current_groups.length ())
++ {
++ current_groups.pop ();
++ }
++ }
++}
++
++/* Make sure that transposed SLP vectorization is conducted only if the
++ grouped stores are one-dimensional array refs. */
++
++static bool
++is_store_one_dim_array (gimple *stmt)
++{
++ tree op = gimple_get_lhs (stmt);
++ if (TREE_CODE (op) != ARRAY_REF)
++ return false;
++ return TREE_OPERAND_LENGTH (op) > 0
++ && TREE_OPERAND_LENGTH (TREE_OPERAND (op, 0)) == 0;
++}
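++
++/* E.g. a store a[i] = x qualifies, since the base operand of the ARRAY_REF
++ is a bare declaration, while s.arr[i] = x and a[i][j] = x do not, their
++ bases being a COMPONENT_REF and an ARRAY_REF respectively. */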
++
++/* Set grouped_stores with similar MEM_REFs to the same group and mark their
++ grp_num. Groups with the same grp_num constitute the minimum unit for
++ transpose analysis. Return the number of such units. */
++
++static unsigned
++vect_prepare_transpose (bb_vec_info bb_vinfo)
++{
++ stmt_vec_info current_element = NULL;
++ stmt_vec_info first_element = NULL;
++ unsigned int i = 0;
++ unsigned int grp_num = 0;
++ /* Use arrays to record MEM_REF data in different GROUPED_STORES. */
++ auto_vec<tree> innermost_inits;
++ auto_vec<tree> innermost_offsets;
++
++ /* A set of stmt_vec_info with same store type. Analyze them if their size
++ is suitable to transpose. */
++ auto_vec<stmt_vec_info> current_groups;
++
++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, current_element)
++ {
++ /* Compare the current grouped_store to the first one if first_element
++ exists; push current_element to current_groups if they are similar in
++ the innermost behavior of their MEM_REFs. */
++ if (first_element != NULL
++ && !check_same_store_type (first_element, current_element)
++ && compare_innermost (innermost_inits, innermost_offsets,
++ current_element)
++ && check_same_bb (first_element, current_element))
++ {
++ current_groups.safe_push (current_element);
++ current_element->group_number = grp_num;
++ /* If current_element is the last element in grouped_stores, continue
++ will exit the loop and leave the last group unanalyzed. */
++ if (i == bb_vinfo->grouped_stores.length () - 1)
++ {
++ check_and_clear_groups (current_groups, grp_num);
++ }
++ continue;
++ }
++ check_and_clear_groups (current_groups, grp_num);
++ innermost_inits.release ();
++ innermost_offsets.release ();
++ /* Beginning of a new group: analyze whether its members can constitute
++ a unit for transpose analysis. */
++ first_element = NULL;
++ if (is_store_one_dim_array (current_element->stmt)
++ && check_filling_reg (current_element)
++ && record_innermost (innermost_inits, innermost_offsets,
++ current_element))
++ {
++ first_element = current_element;
++ current_groups.safe_push (current_element);
++ current_element->group_number = ++grp_num;
++ if (i == bb_vinfo->grouped_stores.length () - 1)
++ {
++ check_and_clear_groups (current_groups, grp_num);
++ }
++ continue;
++ }
++ current_element->group_number = -1;
++ }
++ return grp_num;
++}
++
++/* Return a flag indicating whether grouped stores may be transposed before
++ building the SLP tree. */
++
++static bool
++vect_may_transpose (bb_vec_info bb_vinfo)
++{
++ if (targetm.vectorize.vec_perm_const == NULL)
++ {
++ return false;
++ }
++
++ if (bb_vinfo->grouped_stores.length () < 2)
++ {
++ return false;
++ }
++
++ DUMP_VECT_SCOPE ("analyze if grouped stores may transpose to slp");
++ /* Sort grouped_stores according to size and type for function
++ vect_prepare_transpose (). */
++ bb_vinfo->grouped_stores.qsort (grouped_store_cmp);
++
++ int groups = vect_prepare_transpose (bb_vinfo);
++ BB_VINFO_TRANS_GROUPS (bb_vinfo) = groups;
++ if (dump_enabled_p ())
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "%d groups to analyze transposed slp.\n", groups);
++ return groups != 0;
++}
++
++/* Get the base address of STMT_INFO. */
++
++static tree
++get_op_base_address (stmt_vec_info stmt_info)
++{
++ struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
++ tree op = DR_BASE_ADDRESS (dr);
++ while (TREE_OPERAND_LENGTH (op) > 0)
++ {
++ op = TREE_OPERAND (op, 0);
++ }
++ return op;
++}
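++
++/* E.g. if DR_BASE_ADDRESS is &a, the loop above strips the ADDR_EXPR and
++ returns the declaration a, so two references into the same array compare
++ equal by base. */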
++
++/* Compare the UIDs of the two stmt_infos STMTINFO_A and STMTINFO_B,
++ sorting them in ascending order. */
++
++static int
++dr_group_cmp (const void *stmtinfo_a_, const void *stmtinfo_b_)
++{
++ stmt_vec_info stmtinfo_a
++ = *(stmt_vec_info *) const_cast<void *> (stmtinfo_a_);
++ stmt_vec_info stmtinfo_b
++ = *(stmt_vec_info *) const_cast<void *> (stmtinfo_b_);
++
++ /* Stabilize sort. */
++ if (stmtinfo_a == stmtinfo_b)
++ {
++ return 0;
++ }
++ return gimple_uid (stmtinfo_a->stmt) < gimple_uid (stmtinfo_b->stmt) ? -1 : 1;
++}
++
++/* Find the first elements of the grouped loads which need to be merged. */
++
++static void
++vect_slp_grouped_load_find (bb_vec_info bb_vinfo, vec<bool> &visited,
++ vec<stmt_vec_info> &res)
++{
++ unsigned int i = 0;
++ stmt_vec_info merge_first_element = NULL;
++ stmt_vec_info first_element = NULL;
++ tree opa = NULL;
++ unsigned int grp_size_a = 0;
++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, i, first_element)
++ {
++ if (visited[i])
++ {
++ continue;
++ }
++ if (!STMT_VINFO_GROUPED_ACCESS (first_element)
++ || !pow2p_hwi (DR_GROUP_SIZE (first_element)))
++ {
++ /* Non-conforming grouped load should be grouped separately. */
++ if (merge_first_element == NULL)
++ {
++ visited[i] = true;
++ res.safe_push (first_element);
++ return;
++ }
++ }
++ if (merge_first_element == NULL)
++ {
++ merge_first_element = first_element;
++ opa = get_op_base_address (first_element);
++ grp_size_a = DR_GROUP_SIZE (first_element);
++ res.safe_push (first_element);
++ visited[i] = true;
++ continue;
++ }
++
++ /* If the two first elements have the same base address and group size,
++ the two grouped loads need to be merged. */
++ tree opb = get_op_base_address (first_element);
++ unsigned int grp_size_b = DR_GROUP_SIZE (first_element);
++ if (opa == opb && grp_size_a == grp_size_b)
++ {
++ res.safe_push (first_element);
++ visited[i] = true;
++ }
++ }
++}
++
++/* Merge the grouped loads that are found by
++ vect_slp_grouped_load_find (). */
++
++static stmt_vec_info
++vect_slp_grouped_load_merge (vec<stmt_vec_info> &res)
++{
++ stmt_vec_info stmt_info = res[0];
++ if (res.length () == 1)
++ {
++ return stmt_info;
++ }
++ unsigned int i = 0;
++ unsigned int size = DR_GROUP_SIZE (res[0]);
++ unsigned int new_group_size = size * res.length ();
++ stmt_vec_info first_element = NULL;
++ stmt_vec_info merge_first_element = NULL;
++ stmt_vec_info last_element = NULL;
++ FOR_EACH_VEC_ELT (res, i, first_element)
++ {
++ if (merge_first_element == NULL)
++ {
++ merge_first_element = first_element;
++ last_element = merge_first_element;
++ size = DR_GROUP_SIZE (merge_first_element);
++ }
++
++ if (last_element != first_element
++ && !DR_GROUP_NEXT_ELEMENT (last_element))
++ {
++ DR_GROUP_NEXT_ELEMENT (last_element) = first_element;
++ /* Store the gap from the previous member of the group. If there is
++ no gap in the access, DR_GROUP_GAP is always 1. */
++ DR_GROUP_GAP_TRANS (first_element) = DR_GROUP_GAP (first_element);
++ DR_GROUP_GAP (first_element) = 1;
++ }
++ for (stmt_info = first_element; stmt_info;
++ stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
++ {
++ DR_GROUP_FIRST_ELEMENT (stmt_info) = merge_first_element;
++ DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info);
++ DR_GROUP_SIZE (stmt_info) = new_group_size;
++ last_element = stmt_info;
++ }
++ }
++ DR_GROUP_SIZE (merge_first_element) = new_group_size;
++ DR_GROUP_SLP_TRANSPOSE (merge_first_element) = true;
++ DR_GROUP_NEXT_ELEMENT (last_element) = NULL;
++ return merge_first_element;
++}
++
++/* Merge the grouped loads which have the same base address and group size.
++ For example, for grouped loads (opa_1, opa_2, opb_1, opb_2):
++ opa_1: a0->a1->a2->a3
++ opa_2: a8->a9->a10->a11
++ opb_1: b0->b1
++ opb_2: b16->b17
++ we can probably get two merged grouped loads:
++ opa: a0->a1->a2->a3->a8->a9->a10->a11
++ opb: b0->b1->b16->b17. */
++
++static bool
++vect_merge_slp_grouped_loads (bb_vec_info bb_vinfo)
++{
++ if (bb_vinfo->grouped_loads.length () <= 0)
++ {
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "The number of grouped loads is 0.\n");
++ }
++ return false;
++ }
++ bb_vinfo->grouped_loads.qsort (dr_group_cmp);
++ auto_vec<bool> visited (bb_vinfo->grouped_loads.length ());
++ auto_vec<stmt_vec_info> grouped_loads_merge;
++ for (unsigned int i = 0; i < bb_vinfo->grouped_loads.length (); i++)
++ {
++ visited.safe_push (false);
++ }
++ while (1)
++ {
++ /* Find grouped loads which are required to merge. */
++ auto_vec<stmt_vec_info> res;
++ vect_slp_grouped_load_find (bb_vinfo, visited, res);
++ if (res.is_empty ())
++ {
++ break;
++ }
++ /* Merge the required grouped loads into one group. */
++ grouped_loads_merge.safe_push (vect_slp_grouped_load_merge (res));
++ }
++ if (grouped_loads_merge.length () == bb_vinfo->grouped_loads.length ())
++ {
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "No grouped loads need to be merged.\n");
++ }
++ return false;
++ }
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "Merging grouped loads successfully.\n");
++ }
++ BB_VINFO_GROUPED_LOADS (bb_vinfo).release ();
++ for (unsigned int i = 0; i < grouped_loads_merge.length (); i++)
++ {
++ BB_VINFO_GROUPED_LOADS (bb_vinfo).safe_push (grouped_loads_merge[i]);
++ }
++ return true;
++}
++
++/* Find the first elements of the grouped stores
++ which need to be transposed and merged. */
++
++static void
++vect_slp_grouped_store_find (bb_vec_info bb_vinfo, vec<bool> &visited,
++ vec<stmt_vec_info> &res)
++{
++ stmt_vec_info first_element = NULL;
++ stmt_vec_info merge_first_element = NULL;
++ unsigned int k = 0;
++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
++ {
++ if (visited[k])
++ {
++ continue;
++ }
++ /* Non-conforming grouped store should be grouped separately. */
++ if (!STMT_VINFO_GROUPED_ACCESS (first_element)
++ || first_element->group_number == -1)
++ {
++ if (merge_first_element == NULL)
++ {
++ visited[k] = true;
++ res.safe_push (first_element);
++ return;
++ }
++ }
++ if (first_element->group_number != -1
++ && merge_first_element == NULL)
++ {
++ merge_first_element = first_element;
++ }
++ if (merge_first_element->group_number == first_element->group_number)
++ {
++ visited[k] = true;
++ res.safe_push (first_element);
++ }
++ }
++}
++
++/* Transpose and merge the grouped stores that are found by
++ vect_slp_grouped_store_find (). */
++
++static stmt_vec_info
++vect_slp_grouped_store_transform (vec<stmt_vec_info> &res)
++{
++ stmt_vec_info stmt_info = res[0];
++ if (res.length () == 1)
++ {
++ return stmt_info;
++ }
++ stmt_vec_info rearrange_first_element = stmt_info;
++ stmt_vec_info last_element = rearrange_first_element;
++
++ unsigned int size = DR_GROUP_SIZE (rearrange_first_element);
++ unsigned int new_group_size = size * res.length ();
++ for (unsigned int i = 1; i < res.length (); i++)
++ {
++ /* Store the gap from the previous member of the group. If there is no
++ gap in the access, DR_GROUP_GAP is always 1. */
++ DR_GROUP_GAP_TRANS (res[i]) = DR_GROUP_GAP (res[i]);
++ DR_GROUP_GAP (res[i]) = 1;
++ }
++ while (!res.is_empty ())
++ {
++ stmt_info = res[0];
++ res.ordered_remove (0);
++ if (DR_GROUP_NEXT_ELEMENT (stmt_info))
++ {
++ res.safe_push (DR_GROUP_NEXT_ELEMENT (stmt_info));
++ }
++ DR_GROUP_FIRST_ELEMENT (stmt_info) = rearrange_first_element;
++ DR_GROUP_NEXT_ELEMENT (last_element) = stmt_info;
++ DR_GROUP_SIZE_TRANS (stmt_info) = DR_GROUP_SIZE (stmt_info);
++ DR_GROUP_SIZE (stmt_info) = new_group_size;
++ last_element = stmt_info;
++ }
++
++ DR_GROUP_SIZE (rearrange_first_element) = new_group_size;
++ DR_GROUP_SLP_TRANSPOSE (rearrange_first_element) = true;
++ DR_GROUP_NEXT_ELEMENT (last_element) = NULL;
++ return rearrange_first_element;
++}
++
++/* Save the STMT_INFOs in the grouped stores to BB_VINFO_SCALAR_STORES for
++ transposing the grouped stores back. */
++
++static void
++get_scalar_stores (bb_vec_info bb_vinfo)
++{
++ unsigned int k = 0;
++ stmt_vec_info first_element = NULL;
++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
++ {
++ /* Filter out grouped stores that are unnecessary for transposing. */
++ if (!STMT_VINFO_GROUPED_ACCESS (first_element)
++ || first_element->group_number == -1)
++ {
++ continue;
++ }
++ vec<stmt_vec_info> tmp_scalar_store;
++ tmp_scalar_store.create (DR_GROUP_SIZE (first_element));
++ for (stmt_vec_info stmt_info = first_element; stmt_info;
++ stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
++ {
++ tmp_scalar_store.safe_push (stmt_info);
++ }
++ BB_VINFO_SCALAR_STORES (bb_vinfo).safe_push (tmp_scalar_store);
++ }
++}
++
++/* Transpose and merge the grouped stores which have the same group number.
++ For example, for grouped stores (opa_0, opa_1, opa_2, opa_3):
++ opa_0: a00->a01->a02->a03
++ opa_1: a10->a11->a12->a13
++ opa_2: a20->a21->a22->a23
++ opa_3: a30->a31->a32->a33
++ we can probably get the merged grouped store:
++ opa: a00->a10->a20->a30
++ ->a01->a11->a21->a31
++ ->a02->a12->a22->a32
++ ->a03->a13->a23->a33. */
++
++static bool
++vect_transform_slp_grouped_stores (bb_vec_info bb_vinfo)
++{
++ if (bb_vinfo->grouped_stores.length () <= 0)
++ {
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "The number of grouped stores is 0.\n");
++ }
++ return false;
++ }
++
++ bb_vinfo->grouped_stores.qsort (dr_group_cmp);
++ auto_vec<stmt_vec_info> grouped_stores_merge;
++ auto_vec<bool> visited (bb_vinfo->grouped_stores.length ());
++ unsigned int i = 0;
++ for (i = 0; i < bb_vinfo->grouped_stores.length (); i++)
++ {
++ visited.safe_push (false);
++ }
++
++ /* Get scalar stores for the following transposition recovery. */
++ get_scalar_stores (bb_vinfo);
++
++ while (1)
++ {
++ /* Find grouped stores which are required to transpose and merge. */
++ auto_vec<stmt_vec_info> res;
++ vect_slp_grouped_store_find (bb_vinfo, visited, res);
++ if (res.is_empty ())
++ {
++ break;
++ }
++ /* Transpose and merge the required grouped stores into one group. */
++ grouped_stores_merge.safe_push (vect_slp_grouped_store_transform (res));
++ }
++
++ BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
++ for (i = 0; i < grouped_stores_merge.length (); i++)
++ {
++ BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_merge[i]);
++ }
++
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "Transposing grouped stores successfully.\n");
++ }
++ return true;
++}
++
++/* A helper function for vect_transform_back_slp_grouped_stores (). */
++
++static auto_vec<stmt_vec_info>
++vect_transform_back_slp_grouped_store (bb_vec_info bb_vinfo,
++ stmt_vec_info first_stmt_info)
++{
++ auto_vec<stmt_vec_info> grouped_stores_split;
++ for (unsigned int i = 0; i < bb_vinfo->scalar_stores.length (); i++)
++ {
++ vec<stmt_vec_info> scalar_tmp = bb_vinfo->scalar_stores[i];
++ if (scalar_tmp.length () > 1
++ && scalar_tmp[0]->group_number != first_stmt_info->group_number)
++ {
++ continue;
++ }
++ stmt_vec_info cur_stmt_info = NULL;
++ stmt_vec_info cur_first_stmt_info = NULL;
++ stmt_vec_info last_stmt_info = NULL;
++ unsigned int k = 0;
++ FOR_EACH_VEC_ELT (scalar_tmp, k, cur_stmt_info)
++ {
++ if (k == 0)
++ {
++ cur_first_stmt_info = cur_stmt_info;
++ last_stmt_info = cur_stmt_info;
++ }
++ DR_GROUP_FIRST_ELEMENT (cur_stmt_info) = cur_first_stmt_info;
++ DR_GROUP_NEXT_ELEMENT (last_stmt_info) = cur_stmt_info;
++ last_stmt_info = cur_stmt_info;
++ }
++ DR_GROUP_SIZE (cur_first_stmt_info) = k;
++ DR_GROUP_NEXT_ELEMENT (last_stmt_info) = NULL;
++ if (first_stmt_info != cur_first_stmt_info)
++ {
++ DR_GROUP_GAP (cur_first_stmt_info)
++ = DR_GROUP_GAP_TRANS (cur_first_stmt_info);
++ DR_GROUP_SLP_TRANSPOSE (cur_first_stmt_info) = false;
++ DR_GROUP_NUMBER (cur_first_stmt_info) = -1;
++ }
++ grouped_stores_split.safe_push (cur_first_stmt_info);
++ }
++ return grouped_stores_split;
++}
++
++/* Transform the grouped store back. */
++
++void
++vect_transform_back_slp_grouped_stores (bb_vec_info bb_vinfo,
++ stmt_vec_info first_stmt_info)
++{
++ if (first_stmt_info->group_number == -1)
++ {
++ return;
++ }
++ /* Transform back. */
++ auto_vec<stmt_vec_info> grouped_stores_split
++ = vect_transform_back_slp_grouped_store (bb_vinfo, first_stmt_info);
++
++ /* Add the remaining grouped stores to grouped_stores_split. */
++ stmt_vec_info first_element = NULL;
++ unsigned int i = 0;
++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element)
++ {
++ if (first_element->group_number != first_stmt_info->group_number)
++ {
++ grouped_stores_split.safe_push (first_element);
++ }
++ }
++ DR_GROUP_SLP_TRANSPOSE (first_stmt_info) = false;
++ DR_GROUP_NUMBER (first_stmt_info) = -1;
++ BB_VINFO_GROUPED_STORES (bb_vinfo).release ();
++ for (i = 0; i < grouped_stores_split.length (); i++)
++ {
++ BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (grouped_stores_split[i]);
++ }
++}
++
++/* Function check_for_slp_vectype
++
++ Restrict grouped stores by checking their vectype.
++ If the vectype of a grouped store has changed, it needs to be
++ transformed back. If all grouped stores need to be transformed back,
++ return FALSE. */
++
++static bool
++check_for_slp_vectype (bb_vec_info bb_vinfo)
++{
++ if (dump_file)
++ fprintf (dump_file, "check_for_slp_vectype: enter\n");
++ stmt_vec_info first_element = NULL;
++ unsigned int i = 0;
++ int count = 0;
++ auto_vec<stmt_vec_info> grouped_stores_check;
++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, i, first_element)
++ {
++ grouped_stores_check.safe_push (first_element);
++ }
++ FOR_EACH_VEC_ELT (grouped_stores_check, i, first_element)
++ {
++ if (STMT_VINFO_GROUPED_ACCESS (first_element)
++ && first_element->group_number != -1)
++ {
++ unsigned int group_size_b
++ = DR_GROUP_SIZE_TRANS (first_element);
++ tree vectype = STMT_VINFO_VECTYPE (first_element);
++ gimple *stmt = STMT_VINFO_STMT (first_element);
++ tree lhs = gimple_get_lhs (stmt);
++ tree type = TREE_TYPE (lhs);
++#if 0
++ if (!vectype && !type)
++ {
++ if (dump_file)
++ fprintf (dump_file, "check_for_slp_vectype: no vectype/stmt type\n");
++ continue;
++ }
++
++ if (!vectype)
++ vectype = type;
++#endif
++ if (dump_file)
++ {
++ fprintf (dump_file, "check_for_slp_vectype: %p\n", first_element);
++ print_gimple_stmt (dump_file, stmt, 0);
++ fprintf (dump_file, "check_for_slp_vectype: vectype=");
++ if (vectype)
++ print_generic_expr (dump_file, vectype);
++ fprintf (dump_file, "\n");
++ }
++#if 0
++ if (!vectype || !VECTOR_TYPE_P (vectype))
++ continue;
++#endif
++ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
++ if (nunits.to_constant () > group_size_b)
++ {
++ count++;
++ /* If the vectype is changed, this grouped store needs
++ to be transformed back. */
++ vect_transform_back_slp_grouped_stores (bb_vinfo, first_element);
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "No supported: only supported for"
++ " group_size geq than nunits.\n");
++ }
++ }
++ }
++ }
++ if (count == BB_VINFO_TRANS_GROUPS (bb_vinfo))
++ {
++ return false;
++ }
++ if (dump_file)
++ fprintf (dump_file, "check_for_slp_vectype: True\n");
++ return true;
++}
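++
++/* For instance, assuming a merged group with DR_GROUP_SIZE_TRANS 8 whose
++ vectype is V16QI (16 subparts), 16 > 8 holds and the group is
++ transformed back; with vectype V8QI the group would be kept. */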
++
++/* Function check_for_dr_alignment
++
++ Check the alignment of the slp instance loads.
++ Return FALSE if a load cannot be vectorized. */
++
++static bool
++check_for_dr_alignment (bb_vec_info bb_vinfo, slp_instance instance)
++{
++ slp_tree node = NULL;
++ unsigned int i = 0;
++ FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
++ {
++ stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
++ dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
++ if (dump_file)
++ {
++ fprintf (dump_file, "check_for_dr_alignment: %p\n", first_stmt_info);
++
++ gimple *stmt = STMT_VINFO_STMT (first_stmt_info);
++ tree lhs = gimple_get_lhs (stmt);
++ tree type = TREE_TYPE (lhs);
++ print_gimple_stmt (dump_file, stmt, 0);
++ }
++
++ tree vectype = STMT_VINFO_VECTYPE (first_stmt_info);
++ int malign = dr_misalignment (first_dr_info, vectype);
++ enum dr_alignment_support supportable_dr_alignment
++ = vect_supportable_dr_alignment (bb_vinfo, first_dr_info,
++ vectype, malign);
++ if (supportable_dr_alignment == dr_explicit_realign_optimized
++ || supportable_dr_alignment == dr_explicit_realign)
++ {
++ return false;
+ }
+ }
+- else if (kind == slp_inst_kind_reduc_chain)
++ return true;
++}
++
++/* Initialize slp_transpose flag before transposing. */
++
++static void
++init_stmt_info_slp_transpose (bb_vec_info bb_vinfo)
++{
++ stmt_vec_info first_element = NULL;
++ unsigned int k = 0;
++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_stores, k, first_element)
+ {
+- /* Collect the reduction stmts and store them in scalar_stmts. */
+- scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
+- while (next_info)
++ if (STMT_VINFO_GROUPED_ACCESS (first_element))
+ {
+- scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
+- next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
++ DR_GROUP_SLP_TRANSPOSE (first_element) = false;
+ }
+- /* Mark the first element of the reduction chain as reduction to properly
+- transform the node. In the reduction analysis phase only the last
+- element of the chain is marked as reduction. */
+- STMT_VINFO_DEF_TYPE (stmt_info)
+- = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
+- STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
+- = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
+ }
+- else if (kind == slp_inst_kind_ctor)
++ FOR_EACH_VEC_ELT (bb_vinfo->grouped_loads, k, first_element)
+ {
+- tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
+- tree val;
+- scalar_stmts.create (CONSTRUCTOR_NELTS (rhs));
+- FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), i, val)
++ if (STMT_VINFO_GROUPED_ACCESS (first_element))
+ {
+- stmt_vec_info def_info = vinfo->lookup_def (val);
+- def_info = vect_stmt_to_vectorize (def_info);
+- scalar_stmts.quick_push (def_info);
++ DR_GROUP_SLP_TRANSPOSE (first_element) = false;
+ }
+- if (dump_enabled_p ())
+- dump_printf_loc (MSG_NOTE, vect_location,
+- "Analyzing vectorizable constructor: %G\n",
+- stmt_info->stmt);
+ }
+- else if (kind == slp_inst_kind_reduc_group)
++}
++
++/* Analyze and transpose the stmts before building the SLP tree. */
++
++static bool
++vect_analyze_transpose (bb_vec_info bb_vinfo)
++{
++ DUMP_VECT_SCOPE ("vect_analyze_transpose");
++
++ if (!vect_may_transpose (bb_vinfo))
+ {
+- /* Collect reduction statements. */
+- const vec<stmt_vec_info> &reductions
+- = as_a <loop_vec_info> (vinfo)->reductions;
+- scalar_stmts.create (reductions.length ());
+- for (i = 0; reductions.iterate (i, &next_info); i++)
+- if ((STMT_VINFO_RELEVANT_P (next_info)
+- || STMT_VINFO_LIVE_P (next_info))
+- /* ??? Make sure we didn't skip a conversion around a reduction
+- path. In that case we'd have to reverse engineer that conversion
+- stmt following the chain using reduc_idx and from the PHI
+- using reduc_def. */
+- && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
+- scalar_stmts.quick_push (next_info);
+- /* If less than two were relevant/live there's nothing to SLP. */
+- if (scalar_stmts.length () < 2)
+- return false;
++ return false;
+ }
+- else
+- gcc_unreachable ();
+
+- vec<stmt_vec_info> roots = vNULL;
+- if (kind == slp_inst_kind_ctor)
++ /* For basic block SLP, try to merge the grouped stores and loads
++ into one group. */
++ init_stmt_info_slp_transpose (bb_vinfo);
++ if (vect_transform_slp_grouped_stores (bb_vinfo)
++ && vect_merge_slp_grouped_loads (bb_vinfo))
+ {
+- roots.create (1);
+- roots.quick_push (stmt_info);
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "Analysis succeeded with SLP transposed.\n");
++ }
++ return true;
+ }
+- /* Build the tree for the SLP instance. */
+- bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
+- roots,
+- max_tree_size, limit, bst_map,
+- kind == slp_inst_kind_store
+- ? stmt_info : NULL);
+- if (!res)
+- roots.release ();
+-
+- /* ??? If this is slp_inst_kind_store and the above succeeded here's
+- where we should do store group splitting. */
+-
+- return res;
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "Analysis failed with SLP transposed.\n");
++ }
++ return false;
+ }
+
+ /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
+@@ -4963,7 +5932,7 @@ vect_slp_analyze_operations (vec_info *vinfo)
+ /* Check we can vectorize the reduction. */
+ || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
+ && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
+- {
++ {
+ slp_tree node = SLP_INSTANCE_TREE (instance);
+ stmt_vec_info stmt_info;
+ if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
+@@ -4975,7 +5944,7 @@ vect_slp_analyze_operations (vec_info *vinfo)
+ "removing SLP instance operations starting from: %G",
+ stmt_info->stmt);
+ vect_free_slp_instance (instance);
+- vinfo->slp_instances.ordered_remove (i);
++ vinfo->slp_instances.ordered_remove (i);
+ cost_vec.release ();
+ while (!visited_vec.is_empty ())
+ visited.remove (visited_vec.pop ());
+@@ -5204,7 +6173,7 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
+ gimple *orig_stmt = orig_stmt_info->stmt;
+
+ /* If there is a non-vectorized use of the defs then the scalar
+- stmt is kept live in which case we do not account it or any
++ stmt is kept live in which case we do not account it or any
+ required defs in the SLP children in the scalar cost. This
+ way we make the vectorization more costly when compared to
+ the scalar cost. */
+@@ -5481,7 +6450,11 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
+
+ vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
+
+- if (dump_enabled_p ())
++ BB_VINFO_VEC_INSIDE_COST (bb_vinfo) = vec_inside_cost;
++ BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo) = vec_outside_cost;
++ BB_VINFO_SCALAR_COST (bb_vinfo) = scalar_cost;
++
++ if (!unlimited_cost_model (NULL) && dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Cost model analysis for part in loop %d:\n", sl);
+@@ -5819,7 +6792,7 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
+ if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
+ {
+ if (dump_enabled_p ())
+- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: unhandled data-ref in basic "
+ "block.\n");
+ return false;
+@@ -5854,6 +6827,22 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
+
+ vect_pattern_recog (bb_vinfo);
+
++  /* Transpose grouped stores and loads to get a better vectorizable
++     version.  */
++ if (bb_vinfo->transposed)
++ {
++ if (!vect_analyze_transpose (bb_vinfo))
++ {
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++ "not vectorized: unhandled slp transposed in "
++ "basic block.\n");
++ }
++ return false;
++ }
++ }
++ bb_vinfo->before_slp = true;
++
+ /* Update store groups from pattern processing. */
+ vect_fixup_store_groups_with_patterns (bb_vinfo);
+
+@@ -5872,6 +6861,20 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
+ return false;
+ }
+
++  /* Check if the vectype is suitable for transposed SLP.  */
++ if (bb_vinfo->transposed && !check_for_slp_vectype (bb_vinfo))
++ {
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++ "Failed to SLP transposed in the basic block.\n");
++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++ "not vectorized: vectype is not suitable for "
++ "SLP transposed in basic block.\n");
++ }
++ return false;
++ }
++
+ /* Optimize permutations. */
+ vect_optimize_slp (bb_vinfo);
+
+@@ -5914,6 +6917,27 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
+ if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
+ return false;
+
++  /* Check if the alignment is suitable for transposed SLP.  */
++ if (bb_vinfo->transposed)
++ {
++ for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); i++)
++ {
++ if (!check_for_dr_alignment (bb_vinfo, instance))
++ {
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++ "Failed to SLP transposed in the basic "
++ "block.\n");
++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++ "not vectorized: alignment is not suitable "
++ "for SLP transposed in basic block.\n");
++ }
++ return false;
++ }
++ }
++ }
++
+ if (!vect_slp_analyze_operations (bb_vinfo))
+ {
+ if (dump_enabled_p ())
+@@ -5923,7 +6947,88 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
+ }
+
+ vect_bb_partition_graph (bb_vinfo);
++ return true;
++}
++
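++/* Return true if it is worth creating and analyzing a transposed version of
++   BB_VINFO_ORI: transposed SLP must be enabled and the original analysis
++   must have reached the SLP stage; we then retry either because the original
++   analysis failed (RES_ORI is false) or because its vector cost is not
++   clearly better than the scalar cost.  */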
++static bool
++may_new_transpose_bbvinfo (bb_vec_info bb_vinfo_ori, bool res_ori,
++ loop_p orig_loop)
++{
++  /* If the flag is off or the SLP analysis failed before vect_analyze_slp,
++     do not try to analyze the transposed SLP version.  */
++ if (!flag_tree_slp_transpose_vectorize
++ || !BB_VINFO_BEFORE_SLP (bb_vinfo_ori))
++ {
++ return false;
++ }
++
++  /* If the original bb_vinfo can't be vectorized, try to create a bb_vinfo
++     for the transposed version.  */
++ if (!res_ori)
++ {
++ return true;
++ }
++
++  /* Calculate the cost of the original bb_vinfo.  */
++ if (unlimited_cost_model (NULL))
++ {
++ vec<slp_instance> &instances = BB_VINFO_SLP_INSTANCES (bb_vinfo_ori);
++ vect_bb_vectorization_profitable_p (bb_vinfo_ori, instances, orig_loop);
++ }
++  /* If the vector cost and the scalar cost do not differ much (here we set
++     the threshold factor to 4), try to create a bb_vinfo for the transposed
++     version.  */
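++  /* Illustrative numbers only: with scalar_cost = 30, vec_inside_cost = 6
++     and vec_outside_cost = 3, we have 30 < 4 * (6 + 3) = 36, so the
++     transposed version is still worth trying.  */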
++ if (BB_VINFO_SCALAR_COST (bb_vinfo_ori)
++ < 4 * (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori)
++ + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori)))
++ {
++ return true;
++ }
++ return false;
++}
+
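++/* Return true if the transposed version BB_VINFO_TRANS should be chosen over
++   the original BB_VINFO_ORI, i.e. when the transposed version vectorizes
++   successfully and promises a strictly larger saving over the scalar
++   code.  */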
++static bool
++may_choose_transpose_bbvinfo (bb_vec_info bb_vinfo_trans, bool res_trans,
++ bb_vec_info bb_vinfo_ori, bool res_ori,
++ loop_p orig_loop)
++{
++ /* The original bb_vinfo is chosen if the transposed bb_vinfo
++ can't be vectorized. */
++ if (!res_trans)
++ {
++ return false;
++ }
++  /* Calculate the cost of the transposed bb_vinfo.  */
++ if (unlimited_cost_model (NULL))
++ {
++ vec<slp_instance> &instances = BB_VINFO_SLP_INSTANCES (bb_vinfo_trans);
++ vect_bb_vectorization_profitable_p (bb_vinfo_trans, instances,
++ orig_loop);
++ }
++ int diff_bb_cost = -1;
++ int diff_bb_cost_trans = -1;
++ if (res_ori)
++ {
++ diff_bb_cost = BB_VINFO_SCALAR_COST (bb_vinfo_ori)
++ - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_ori)
++ - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_ori);
++ }
++ if (res_trans)
++ {
++ diff_bb_cost_trans = BB_VINFO_SCALAR_COST (bb_vinfo_trans)
++ - BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans)
++ - BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans);
++ }
++  /* The original bb_vinfo is chosen when one of the following conditions
++     is satisfied:
++     1) The cost of the original version is better than the transposed one.
++     2) The vector cost is close to the scalar cost in the transposed
++	version.  */
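++  /* For example (hypothetical costs): if the original version saves
++     diff_bb_cost = 10 over the scalar code while the transposed version
++     saves only diff_bb_cost_trans = 6, the original version is kept.  */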
++ if ((res_ori && res_trans && diff_bb_cost >= diff_bb_cost_trans)
++ || (res_trans && BB_VINFO_SCALAR_COST (bb_vinfo_trans)
++ <= (BB_VINFO_VEC_INSIDE_COST (bb_vinfo_trans)
++ + BB_VINFO_VEC_OUTSIDE_COST (bb_vinfo_trans))))
++ {
++ return false;
++ }
+ return true;
+ }
+
+@@ -5937,6 +7042,7 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
+ loop_p orig_loop)
+ {
+ bb_vec_info bb_vinfo;
++ bb_vec_info bb_vinfo_trans = NULL;
+ auto_vector_modes vector_modes;
+
+ /* Autodetect first vector size we try. */
+@@ -5951,6 +7057,10 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
+ {
+ bool vectorized = false;
+ bool fatal = false;
++ bool res_bb_vinfo_ori = false;
++ bool res_bb_vinfo_trans = false;
++
++      /* Create a bb_vinfo for the original version.  */
+ bb_vinfo = new _bb_vec_info (bbs, &shared);
+
+ bool first_time_p = shared.datarefs.is_empty ();
+@@ -5960,8 +7070,113 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
+ else
+ bb_vinfo->shared->check_datarefs ();
+ bb_vinfo->vector_mode = next_vector_mode;
++ bb_vinfo->transposed = false;
++ bb_vinfo->before_slp = false;
++
++ res_bb_vinfo_ori = vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal,
++ dataref_groups);
++ auto_vec<slp_instance> profitable_subgraphs;
++ auto_vec<slp_instance> profitable_subgraphs_trans;
++ for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
++ {
++ if (instance->subgraph_entries.is_empty ())
++ continue;
++
++ vect_location = instance->location ();
++ if (!unlimited_cost_model (NULL)
++ && !vect_bb_vectorization_profitable_p
++ (bb_vinfo, instance->subgraph_entries, orig_loop))
++ {
++ if (dump_enabled_p ())
++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++ "not vectorized: vectorization is not "
++ "profitable.\n");
++ continue;
++ }
++ if (res_bb_vinfo_ori)
++ {
++ if (!dbg_cnt (vect_slp))
++ continue;
++ profitable_subgraphs.safe_push (instance);
++ }
++ }
++
++      /* Analyze and create a transposed bb_vinfo.  */
++ if (may_new_transpose_bbvinfo (bb_vinfo, res_bb_vinfo_ori, orig_loop))
++ {
++ bool fatal_trans = false;
++ bb_vinfo_trans
++ = new _bb_vec_info (bbs, &shared);
++ bool first_time_p = shared.datarefs.is_empty ();
++ BB_VINFO_DATAREFS (bb_vinfo_trans) = datarefs;
++ if (first_time_p)
++ {
++ bb_vinfo_trans->shared->save_datarefs ();
++ }
++ else
++ {
++ bb_vinfo_trans->shared->check_datarefs ();
++ }
++ bb_vinfo_trans->vector_mode = next_vector_mode;
++ bb_vinfo_trans->transposed = true;
++ bb_vinfo_trans->before_slp = false;
++
++ res_bb_vinfo_trans
++ = vect_slp_analyze_bb_1 (bb_vinfo_trans, n_stmts, fatal_trans,
++ dataref_groups);
++ if (res_bb_vinfo_trans)
++ {
++ for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo_trans))
++ {
++ if (instance->subgraph_entries.is_empty ())
++ continue;
++
++ vect_location = instance->location ();
++ if (!unlimited_cost_model (NULL)
++ && !vect_bb_vectorization_profitable_p
++ (bb_vinfo_trans, instance->subgraph_entries, orig_loop))
++ {
++ if (dump_enabled_p ())
++ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
++ "not vectorized: transpose vectorization is not "
++ "profitable.\n");
++ res_bb_vinfo_trans = false;
++ continue;
++ }
++ if (res_bb_vinfo_trans)
++ {
++ if (!dbg_cnt (vect_slp))
++ continue;
++ profitable_subgraphs_trans.safe_push (instance);
++ }
++ }
++ }
++ if (may_choose_transpose_bbvinfo (bb_vinfo_trans,
++ res_bb_vinfo_trans,
++ bb_vinfo, res_bb_vinfo_ori,
++ orig_loop))
++ {
++ bb_vinfo = bb_vinfo_trans;
++ fatal = fatal_trans;
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "Basic block part vectorized "
++ "using transposed version.\n");
++ }
++ }
++ else
++ {
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++		  dump_printf_loc (MSG_NOTE, vect_location,
++				   "Basic block part vectorized "
++				   "using original version.\n");
++ }
++ }
++ }
+
+- if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
++ if (res_bb_vinfo_ori || res_bb_vinfo_trans)
+ {
+ if (dump_enabled_p ())
+ {
+@@ -5972,90 +7187,129 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
+ }
+
+ bb_vinfo->shared->check_datarefs ();
+-
+- auto_vec<slp_instance> profitable_subgraphs;
+- for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
++ if (!res_bb_vinfo_trans)
+ {
+- if (instance->subgraph_entries.is_empty ())
+- continue;
+-
+- vect_location = instance->location ();
+- if (!unlimited_cost_model (NULL)
+- && !vect_bb_vectorization_profitable_p
+- (bb_vinfo, instance->subgraph_entries, orig_loop))
++ /* When we're vectorizing an if-converted loop body make sure
++ we vectorized all if-converted code. */
++ if (!profitable_subgraphs.is_empty ()
++ && orig_loop)
+ {
+- if (dump_enabled_p ())
+- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+- "not vectorized: vectorization is not "
+- "profitable.\n");
+- continue;
++ gcc_assert (bb_vinfo->bbs.length () == 1);
++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
++ !gsi_end_p (gsi); gsi_next (&gsi))
++ {
++ /* The costing above left us with DCEable vectorized scalar
++ stmts having the visited flag set on profitable
++ subgraphs. Do the delayed clearing of the flag here. */
++ if (gimple_visited_p (gsi_stmt (gsi)))
++ {
++ gimple_set_visited (gsi_stmt (gsi), false);
++ continue;
++ }
++ if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
++ continue;
++
++ if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
++ if (gimple_assign_rhs_code (ass) == COND_EXPR)
++ {
++ if (!profitable_subgraphs.is_empty ()
++ && dump_enabled_p ())
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "not profitable because of "
++ "unprofitable if-converted scalar "
++ "code\n");
++ profitable_subgraphs.truncate (0);
++ }
++ }
+ }
+
+- if (!dbg_cnt (vect_slp))
+- continue;
++ /* Finally schedule the profitable subgraphs. */
++ for (slp_instance instance : profitable_subgraphs)
++ {
++ if (!vectorized && dump_enabled_p ())
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "Basic block will be vectorized "
++ "using SLP\n");
++ vectorized = true;
+
+- profitable_subgraphs.safe_push (instance);
+- }
++ vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
+
+- /* When we're vectorizing an if-converted loop body make sure
+- we vectorized all if-converted code. */
+- if (!profitable_subgraphs.is_empty ()
+- && orig_loop)
++ unsigned HOST_WIDE_INT bytes;
++ if (dump_enabled_p ())
++ {
++ if (GET_MODE_SIZE
++ (bb_vinfo->vector_mode).is_constant (&bytes))
++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
++ "basic block part vectorized using %wu "
++ "byte vectors\n", bytes);
++ else
++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
++ "basic block part vectorized using "
++ "variable length vectors\n");
++ }
++ }
++ }
++ else
+ {
+- gcc_assert (bb_vinfo->bbs.length () == 1);
+- for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
+- !gsi_end_p (gsi); gsi_next (&gsi))
++ if (!profitable_subgraphs_trans.is_empty ()
++ && orig_loop)
+ {
+- /* The costing above left us with DCEable vectorized scalar
+- stmts having the visited flag set on profitable
+- subgraphs. Do the delayed clearing of the flag here. */
+- if (gimple_visited_p (gsi_stmt (gsi)))
++ gcc_assert (bb_vinfo->bbs.length () == 1);
++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
++ !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+- gimple_set_visited (gsi_stmt (gsi), false);
+- continue;
++ /* The costing above left us with DCEable vectorized scalar
++ stmts having the visited flag set on profitable
++ subgraphs. Do the delayed clearing of the flag here. */
++ if (gimple_visited_p (gsi_stmt (gsi)))
++ {
++ gimple_set_visited (gsi_stmt (gsi), false);
++ continue;
++ }
++ if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
++ continue;
++
++ if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
++ if (gimple_assign_rhs_code (ass) == COND_EXPR)
++ {
++ if (!profitable_subgraphs_trans.is_empty ()
++ && dump_enabled_p ())
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "not profitable because of "
++ "unprofitable if-converted scalar "
++ "code\n");
++ profitable_subgraphs_trans.truncate (0);
++ }
+ }
+- if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
+- continue;
+-
+- if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
+- if (gimple_assign_rhs_code (ass) == COND_EXPR)
+- {
+- if (!profitable_subgraphs.is_empty ()
+- && dump_enabled_p ())
+- dump_printf_loc (MSG_NOTE, vect_location,
+- "not profitable because of "
+- "unprofitable if-converted scalar "
+- "code\n");
+- profitable_subgraphs.truncate (0);
+- }
+ }
+- }
+
+- /* Finally schedule the profitable subgraphs. */
+- for (slp_instance instance : profitable_subgraphs)
+- {
+- if (!vectorized && dump_enabled_p ())
+- dump_printf_loc (MSG_NOTE, vect_location,
+- "Basic block will be vectorized "
+- "using SLP\n");
+- vectorized = true;
++ /* Finally schedule the profitable subgraphs. */
++ for (slp_instance instance : profitable_subgraphs_trans)
++ {
++ if (!vectorized && dump_enabled_p ())
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "Basic block will be vectorized "
++ "using SLP\n");
++ vectorized = true;
+
+- vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
++ vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
+
+- unsigned HOST_WIDE_INT bytes;
+- if (dump_enabled_p ())
+- {
+- if (GET_MODE_SIZE
+- (bb_vinfo->vector_mode).is_constant (&bytes))
+- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
+- "basic block part vectorized using %wu "
+- "byte vectors\n", bytes);
+- else
+- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
+- "basic block part vectorized using "
+- "variable length vectors\n");
++ unsigned HOST_WIDE_INT bytes;
++ if (dump_enabled_p ())
++ {
++ if (GET_MODE_SIZE
++ (bb_vinfo->vector_mode).is_constant (&bytes))
++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
++ "basic block part vectorized using %wu "
++ "byte vectors\n", bytes);
++ else
++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
++ "basic block part vectorized using "
++ "variable length vectors\n");
++ }
+ }
+ }
++
+ }
+ else
+ {
+@@ -6081,6 +7335,10 @@ vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
+ }
+
+ delete bb_vinfo;
++      if (bb_vinfo_trans && bb_vinfo_trans != bb_vinfo)
++	delete bb_vinfo_trans;
++      bb_vinfo_trans = NULL;
+
+ if (mode_i < vector_modes.length ()
+ && VECTOR_MODE_P (autodetected_vector_mode)
+@@ -7244,10 +8502,17 @@ vect_schedule_slp_node (vec_info *vinfo,
+ ready early, vectorized stores go before the last scalar
+ stmt which is where all uses are ready. */
+ stmt_vec_info last_stmt_info = NULL;
+- if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
+- last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
+- else /* DR_IS_WRITE */
+- last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
++
++ if (DR_GROUP_FIRST_ELEMENT (stmt_info)
++ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
++ last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
++ else
++ {
++ if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
++ last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
++ else /* DR_IS_WRITE */
++ last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
++ }
+ si = gsi_for_stmt (last_stmt_info->stmt);
+ }
+ else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
+diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
+index 349200411..3099f6743 100644
+--- a/gcc/tree-vect-stmts.cc
++++ b/gcc/tree-vect-stmts.cc
+@@ -1369,10 +1369,10 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
+
+ static void
+ vect_init_vector_1 (vec_info *vinfo, stmt_vec_info stmt_vinfo, gimple *new_stmt,
+- gimple_stmt_iterator *gsi)
++		    gimple_stmt_iterator *gsi, bool transpose = false)
+ {
+ if (gsi)
+- vect_finish_stmt_generation (vinfo, stmt_vinfo, new_stmt, gsi);
++ vect_finish_stmt_generation (vinfo, stmt_vinfo, new_stmt, gsi, transpose);
+ else
+ vinfo->insert_on_entry (stmt_vinfo, new_stmt);
+
+@@ -1393,7 +1393,7 @@ vect_init_vector_1 (vec_info *vinfo, stmt_vec_info stmt_vinfo, gimple *new_stmt,
+
+ tree
+ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
+- gimple_stmt_iterator *gsi)
++ gimple_stmt_iterator *gsi, bool transpose)
+ {
+ gimple *init_stmt;
+ tree new_temp;
+@@ -1418,7 +1418,7 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
+ new_temp = make_ssa_name (TREE_TYPE (type));
+ init_stmt = gimple_build_assign (new_temp, COND_EXPR,
+ val, true_val, false_val);
+- vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
++ vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi, transpose);
+ val = new_temp;
+ }
+ }
+@@ -1437,7 +1437,7 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
+ {
+ init_stmt = gsi_stmt (gsi2);
+ gsi_remove (&gsi2, false);
+- vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
++ vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi, transpose);
+ }
+ }
+ }
+@@ -1446,7 +1446,7 @@ vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
+
+ new_temp = vect_get_new_ssa_name (type, vect_simple_var, "cst_");
+ init_stmt = gimple_build_assign (new_temp, val);
+- vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
++ vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi, transpose);
+ return new_temp;
+ }
+
+@@ -1572,9 +1572,11 @@ vect_get_vec_defs (vec_info *vinfo, stmt_vec_info stmt_info, slp_tree slp_node,
+ statement and create and return a stmt_vec_info for it. */
+
+ static void
+-vect_finish_stmt_generation_1 (vec_info *,
+- stmt_vec_info stmt_info, gimple *vec_stmt)
++vect_finish_stmt_generation_1 (vec_info *vinfo,
++			       stmt_vec_info stmt_info, gimple *vec_stmt,
++			       bool transpose = false)
+ {
++ if (transpose)
++    vinfo->add_pattern_stmt (vec_stmt, NULL);
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location, "add new stmt: %G", vec_stmt);
+
+@@ -1616,7 +1618,7 @@ vect_finish_replace_stmt (vec_info *vinfo,
+ void
+ vect_finish_stmt_generation (vec_info *vinfo,
+ stmt_vec_info stmt_info, gimple *vec_stmt,
+- gimple_stmt_iterator *gsi)
++ gimple_stmt_iterator *gsi, bool transpose)
+ {
+ gcc_assert (!stmt_info || gimple_code (stmt_info->stmt) != GIMPLE_LABEL);
+
+@@ -1648,7 +1650,7 @@ vect_finish_stmt_generation (vec_info *vinfo,
+ }
+ }
+ gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
+- vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt);
++ vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt, transpose);
+ }
+
+ /* We want to vectorize a call to combined function CFN with function
+@@ -2159,6 +2161,173 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
+ return NULL_TREE;
+ }
+
++/* Count the scalar memory references in BB and record the load stmts in
++   STMTS.  A BB with no memory references is regarded as empty and is
++   skipped during the DFS.  */
++
++static unsigned
++mem_refs_in_bb (basic_block bb, vec<gimple *> &stmts)
++{
++ unsigned num = 0;
++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
++ !gsi_end_p (gsi); gsi_next (&gsi))
++ {
++ gimple *stmt = gsi_stmt (gsi);
++ if (is_gimple_debug (stmt))
++ continue;
++ if (is_gimple_assign (stmt) && gimple_has_mem_ops (stmt)
++ && !gimple_has_volatile_ops (stmt))
++ {
++ if (gimple_assign_rhs_code (stmt) == MEM_REF
++ || gimple_assign_rhs_code (stmt) == ARRAY_REF)
++ {
++ stmts.safe_push (stmt);
++ num++;
++ }
++ else if (TREE_CODE (gimple_get_lhs (stmt)) == MEM_REF
++ || TREE_CODE (gimple_get_lhs (stmt)) == ARRAY_REF)
++ num++;
++ }
++ }
++ return num;
++}
++
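++/* Return true if some data reference in DATAREFS is a read whose base object
++   (a VAR_DECL or an SSA_NAME) matches the base object of DR.  */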
++static bool
++check_same_base (vec<data_reference_p> *datarefs, data_reference_p dr)
++{
++ for (unsigned ui = 0; ui < datarefs->length (); ui++)
++ {
++ tree op1 = TREE_OPERAND (DR_BASE_OBJECT (dr), 0);
++ tree op2 = TREE_OPERAND (DR_BASE_OBJECT ((*datarefs)[ui]), 0);
++ if (TREE_CODE (op1) != TREE_CODE (op2))
++ continue;
++ if (TREE_CODE (op1) == ADDR_EXPR)
++ {
++ op1 = TREE_OPERAND (op1, 0);
++ op2 = TREE_OPERAND (op2, 0);
++ }
++ enum tree_code code = TREE_CODE (op1);
++ switch (code)
++ {
++ case VAR_DECL:
++ if (DECL_NAME (op1) == DECL_NAME (op2)
++ && DR_IS_READ ((*datarefs)[ui]))
++ return true;
++ break;
++ case SSA_NAME:
++ if (SSA_NAME_VERSION (op1) == SSA_NAME_VERSION (op2)
++ && DR_IS_READ ((*datarefs)[ui]))
++ return true;
++ break;
++ default:
++ break;
++ }
++ }
++ return false;
++}
++
++/* Check all load STMTS: return as soon as one of them is a vectorized load
++   from the same base as the store STMT_INFO; otherwise set SUCCESS to
++   false.  */
++
++static void
++check_vec_use (loop_vec_info loop_vinfo, vec<gimple *> &stmts,
++ stmt_vec_info stmt_info, bool &success)
++{
++ if (stmt_info == NULL)
++ {
++ success = false;
++ return;
++ }
++ if (DR_IS_READ (stmt_info->dr_aux.dr))
++ {
++ success = false;
++ return;
++ }
++ unsigned ui = 0;
++ gimple *candidate = NULL;
++ FOR_EACH_VEC_ELT (stmts, ui, candidate)
++ {
++ if (TREE_CODE (TREE_TYPE (gimple_get_lhs (candidate))) != VECTOR_TYPE)
++ continue;
++
++ if (candidate->bb != candidate->bb->loop_father->header)
++ {
++ success = false;
++ return;
++ }
++ auto_vec<data_reference_p> datarefs;
++ tree res = find_data_references_in_bb (candidate->bb->loop_father,
++ candidate->bb, &datarefs);
++ if (res == chrec_dont_know)
++ {
++ success = false;
++ return;
++ }
++ if (check_same_base (&datarefs, stmt_info->dr_aux.dr))
++ return;
++ }
++ success = false;
++}
++
++/* Depth-first search from the present BB.  If a successor contains load
++   STMTS, stop searching deeper.  */
++
++static void
++dfs_check_bb (loop_vec_info loop_vinfo, basic_block bb, stmt_vec_info stmt_info,
++ bool &success, vec<basic_block> &visited_bbs)
++{
++ if (bb == cfun->cfg->x_exit_block_ptr)
++ {
++ success = false;
++ return;
++ }
++ if (!success || visited_bbs.contains (bb) || bb == loop_vinfo->loop->latch)
++ return;
++
++ visited_bbs.safe_push (bb);
++ auto_vec<gimple *> stmts;
++ unsigned num = mem_refs_in_bb (bb, stmts);
++ /* Empty BB. */
++ if (num == 0)
++ {
++ edge e;
++ edge_iterator ei;
++ FOR_EACH_EDGE (e, ei, bb->succs)
++ {
++ dfs_check_bb (loop_vinfo, e->dest, stmt_info, success, visited_bbs);
++ if (!success)
++ return;
++ }
++ return;
++ }
++ /* Non-empty BB. */
++ check_vec_use (loop_vinfo, stmts, stmt_info, success);
++}
++
++/* For a grouped store, check whether all successors of the present BB do a
++   vectorized load from the same base as the store.  If so, the caller sets
++   memory_access_type to VMAT_CONTIGUOUS_PERMUTE instead of
++   VMAT_LOAD_STORE_LANES.  */
++
++static bool
++conti_perm (stmt_vec_info stmt_vinfo, loop_vec_info loop_vinfo)
++{
++ gimple *stmt = stmt_vinfo->stmt;
++ if (gimple_code (stmt) != GIMPLE_ASSIGN)
++ return false;
++
++ if (DR_IS_READ (stmt_vinfo->dr_aux.dr))
++ return false;
++
++ basic_block bb = stmt->bb;
++ bool success = true;
++ auto_vec<basic_block> visited_bbs;
++ visited_bbs.safe_push (bb);
++ edge e;
++ edge_iterator ei;
++ FOR_EACH_EDGE (e, ei, bb->succs)
++ dfs_check_bb (loop_vinfo, e->dest, stmt_vinfo, success, visited_bbs);
++ return success;
++}
++
+ /* A subroutine of get_load_store_type, with a subset of the same
+ arguments. Handle the case where STMT_INFO is part of a grouped load
+ or store.
+@@ -2373,6 +2542,20 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
+ *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
+ overrun_p = would_overrun_p;
+ }
++
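++  /* If the loop runs exactly one vector iteration and every successor of
++     this BB reads the stored data back with a vectorized load from the same
++     base, prefer a contiguous permuted access over load/store-lanes.  */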
++ if (*memory_access_type == VMAT_LOAD_STORE_LANES
++ && TREE_CODE (loop_vinfo->num_iters) == INTEGER_CST
++ && maybe_eq (tree_to_shwi (loop_vinfo->num_iters),
++ loop_vinfo->vectorization_factor)
++ && conti_perm (stmt_info, loop_vinfo)
++ && (vls_type == VLS_LOAD
++ ? vect_grouped_load_supported (vectype, single_element_p,
++ group_size)
++ : vect_grouped_store_supported (vectype, group_size)))
++ {
++ *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
++ overrun_p = would_overrun_p;
++ }
+ }
+
+ /* As a last resort, trying using a gather load or scatter store.
+@@ -7456,6 +7639,154 @@ vectorizable_scan_store (vec_info *vinfo,
+ return true;
+ }
+
++/* Function vect_permute_store_chains
++
++ Call function vect_permute_store_chain ().
++ Given a chain of interleaved stores in DR_CHAIN, generate
++ interleave_high/low stmts to reorder the data correctly.
++ Return the final references for stores in RESULT_CHAIN. */
++
++static void
++vect_permute_store_chains (vec_info *vinfo, vec<tree> dr_chain,
++ unsigned int num_each, stmt_vec_info stmt_info,
++ gimple_stmt_iterator *gsi, vec<tree> *result_chain,
++ unsigned int group)
++{
++ unsigned int k = 0;
++ unsigned int t = 0;
++
++  /* Divide the vectors into GROUP parts and permute every NUM_EACH vectors
++     together.  */
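++  /* Illustrative example: with GROUP = 2 and DR_CHAIN = {v0, v1, v2, v3},
++     part 0 permutes {v0, v2} and part 1 permutes {v1, v3}, each through
++     vect_permute_store_chain.  */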
++ for (k = 0; k < group; k++)
++ {
++ auto_vec<tree> dr_chain_transposed (num_each);
++ auto_vec<tree> result_chain_transposed (num_each);
++ for (t = k; t < dr_chain.length (); t = t + group)
++ {
++ dr_chain_transposed.quick_push (dr_chain[t]);
++ }
++ vect_permute_store_chain (vinfo, dr_chain_transposed, num_each,
++ stmt_info, gsi, &result_chain_transposed);
++ for (t = 0; t < num_each; t++)
++ {
++ result_chain->quick_push (result_chain_transposed[t]);
++ }
++ }
++}
++
++/* Function transpose_oprnd_store
++
++ Calculate the transposed results from VEC_OPRNDS (VEC_STMT)
++ for vectorizable_store. */
++
++static void
++transpose_oprnd_store (vec_info *vinfo, vec<tree> vec_oprnds,
++ vec<tree> *result_chain, unsigned int vec_num,
++ unsigned int const_nunits, unsigned int array_num,
++ stmt_vec_info first_stmt_info,
++ gimple_stmt_iterator *gsi)
++{
++ unsigned int group_for_transform = 0;
++ unsigned int num_each = 0;
++
++ /* Transpose back for vec_oprnds. */
++ /* vec = {vec1, vec2, ...} */
++ if (array_num < const_nunits
++ && const_nunits % array_num == 0)
++ {
++ vect_transpose_store_chain (vinfo, vec_oprnds,
++ vec_num, array_num,
++ first_stmt_info,
++ gsi, result_chain);
++ }
++ /* vec1 = {vec_part1}, vec2 = {vec_part2}, ... */
++ else if (array_num >= const_nunits
++ && array_num % const_nunits == 0)
++ {
++ group_for_transform = array_num / const_nunits;
++ num_each = vec_oprnds.length () / group_for_transform;
++ vect_permute_store_chains (vinfo, vec_oprnds,
++ num_each, first_stmt_info,
++ gsi, result_chain,
++ group_for_transform);
++ }
++ else
++ {
++ gcc_unreachable ();
++ }
++}
++
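++/* Return the dr_vec_info of STMT_INFO, downgrading an uninitialized
++   misalignment to DR_MISALIGNMENT_UNKNOWN first.  */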
++static dr_vec_info *
++get_dr_info (stmt_vec_info stmt_info)
++{
++ dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
++ if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
++ {
++ SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
++ }
++ return dr_info;
++}
++
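++/* Compute the target alignment of CUR_FIRST_DR_INFO into ALIGN and return
++   its misalignment, which is 0 when the access is aligned or the
++   misalignment is unknown.  */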
++static unsigned
++dr_align_vect_store (vec_info *vinfo, dr_vec_info *cur_first_dr_info,
++ tree vectype, unsigned HOST_WIDE_INT &align)
++{
++ unsigned misalign = 0;
++ align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info));
++ if (aligned_access_p (cur_first_dr_info, vectype))
++ {
++ return misalign;
++ }
++ else if (cur_first_dr_info->misalignment == -1)
++ {
++ align = dr_alignment (vect_dr_behavior (vinfo, cur_first_dr_info));
++ }
++ else
++ {
++ misalign = cur_first_dr_info->misalignment;
++ }
++ return misalign;
++}
++
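++/* Emit the vector store of VEC_OPRND through DATAREF_PTR for the transposed
++   SLP path: set the pointer alignment, build a suitably aligned MEM_REF and
++   insert the new assignment via vect_finish_stmt_generation.  */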
++static void
++add_new_stmt_vect_store (vec_info *vinfo, tree vectype, tree dataref_ptr,
++ tree dataref_offset, tree ref_type,
++ dr_vec_info *cur_first_dr_info, tree vec_oprnd,
++ gimple_stmt_iterator *gsi, stmt_vec_info stmt_info)
++{
++ /* Data align. */
++ unsigned HOST_WIDE_INT align;
++ unsigned misalign = dr_align_vect_store (vinfo, cur_first_dr_info,
++ vectype, align);
++
++ if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME)
++ {
++ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign);
++ }
++
++ /* Get data_ref. */
++ tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0);
++ tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr, offset);
++ if (aligned_access_p (cur_first_dr_info, vectype))
++ {
++ ;
++ }
++ else if (cur_first_dr_info->misalignment == -1)
++ {
++ TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref),
++ align * BITS_PER_UNIT);
++ }
++ else
++ {
++ tree elem_type = TREE_TYPE (vectype);
++ TREE_TYPE (data_ref) = build_aligned_type (TREE_TYPE (data_ref),
++ TYPE_ALIGN (elem_type));
++ }
++ /* Add new stmt. */
++ vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr));
++ gassign *new_stmt = gimple_build_assign (data_ref, vec_oprnd);
++ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi, true);
++}
+
+ /* Function vectorizable_store.
+
+@@ -8333,6 +8664,16 @@ vectorizable_store (vec_info *vinfo,
+ &vec_offsets);
+ vec_offset = vec_offsets[0];
+ }
++	  /* If the stmt_info needs transpose recovery, dataref_ptr will be
++	     calculated later.  */
++ else if (memory_access_type == VMAT_CONTIGUOUS
++ && is_a <bb_vec_info> (vinfo)
++ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++ && DR_GROUP_SLP_TRANSPOSE (
++ DR_GROUP_FIRST_ELEMENT (stmt_info)))
++ {
++ dataref_ptr = NULL_TREE;
++ }
+ else
+ dataref_ptr
+ = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
+@@ -8423,6 +8764,75 @@ vectorizable_store (vec_info *vinfo,
+ }
+ else
+ {
++	      /* group_size: the size of the group after transposing and
++		 merging.
++		 group_size_b: the size of the group before transposing and
++		 merging; only group_size_b >= const_nunits is supported here.
++ array_num: the number of arrays.
++ const_nunits: TYPE_VECTOR_SUBPARTS (vectype).
++ ncontinues: group_size_b / const_nunits, it means the number of
++ times an array is stored in memory. */
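++	      /* Illustrative sizes: group_size = 16, group_size_b = 4 and
++		 const_nunits = 4 give array_num = 4 and ncontinues = 1,
++		 i.e. each of the four arrays is stored by a single vector
++		 store.  */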
++ if (slp && is_a <bb_vec_info> (vinfo)
++ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
++ {
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "vectorizable_store for slp transpose.\n");
++ }
++ /* Transpose back for grouped stores. */
++ vect_transform_back_slp_grouped_stores (bb_vinfo,
++ first_stmt_info);
++
++ result_chain.create (vec_oprnds.length ());
++ unsigned int const_nunits = nunits.to_constant ();
++ unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info);
++ unsigned int array_num = group_size / group_size_b;
++ transpose_oprnd_store (vinfo, vec_oprnds, &result_chain, vec_num,
++ const_nunits, array_num,
++ first_stmt_info, gsi);
++
++ /* For every store group, not for every vec, because transposing
++ and merging have changed the data reference access. */
++ gcc_assert (group_size_b >= const_nunits);
++ unsigned int ncontinues = group_size_b / const_nunits;
++
++ unsigned int k = 0;
++ for (i = 0; i < array_num; i++)
++ {
++ stmt_vec_info first_stmt_b;
++ BB_VINFO_GROUPED_STORES (vinfo).iterate (i, &first_stmt_b);
++ bool simd_lane_access_p
++ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_b) != 0;
++ tree ref_type = get_group_alias_ptr_type (first_stmt_b);
++ dataref_ptr = vect_create_data_ref_ptr (
++ vinfo, first_stmt_b, aggr_type,
++ simd_lane_access_p ? loop : NULL,
++ offset, &dummy, gsi, &ptr_incr,
++ simd_lane_access_p, bump);
++ dr_vec_info *cur_first_dr_info = get_dr_info (first_stmt_b);
++ for (unsigned int t = 0; t < ncontinues; t++)
++ {
++ vec_oprnd = result_chain[k];
++ k++;
++ if (t > 0)
++ {
++ /* Bump the vector pointer. */
++ dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr,
++ ptr_incr, gsi,
++ first_stmt_b, bump);
++ }
++ add_new_stmt_vect_store (vinfo, vectype, dataref_ptr,
++ dataref_offset, ref_type,
++ cur_first_dr_info, vec_oprnd,
++ gsi, first_stmt_b);
++ }
++ }
++ oprnds.release ();
++ result_chain.release ();
++ vec_oprnds.release ();
++ return true;
++ }
+ new_stmt = NULL;
+ if (grouped_store)
+ {
+@@ -8719,6 +9129,451 @@ hoist_defs_of_uses (stmt_vec_info stmt_info, class loop *loop)
+ return true;
+ }
+
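++/* Choose the type used to load one part of a transposed SLP group: either a
++   vector-of-vectors type packing NLOADS arrays into one vector (when
++   GROUP_SIZE_B < CONST_NUNITS and a composition type exists) or VECTYPE
++   itself loaded NCONTINUES times per array.  Return the aligned load type
++   and set LVECTYPE, NLOADS and NCONTINUES accordingly.  */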
++static tree
++calculate_new_type (tree vectype, unsigned int const_nunits,
++ unsigned int group_size_b, unsigned int &nloads,
++ unsigned int &ncontinues, tree &lvectype)
++{
++ tree ltype = TREE_TYPE (vectype);
++ /* nloads is the number of ARRAYs in a vector.
++ vectemp = {a[], b[], ...} */
++ if (group_size_b < const_nunits)
++ {
++ tree ptype;
++ tree vtype
++ = vector_vector_composition_type (vectype,
++ const_nunits / group_size_b,
++ &ptype);
++ if (vtype != NULL_TREE)
++ {
++ nloads = const_nunits / group_size_b;
++ lvectype = vtype;
++ ltype = ptype;
++ ncontinues = 1;
++ }
++ }
++ /* ncontinues is the number of vectors from an ARRAY.
++ vectemp1 = {a[0], a[1], ...}
++ ...
++ vectempm = {a[k], a[k+1], ...} */
++ else
++ {
++ nloads = 1;
++ ltype = vectype;
++ ncontinues = group_size_b / const_nunits;
++ }
++ ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
++ return ltype;
++}
++
++static void
++generate_old_load_permutations (slp_tree slp_node, unsigned int group_size,
++ vec<unsigned> &old_load_permutation)
++{
++ /* Generate the old load permutations from the slp_node. */
++ unsigned i = 0;
++ unsigned k = 0;
++
++ /* If SLP_NODE has load_permutation, we copy it to old_load_permutation.
++ Otherwise, we generate a permutation sequentially. */
++ if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
++ {
++ FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), i, k)
++ {
++ old_load_permutation.safe_push (k);
++ }
++ }
++ else
++ {
++ for (unsigned i = 0; i < group_size; i++)
++ {
++ old_load_permutation.safe_push (i);
++ }
++ }
++}
++
++static void
++generate_new_load_permutation_mapping (unsigned slp_node_length,
++ vec<unsigned> &group_idx,
++ const vec<unsigned> &load_permutation,
++ unsigned int group_size_b,
++ unsigned &new_group_size,
++ vec<unsigned> &group_from)
++{
++  /* group_num_vec: only stores the group_loads IDs which are calculated
++     from load_permutation.  */
++ auto_vec<unsigned> group_num_vec;
++
++  /* Calculate which group_loads the stmts in SLP_NODE come from.  */
++ unsigned i = 0;
++ unsigned k = 0;
++ FOR_EACH_VEC_ELT (load_permutation, i, k)
++ {
++ unsigned int t0 = k / group_size_b;
++ if (!group_num_vec.contains (t0))
++ {
++ group_num_vec.safe_push (t0);
++ }
++ group_from.safe_push (t0);
++ }
++ group_num_vec.qsort (cmp_for_group_num);
++ /* n_groups: the number of group_loads. */
++ unsigned int n_groups = group_num_vec.length ();
++ new_group_size = n_groups * group_size_b;
++ for (i = 0; i < n_groups; i++)
++ {
++ group_idx.safe_push (group_num_vec[i] * group_size_b);
++ }
++ /* A new mapping from group_ind_vec to group_from.
++ For example:
++ Origin: group_from = {1,1,3,3,5,5,7,7};
++ After mapping: group_from = {0,0,1,1,2,2,2,2}; */
++ auto_vec<unsigned> group_ind_vec (n_groups);
++ for (k = 0; k < n_groups; k++)
++ {
++ group_ind_vec.safe_push (k);
++ }
++ for (i = 0; i < slp_node_length; i++)
++ {
++ for (k = 0; k < n_groups; k++)
++ {
++ if (group_from[i] == group_num_vec[k])
++ {
++ group_from[i] = group_ind_vec[k];
++ break;
++ }
++ }
++ }
++}
++
++static void
++generate_new_load_permutation (vec<unsigned> &new_load_permutation,
++ const vec<unsigned> &old_load_permutation,
++ slp_tree slp_node, bool &this_load_permuted,
++ const vec<unsigned> &group_from,
++ unsigned int group_size_b)
++{
++ unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length ();
++ /* Generate the new load permutation from the new mapping. */
++ new_load_permutation.create (slp_node_length);
++ unsigned i = 0;
++ unsigned k = 0;
++ FOR_EACH_VEC_ELT (old_load_permutation, i, k)
++ {
++ /* t1 is the new permutation of k in the old permutation.
++ t1 = base_address + offset:
++ base_address = group_from[i] * group_size_b;
++ offset = k % group_size_b. */
++ unsigned int t1
++ = group_from[i] * group_size_b + k % group_size_b;
++ new_load_permutation.safe_push (t1);
++ if (t1 != k)
++ {
++ this_load_permuted = true;
++ }
++ }
++}
++
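++/* Return true if the load still needs an SLP permutation after the group
++   transposition, following the same rule as the generic SLP load path.  */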
++static bool
++is_slp_perm (bool slp_perm, bool this_load_permuted, poly_uint64 nunits,
++ unsigned int group_size, stmt_vec_info first_stmt_info)
++{
++ /* Calculate the unrolling factor based on the smallest type. */
++ poly_uint64 unrolling_factor
++ = exact_div (common_multiple (nunits, group_size), group_size);
++ /* The load requires permutation when unrolling exposes
++ a gap either because the group is larger than the SLP
++ group-size or because there is a gap between the groups. */
++ if (!slp_perm && !this_load_permuted
++ && (known_eq (unrolling_factor, 1U)
++ || (group_size == DR_GROUP_SIZE (first_stmt_info)
++ && DR_GROUP_GAP (first_stmt_info) == 0)))
++ {
++ return false;
++ }
++ else
++ {
++ return true;
++ }
++}
++
++static void
++generate_load_permutation (slp_tree slp_node, unsigned &new_group_size,
++ unsigned int group_size, unsigned int group_size_b,
++ bool &this_load_permuted, vec<unsigned> &group_idx,
++ vec<unsigned> &new_load_permutation)
++{
++ /* Generate the old load permutations from SLP_NODE. */
++ vec<unsigned> old_load_permutation;
++ old_load_permutation.create (group_size);
++ generate_old_load_permutations (slp_node, group_size, old_load_permutation);
++
++  /* Calculate which group_loads the stmts in SLP_NODE come from.  */
++ unsigned slp_node_length = SLP_TREE_SCALAR_STMTS (slp_node).length ();
++ /* group_from: stores the group_loads ID for every stmt in SLP_NODE. */
++ vec<unsigned> group_from;
++ group_from.create (slp_node_length);
++ generate_new_load_permutation_mapping (slp_node_length, group_idx,
++ old_load_permutation,
++ group_size_b, new_group_size,
++ group_from);
++
++  /* Generate the new load permutation from the new mapping and calculate
++     the this_load_permuted flag.  If this_load_permuted is true, we need to
++     execute the SLP permutation using the new load permutation.  */
++ generate_new_load_permutation (new_load_permutation, old_load_permutation,
++ slp_node, this_load_permuted, group_from,
++ group_size_b);
++ old_load_permutation.release ();
++ group_from.release ();
++}
++
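++/* Like dr_align_vect_store, but for loads: compute the target alignment of
++   CUR_FIRST_DR_INFO into ALIGN and return the misalignment implied by
++   ALIGNMENT_SUPPORT_SCHEME.  */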
++static unsigned int
++dr_align_vect_load (vec_info *vinfo, dr_vec_info *cur_first_dr_info,
++ tree vectype, unsigned HOST_WIDE_INT &align,
++ enum dr_alignment_support alignment_support_scheme)
++{
++ unsigned int misalign = 0;
++
++ align = known_alignment (DR_TARGET_ALIGNMENT (cur_first_dr_info));
++ if (alignment_support_scheme == dr_aligned)
++ {
++ gcc_assert (aligned_access_p (cur_first_dr_info, vectype));
++ }
++ else if (cur_first_dr_info->misalignment == -1)
++ {
++ align = dr_alignment (vect_dr_behavior (vinfo, cur_first_dr_info));
++ }
++ else
++ {
++ misalign = cur_first_dr_info->misalignment;
++ }
++ return misalign;
++}
++
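++/* Emit a vector load of type LTYPE from DATAREF_PTR, mirroring
++   add_new_stmt_vect_store, and return the stmt_vec_info of the new
++   load stmt.  */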
++static stmt_vec_info
++add_new_stmt_vect_load (vec_info *vinfo, tree vectype, tree dataref_ptr,
++ tree dataref_offset, tree ref_type, tree ltype,
++ gassign *(&new_stmt), dr_vec_info *cur_first_dr_info,
++ gimple_stmt_iterator *gsi, stmt_vec_info stmt_info)
++{
++ /* Data align. */
++ int malign = dr_misalignment (cur_first_dr_info, vectype);
++ enum dr_alignment_support alignment_support_scheme
++ = vect_supportable_dr_alignment (vinfo, cur_first_dr_info,
++ vectype, malign);
++ unsigned HOST_WIDE_INT align;
++ unsigned int misalign = dr_align_vect_load (vinfo, cur_first_dr_info,
++ vectype, align,
++ alignment_support_scheme);
++ if (dataref_offset == NULL_TREE && TREE_CODE (dataref_ptr) == SSA_NAME)
++ {
++ set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign);
++ }
++
++ /* Get data_ref. */
++ tree offset = dataref_offset ? dataref_offset : build_int_cst (ref_type, 0);
++ tree data_ref = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
++ if (alignment_support_scheme == dr_aligned)
++ {
++ ;
++ }
++ else if (cur_first_dr_info->misalignment == -1)
++ {
++ TREE_TYPE (data_ref)
++ = build_aligned_type (TREE_TYPE (data_ref), align * BITS_PER_UNIT);
++ }
++ else
++ {
++ tree elem_type = TREE_TYPE (vectype);
++ TREE_TYPE (data_ref)
++ = build_aligned_type (TREE_TYPE (data_ref), TYPE_ALIGN (elem_type));
++ }
++
++ /* Add new stmt. */
++ vect_copy_ref_info (data_ref, DR_REF (cur_first_dr_info->dr));
++ new_stmt = gimple_build_assign (make_ssa_name (ltype), data_ref);
++ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi, true);
++ stmt_vec_info vec_stmt_info = vinfo->lookup_stmt (new_stmt);
++ return vec_stmt_info;
++}
++
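++/* Record a newly created vector load: push it onto DR_CHAIN when an SLP
++   permutation is still needed, otherwise directly onto the vector stmts of
++   SLP_NODE.  */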
++static void
++push_new_stmt_to_dr_chain (bool slp_perm, stmt_vec_info new_stmt_info,
++ vec<tree> dr_chain, slp_tree slp_node)
++{
++ if (slp_perm)
++ dr_chain.quick_push (gimple_assign_lhs (new_stmt_info->stmt));
++ else
++ SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info->stmt);
++}
++
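++/* Walk GROUP_EL elements down the interleaving chain starting at
++   FIRST_STMT_INFO and return the stmt_vec_info found there; GROUP_EL must
++   be smaller than GROUP_SIZE.  */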
++static stmt_vec_info
++get_first_stmt_info_before_transpose (stmt_vec_info first_stmt_info,
++ unsigned int group_el,
++ unsigned int group_size)
++{
++ stmt_vec_info last_stmt_info = first_stmt_info;
++ unsigned int count = 0;
++ gcc_assert (group_el < group_size);
++ while (count < group_el)
++ {
++ last_stmt_info = DR_GROUP_NEXT_ELEMENT (last_stmt_info);
++ count++;
++ }
++ return last_stmt_info;
++}
++
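++/* Build a CONSTRUCTOR of type LVECTYPE from the loaded parts in V and, when
++   LVECTYPE differs from VECTYPE, view-convert the result to VECTYPE.  Return
++   the stmt_vec_info of the resulting definition.  */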
++static stmt_vec_info
++add_new_stmt_for_nloads_greater_than_one (vec_info *vinfo, tree lvectype,
++ tree vectype,
++ vec<constructor_elt, va_gc> *v,
++ stmt_vec_info stmt_info,
++ gimple_stmt_iterator *gsi)
++{
++ tree vec_inv = build_constructor (lvectype, v);
++  tree new_temp = vect_init_vector (vinfo, stmt_info, vec_inv, lvectype,
++				    gsi, true);
++ stmt_vec_info new_stmt_info = vinfo->lookup_def (new_temp);
++ if (lvectype != vectype)
++ {
++ gassign *new_stmt = gimple_build_assign (make_ssa_name (vectype),
++ VIEW_CONVERT_EXPR,
++ build1 (VIEW_CONVERT_EXPR,
++ vectype, new_temp));
++ vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi, true);
++ new_stmt_info = vinfo->lookup_stmt (new_stmt);
++ }
++ return new_stmt_info;
++}
++
++/* Function new_vect_stmt_for_nloads.
++
++   Create a VEC_STMT when NLOADS arrays are merged into one vector.
++
++   ncopies is the number of vectors that need to be loaded from memory.
++ nloads is the number of ARRAYs in a vector.
++ vectemp = {a[], b[], ...} */
++
++static void
++new_vect_stmt_for_nloads (vec_info *vinfo, unsigned int ncopies,
++ unsigned int nloads, const vec<unsigned> &group_idx,
++ stmt_vec_info stmt_info, offset_info *offset_info,
++ vectype_info *vectype_info,
++ vect_memory_access_type memory_access_type,
++ bool slp_perm, vec<tree> dr_chain, slp_tree slp_node,
++ gimple_stmt_iterator *gsi)
++{
++ vec<constructor_elt, va_gc> *v = NULL;
++ stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
++ unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
++ stmt_vec_info first_stmt_info_b = NULL;
++ stmt_vec_info new_stmt_info = NULL;
++ tree dataref_ptr = NULL_TREE;
++ tree dummy;
++ gimple *ptr_incr = NULL;
++ unsigned int n = 0;
++ for (unsigned int i = 0; i < ncopies; i++)
++ {
++ vec_alloc (v, nloads);
++ for (unsigned int t = 0; t < nloads; t++)
++ {
++ first_stmt_info_b = get_first_stmt_info_before_transpose (
++ first_stmt_info, group_idx[n++], group_size);
++ dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b);
++ tree bump = vect_get_data_ptr_increment (vinfo, cur_first_dr_info,
++ vectype_info->ltype,
++ memory_access_type);
++ bool simd_lane_access_p
++ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0;
++
++	      /* Create dataref_ptr, which points to init_address.  */
++ dataref_ptr = vect_create_data_ref_ptr (
++ vinfo, first_stmt_info_b, vectype_info->ltype, NULL,
++ offset_info->offset, &dummy, gsi, &ptr_incr,
++ simd_lane_access_p, bump);
++
++ gassign *new_stmt = NULL;
++ new_stmt_info = add_new_stmt_vect_load (vinfo, vectype_info->vectype, dataref_ptr,
++ offset_info->dataref_offset,
++ vectype_info->ref_type, vectype_info->ltype,
++ new_stmt, cur_first_dr_info, gsi,
++ first_stmt_info_b);
++
++ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_assign_lhs (new_stmt));
++ }
++ new_stmt_info = add_new_stmt_for_nloads_greater_than_one (
++ vinfo, vectype_info->lvectype,
++ vectype_info->vectype, v,
++ first_stmt_info_b, gsi);
++ push_new_stmt_to_dr_chain (slp_perm, new_stmt_info,
++ dr_chain, slp_node);
++ }
++}
++
++/* Function new_vect_stmt_for_ncontinues.
++
++   Create VEC_STMTs when an array is divided into several vectors.
++
++ n_groups is the number of ARRAYs.
++ ncontinues is the number of vectors from an ARRAY.
++ vectemp1 = {a[0], a[1], ...}
++ ...
++ vectempm = {a[k], a[k+1], ...} */
++
++static void
++new_vect_stmt_for_ncontinues (vec_info *vinfo, unsigned int ncontinues,
++ const vec<unsigned> &group_idx,
++ stmt_vec_info stmt_info,
++ offset_info* offset_info,
++ vectype_info* vectype_info,
++ vect_memory_access_type memory_access_type,
++ bool slp_perm, vec<tree> &dr_chain,
++ slp_tree slp_node,
++ gimple_stmt_iterator *gsi)
++{
++ stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
++ unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
++ stmt_vec_info new_stmt_info = NULL;
++ tree dataref_ptr = NULL_TREE;
++ tree dummy;
++ gimple *ptr_incr = NULL;
++ unsigned int n_groups = group_idx.length ();
++ for (unsigned int i = 0; i < n_groups; i++)
++ {
++ stmt_vec_info first_stmt_info_b = get_first_stmt_info_before_transpose (
++ first_stmt_info, group_idx[i], group_size);
++ dr_vec_info* cur_first_dr_info = get_dr_info (first_stmt_info_b);
++ tree bump = vect_get_data_ptr_increment (vinfo, cur_first_dr_info,
++ vectype_info->ltype, memory_access_type);
++ bool simd_lane_access_p
++ = STMT_VINFO_SIMD_LANE_ACCESS_P (first_stmt_info_b) != 0;
++ for (unsigned int k = 0; k < ncontinues; k++)
++ {
++	  /* Create dataref_ptr, which points to init_address.  */
++ if (k == 0)
++ {
++ dataref_ptr = vect_create_data_ref_ptr (
++ vinfo, first_stmt_info_b, vectype_info->ltype, NULL,
++ offset_info->offset, &dummy, gsi, &ptr_incr,
++ simd_lane_access_p, bump);
++ }
++ else
++ {
++ dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
++ gsi, first_stmt_info_b, bump);
++ }
++ gassign *new_stmt = NULL;
++ new_stmt_info = add_new_stmt_vect_load (vinfo, vectype_info->vectype, dataref_ptr,
++ offset_info->dataref_offset,
++ vectype_info->ref_type, vectype_info->ltype,
++ new_stmt, cur_first_dr_info, gsi,
++ first_stmt_info_b);
++ push_new_stmt_to_dr_chain (slp_perm, new_stmt_info,
++ dr_chain, slp_node);
++ }
++ }
++}
++
+ /* vectorizable_load.
+
+ Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure)
+@@ -9601,6 +10458,9 @@ vectorizable_load (vec_info *vinfo,
+ }
+ tree vec_mask = NULL_TREE;
+ poly_uint64 group_elt = 0;
++ unsigned new_group_size = 0;
++ vec<unsigned> new_load_permutation;
++
+ for (j = 0; j < ncopies; j++)
+ {
+ /* 1. Create the vector or array pointer update chain. */
+@@ -9621,6 +10481,15 @@ vectorizable_load (vec_info *vinfo,
+ dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr));
+ dataref_offset = build_int_cst (ref_type, 0);
+ }
++	  /* If the stmt_info needs transpose recovery, dataref_ptr will be
++	     calculated later.  */
++ else if (slp && is_a <bb_vec_info> (vinfo)
++ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++ && DR_GROUP_SLP_TRANSPOSE (
++ DR_GROUP_FIRST_ELEMENT (stmt_info)))
++ {
++ dataref_ptr = NULL_TREE;
++ }
+ else if (diff_first_stmt_info)
+ {
+ dataref_ptr
+@@ -9731,6 +10600,63 @@ vectorizable_load (vec_info *vinfo,
+ /* Record that VEC_ARRAY is now dead. */
+ vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
+ }
++ else if (slp && is_a <bb_vec_info> (vinfo)
++ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
++ {
++ if (dump_enabled_p ())
++ {
++ dump_printf_loc (MSG_NOTE, vect_location,
++ "vectorizable_load for slp transpose.\n");
++ }
++	  /* group_size: the size of the group after merging.
++	     group_size_b: the size of the group before merging.
++ const_nunits: TYPE_VECTOR_SUBPARTS (vectype), it is the number of
++ elements in a vector.
++ nloads: const_nunits / group_size_b or 1, it means the number
++ of ARRAYs in a vector.
++ ncontinues: group_size_b / const_nunits or 1, it means the number
++ of vectors from an ARRAY. */
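++	  /* Illustrative sizes, assuming the composition vector type exists:
++	     with const_nunits = 8 and group_size_b = 4, nloads = 2 and
++	     ncontinues = 1, so two arrays fill one vector; with
++	     group_size_b = 16, nloads = 1 and ncontinues = 2, so each array
++	     needs two vector loads.  */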
++ unsigned int group_size_b = DR_GROUP_SIZE_TRANS (first_stmt_info);
++ unsigned int const_nunits = nunits.to_constant ();
++ unsigned int nloads = const_nunits;
++ unsigned int ncontinues = group_size_b;
++ tree lvectype = vectype;
++ tree ltype = calculate_new_type (vectype, const_nunits,
++ group_size_b, nloads,
++ ncontinues, lvectype);
++ bool this_load_permuted = false;
++ auto_vec<unsigned> group_idx;
++ generate_load_permutation (slp_node, new_group_size, group_size,
++ group_size_b, this_load_permuted,
++ group_idx, new_load_permutation);
++ slp_perm = is_slp_perm (slp_perm, this_load_permuted, nunits,
++ group_size, first_stmt_info);
++
++ /* ncopies: the number of vectors that need to be loaded from
++	     memory.  */
++ unsigned int ncopies = new_group_size / const_nunits;
++ offset_info offset_info = {offset, NULL_TREE, dataref_offset};
++ vectype_info vectype_info = {vectype, ltype, lvectype, ref_type};
++ if (slp_perm)
++ {
++ dr_chain.create (ncopies);
++ }
++ if (nloads > 1 && ncontinues == 1)
++ {
++ new_vect_stmt_for_nloads (vinfo, ncopies, nloads, group_idx,
++ stmt_info, &offset_info, &vectype_info,
++ memory_access_type, slp_perm, dr_chain,
++ slp_node, gsi);
++ }
++ else
++ {
++ new_vect_stmt_for_ncontinues (vinfo, ncontinues, group_idx,
++ stmt_info, &offset_info,
++ &vectype_info, memory_access_type,
++ slp_perm, dr_chain, slp_node, gsi);
++ }
++ }
+ else
+ {
+ for (i = 0; i < vec_num; i++)
+@@ -10177,7 +11103,32 @@ vectorizable_load (vec_info *vinfo,
+ if (slp && !slp_perm)
+ continue;
+
+- if (slp_perm)
++	  /* Use the new load permutation to generate vector permute
++	     statements from the list of loads in DR_CHAIN.  */
++ if (slp && slp_perm && is_a <bb_vec_info> (vinfo)
++ && STMT_VINFO_GROUPED_ACCESS (stmt_info)
++ && DR_GROUP_SLP_TRANSPOSE (DR_GROUP_FIRST_ELEMENT (stmt_info)))
++ {
++ unsigned n_perms;
++ stmt_vec_info stmt_info_ = SLP_TREE_SCALAR_STMTS (slp_node)[0];
++	  unsigned int old_size = DR_GROUP_SIZE (stmt_info_);
++ DR_GROUP_SIZE (stmt_info_) = new_group_size;
++ vec<unsigned> old_load_permutation
++ = SLP_TREE_LOAD_PERMUTATION (slp_node);
++ SLP_TREE_LOAD_PERMUTATION (slp_node) = new_load_permutation;
++ bool perm_load_success = vect_transform_slp_perm_load (
++ vinfo, slp_node, dr_chain, gsi, vf,
++ false, &n_perms);
++ DR_GROUP_SIZE (stmt_info_) = old_size;
++ SLP_TREE_LOAD_PERMUTATION (slp_node) = old_load_permutation;
++ new_load_permutation.release ();
++ if (!perm_load_success)
++ {
++ dr_chain.release ();
++ return false;
++ }
++ }
++ else if (slp_perm)
+ {
+ unsigned n_perms;
+ /* For SLP we know we've seen all possible uses of dr_chain so
+diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
+index 642eb0aeb..e13bc6c99 100644
+--- a/gcc/tree-vectorizer.h
++++ b/gcc/tree-vectorizer.h
+@@ -412,6 +412,21 @@ public:
+ vec<ddr_p> ddrs;
+ };
+
++/* Information about offset in vectorizable_load. */
++struct offset_info {
++ tree offset;
++ tree byte_offset;
++ tree dataref_offset;
++};
++
++/* Information about vectype in vectorizable_load. */
++struct vectype_info {
++ tree vectype;
++ tree ltype;
++ tree lvectype;
++ tree ref_type;
++};
++
+ /* Vectorizer state common between loop and basic-block vectorization. */
+ class vec_info {
+ public:
+@@ -455,6 +470,14 @@ public:
+ stmt in the chain. */
+ auto_vec<stmt_vec_info> grouped_stores;
+
++ /* All interleaving chains of loads, represented by the first
++ stmt in the chain. */
++ auto_vec<stmt_vec_info> grouped_loads;
++
++  /* All interleaving chains of stores (before transposing), each
++     represented by all the stmts in the chain.  */
++ auto_vec<vec<stmt_vec_info> > scalar_stores;
++
+ /* The set of vector modes used in the vectorized region. */
+ mode_set used_vector_modes;
+
+@@ -899,6 +922,8 @@ public:
+ #define LOOP_VINFO_CHECK_NONZERO(L) (L)->check_nonzero
+ #define LOOP_VINFO_LOWER_BOUNDS(L) (L)->lower_bounds
+ #define LOOP_VINFO_GROUPED_STORES(L) (L)->grouped_stores
++#define LOOP_VINFO_GROUPED_LOADS(L) (L)->grouped_loads
++#define LOOP_VINFO_SCALAR_STORES(L) (L)->scalar_stores
+ #define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances
+ #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
+ #define LOOP_VINFO_REDUCTIONS(L) (L)->reductions
+@@ -982,6 +1007,25 @@ public:
+ vec<basic_block> bbs;
+
+ vec<slp_root> roots;
++
++  /* True if the analysis reached vect_analyze_slp.  */
++ bool before_slp;
++
++  /* True if this bb_vinfo is the transposed version.  */
++ bool transposed;
++
++ /* The number of transposed groups. */
++ int transposed_group;
++
++ /* The cost of the scalar iterations. */
++ int scalar_cost;
++
++ /* The cost of the vector prologue and epilogue, including peeled
++ iterations and set-up code. */
++ int vec_outside_cost;
++
++ /* The cost of the vector loop body. */
++ int vec_inside_cost;
+ } *bb_vec_info;
+
+ #define BB_VINFO_BB(B) (B)->bb
+@@ -989,6 +1033,14 @@ public:
+ #define BB_VINFO_SLP_INSTANCES(B) (B)->slp_instances
+ #define BB_VINFO_DATAREFS(B) (B)->shared->datarefs
+ #define BB_VINFO_DDRS(B) (B)->shared->ddrs
++#define BB_VINFO_GROUPED_LOADS(B) (B)->grouped_loads
++#define BB_VINFO_SCALAR_STORES(B) (B)->scalar_stores
++#define BB_VINFO_VEC_OUTSIDE_COST(B) (B)->vec_outside_cost
++#define BB_VINFO_VEC_INSIDE_COST(B) (B)->vec_inside_cost
++#define BB_VINFO_SCALAR_COST(B) (B)->scalar_cost
++#define BB_VINFO_SLP_TRANSPOSED(B) (B)->transposed
++#define BB_VINFO_BEFORE_SLP(B) (B)->before_slp
++#define BB_VINFO_TRANS_GROUPS(B) (B)->transposed_group
+
+ /*-----------------------------------------------------------------*/
+ /* Info on vectorized defs. */
+@@ -1219,6 +1271,17 @@ public:
+ stmt_vec_info next_element;
+ /* The size of the group. */
+ unsigned int size;
++
++  /* The size of the group before transposing.  */
++ unsigned int size_before_transpose;
++
++  /* True if the stmt_info is SLP transposed.  */
++ bool slp_transpose;
++
++ /* The store group number, used to rebuild the interleaving chain during
++ the transpose phase. A value of -1 means the group cannot be transposed. */
++ int group_number;
++
+ /* For stores, number of stores from this group seen. We vectorize the last
+ one. */
+ unsigned int store_count;
+@@ -1226,6 +1289,9 @@ public:
+ is 1. */
+ unsigned int gap;
+
++ /* The gap before transposition. */
++ unsigned int gap_before_transpose;
++
+ /* The minimum negative dependence distance this stmt participates in
+ or zero if none. */
+ unsigned int min_neg_dist;
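The *_before_transpose fields keep the original group geometry so it can be restored if the transposed layout is rejected. As a data-layout sketch of what the transposition changes, assuming four strided store groups of size four:

  /* Four interleaved store groups become four contiguous ones:
     element i of group j moves to element j of group i, the layout
     SLP store vectorization prefers.  */
  void
  transpose_groups (unsigned char dst[4][4], const unsigned char src[4][4])
  {
    for (int i = 0; i < 4; i++)
      for (int j = 0; j < 4; j++)
        dst[j][i] = src[i][j];
  }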
+@@ -1427,6 +1493,12 @@ struct gather_scatter_info {
+ #define STMT_VINFO_SLP_VECT_ONLY(S) (S)->slp_vect_only_p
+ #define STMT_VINFO_SLP_VECT_ONLY_PATTERN(S) (S)->slp_vect_pattern_only_p
+
++#define DR_GROUP_SLP_TRANSPOSE(S) \
++ (gcc_checking_assert ((S)->dr_aux.dr), (S)->slp_transpose)
++#define DR_GROUP_SIZE_TRANS(S) \
++ (gcc_checking_assert ((S)->dr_aux.dr), (S)->size_before_transpose)
++#define DR_GROUP_NUMBER(S) \
++ (gcc_checking_assert ((S)->dr_aux.dr), (S)->group_number)
+ #define DR_GROUP_FIRST_ELEMENT(S) \
+ (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element)
+ #define DR_GROUP_NEXT_ELEMENT(S) \
+@@ -1437,6 +1509,8 @@ struct gather_scatter_info {
+ (gcc_checking_assert ((S)->dr_aux.dr), (S)->store_count)
+ #define DR_GROUP_GAP(S) \
+ (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap)
++#define DR_GROUP_GAP_TRANS(S) \
++ (gcc_checking_assert ((S)->dr_aux.dr), (S)->gap_before_transpose)
+
+ #define REDUC_GROUP_FIRST_ELEMENT(S) \
+ (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
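Like the surrounding DR_GROUP_* accessors, the new macros assert that the statement carries a data reference before touching the field. A hypothetical use in the transform phase, restoring the pre-transpose geometry on a group leader:

  /* Hypothetical: undo the transpose bookkeeping on the group leader.  */
  if (DR_GROUP_SLP_TRANSPOSE (first_stmt_info))
    {
      DR_GROUP_SIZE (first_stmt_info) = DR_GROUP_SIZE_TRANS (first_stmt_info);
      DR_GROUP_GAP (first_stmt_info) = DR_GROUP_GAP_TRANS (first_stmt_info);
    }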
+@@ -2033,6 +2107,17 @@ vect_get_scalar_dr_size (dr_vec_info *dr_info)
+ return tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_info->dr))));
+ }
+
++/* qsort comparator for two unsigned ints A and B, ordering them
++ ascending. */
++
++static inline int
++cmp_for_group_num (const void *a_, const void *b_)
++{
++ unsigned int a = *(const unsigned int *) a_;
++ unsigned int b = *(const unsigned int *) b_;
++ return a < b ? -1 : a > b ? 1 : 0;
++}
++
+ /* Return true if LOOP_VINFO requires a runtime check for whether the
+ vector loop is profitable. */
+
+@@ -2152,7 +2237,7 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
+
+ extern void vect_finish_replace_stmt (vec_info *, stmt_vec_info, gimple *);
+ extern void vect_finish_stmt_generation (vec_info *, stmt_vec_info, gimple *,
+- gimple_stmt_iterator *);
++ gimple_stmt_iterator *,bool transpose=false);
+ extern opt_result vect_mark_stmts_to_be_vectorized (loop_vec_info, bool *);
+ extern tree vect_get_store_rhs (stmt_vec_info);
+ void vect_get_vec_defs_for_operand (vec_info *vinfo, stmt_vec_info, unsigned,
+@@ -2168,7 +2253,7 @@ void vect_get_vec_defs (vec_info *, stmt_vec_info, slp_tree, unsigned,
+ tree = NULL, vec<tree> * = NULL, tree = NULL,
+ tree = NULL, vec<tree> * = NULL, tree = NULL);
+ extern tree vect_init_vector (vec_info *, stmt_vec_info, tree, tree,
+- gimple_stmt_iterator *);
++ gimple_stmt_iterator *, bool transpose = false);
+ extern tree vect_get_slp_vect_def (slp_tree, unsigned);
+ extern bool vect_transform_stmt (vec_info *, stmt_vec_info,
+ gimple_stmt_iterator *,
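Both vect_init_vector and vect_finish_stmt_generation gain a transpose flag that defaults to false, so every existing call site compiles unchanged and only the transposed code paths opt in:

  /* Existing callers are unaffected; transposed code passes true.  */
  vect_init_vector (vinfo, stmt_info, val, type, gsi);        /* as before */
  vect_init_vector (vinfo, stmt_info, val, type, gsi, true);  /* transposed */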
+@@ -2235,6 +2320,9 @@ extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
+ extern void vect_permute_store_chain (vec_info *, vec<tree> &,
+ unsigned int, stmt_vec_info,
+ gimple_stmt_iterator *, vec<tree> *);
++extern void vect_transpose_store_chain (vec_info *, vec<tree>, unsigned int,
++ unsigned int, stmt_vec_info,
++ gimple_stmt_iterator *, vec<tree> *);
+ extern tree vect_setup_realignment (vec_info *,
+ stmt_vec_info, gimple_stmt_iterator *,
+ tree *, enum dr_alignment_support, tree,
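vect_transpose_store_chain is declared alongside vect_permute_store_chain and emits the permutes that realize a transpose of the store chain. A scalar model of the data movement it performs, with the dimensions assumed for illustration:

  /* Scalar model: element j of input vector i ends up as element i of
     output vector j, for N vectors of WIDTH elements each.  */
  static void
  transpose_chain_model (const int *in, int *out, unsigned n, unsigned width)
  {
    for (unsigned i = 0; i < n; i++)
      for (unsigned j = 0; j < width; j++)
        out[j * n + i] = in[i * width + j];
  }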
+@@ -2262,7 +2350,8 @@ extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree,
+ enum tree_code);
+ extern bool needs_fold_left_reduction_p (tree, code_helper);
+ /* Drive for loop analysis stage. */
+-extern opt_loop_vec_info vect_analyze_loop (class loop *, vec_info_shared *);
++extern opt_loop_vec_info vect_analyze_loop (class loop *, vec_info_shared *,
++ bool result_only_p = false);
+ extern tree vect_build_loop_niters (loop_vec_info, bool * = NULL);
+ extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
+ tree *, bool);
+@@ -2331,6 +2420,7 @@ extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, const vec<tree>
+ gimple_stmt_iterator *, poly_uint64,
+ bool, unsigned *,
+ unsigned * = nullptr, bool = false);
++extern void vect_transform_back_slp_grouped_stores (bb_vec_info, stmt_vec_info);
+ extern bool vect_slp_analyze_operations (vec_info *);
+ extern void vect_schedule_slp (vec_info *, const vec<slp_instance> &);
+ extern opt_result vect_analyze_slp (vec_info *, unsigned);
+--
+2.33.0
+