diff options
Diffstat (limited to '0081-Loop-distribution-Insert-temp-arrays-built-from-isom.patch')
-rw-r--r-- | 0081-Loop-distribution-Insert-temp-arrays-built-from-isom.patch | 826 |
1 files changed, 826 insertions, 0 deletions
diff --git a/0081-Loop-distribution-Insert-temp-arrays-built-from-isom.patch b/0081-Loop-distribution-Insert-temp-arrays-built-from-isom.patch new file mode 100644 index 0000000..2197b2f --- /dev/null +++ b/0081-Loop-distribution-Insert-temp-arrays-built-from-isom.patch @@ -0,0 +1,826 @@ +From ca2a541ed3425bec64f97fe277c6c02bf4f20049 Mon Sep 17 00:00:00 2001 +From: benniaobufeijiushiji <linda7@huawei.com> +Date: Thu, 27 Oct 2022 10:26:34 +0800 +Subject: [PATCH 33/35] [Loop-distribution] Insert temp arrays built from + isomorphic stmts Use option -ftree-slp-transpose-vectorize Build temp arrays + for isomorphic stmt and regard them as new seed_stmts for loop distribution. + +--- + gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c | 67 +++ + gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c | 17 + + gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c | 19 + + gcc/tree-loop-distribution.c | 577 +++++++++++++++++++- + 4 files changed, 663 insertions(+), 17 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c + create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c + create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c + +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c +new file mode 100644 +index 000000000..649463647 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c +@@ -0,0 +1,67 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-do run { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details -save-temps" } */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++ ++static unsigned inline abs2 (unsigned a) ++{ ++ unsigned s = ((a>>15)&0x10001)*0xffff; ++ return (a+s)^s; ++} ++ ++int foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib) ++{ ++ unsigned tmp[4][4]; ++ unsigned a0, a1, a2, a3; ++ int sum = 0; ++ for (int i = 0; i < 4; i++, oxa += ia, oxb += ib) ++ { ++ a0 = (oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16); ++ a1 = (oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16); ++ a2 = (oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16); ++ a3 = (oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16); ++ int t0 = a0 + a1; ++ int t1 = a0 - a1; ++ int t2 = a2 + a3; ++ int t3 = a2 - a3; ++ tmp[i][0] = t0 + t2; ++ tmp[i][2] = t0 - t2; ++ tmp[i][1] = t1 + t3; ++ tmp[i][3] = t1 - t3; ++ } ++ for (int i = 0; i < 4; i++) ++ { ++ int t0 = tmp[0][i] + tmp[1][i]; ++ int t1 = tmp[0][i] - tmp[1][i]; ++ int t2 = tmp[2][i] + tmp[3][i]; ++ int t3 = tmp[2][i] - tmp[3][i]; ++ a0 = t0 + t2; ++ a2 = t0 - t2; ++ a1 = t1 + t3; ++ a3 = t1 - t3; ++ sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3); ++ } ++ return (((unsigned short) sum) + ((unsigned) sum >>16)) >> 1; ++} ++ ++int main () ++{ ++ unsigned char oxa[128] = {0}; ++ unsigned char oxb[128] = {0}; ++ for (int i = 0; i < 128; i++) ++ { ++ oxa[i] += i * 3; ++ oxb[i] = i * 2; ++ } ++ int sum = foo (oxa, 16, oxb, 32); ++ if (sum != 736) ++ { ++ abort (); ++ } ++ return 0; ++} ++ ++/* { dg-final { scan-tree-dump-times "Insertion done: 4 temp arrays inserted" 1 "ldist" } } */ ++/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" } } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c +new file mode 100644 +index 000000000..1b50fd27d +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c +@@ -0,0 +1,17 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details" } */ ++ ++unsigned a0[4], a1[4], a2[4], a3[4]; ++ ++void foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib) ++{ ++ for (int i = 0; i < 4; i++, oxa += ia, oxb += ib) ++ { ++ a0[i] = (oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16); ++ a1[i] = (oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16); ++ a2[i] = (oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16); ++ a3[i] = (oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16); ++ } ++} ++ ++/* { dg-final { scan-tree-dump-times "Loop 1 not distributed." 1 "ldist" } } */ +diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c +new file mode 100644 +index 000000000..94b992b05 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c +@@ -0,0 +1,19 @@ ++/* { dg-do compile { target { aarch64*-*-linux* } } } */ ++/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details" } */ ++ ++unsigned a0[4], a1[4], a2[4], a3[4]; ++ ++void foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib) ++{ ++ for (int i = 0; i < 4; i++, oxa += ia, oxb += ib) ++ { ++ a0[i] = ((oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16)) + 1; ++ a1[i] = ((oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16)) - 2; ++ a2[i] = ((oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16)) * 3; ++ a3[i] = ((oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16)) / 4; ++ } ++} ++ ++/* { dg-final { scan-tree-dump-times "Insertion done: 4 temp arrays inserted" 1 "ldist" } } */ ++/* { dg-final { scan-tree-dump-times "Insertion removed" 1 "ldist" } } */ ++/* { dg-final { scan-tree-dump-times "Loop 1 not distributed." 1 "ldist" } } */ +\ No newline at end of file +diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c +index c08af6562..88b56379c 100644 +--- a/gcc/tree-loop-distribution.c ++++ b/gcc/tree-loop-distribution.c +@@ -36,6 +36,47 @@ along with GCC; see the file COPYING3. If not see + | D(I) = A(I-1)*E + |ENDDO + ++ If an unvectorizable loop has grouped loads, and calculations from grouped ++ loads are isomorphic, build temp arrays using stmts where isomorphic ++ calculations end. Afer distribution, the partition built from temp ++ arrays can be vectorized in pass SLP after loop unrolling. For example, ++ ++ |DO I = 1, N ++ | A = FOO (ARG_1); ++ | B = FOO (ARG_2); ++ | C = BAR_0 (A); ++ | D = BAR_1 (B); ++ |ENDDO ++ ++ is transformed to ++ ++ |DO I = 1, N ++ | J = FOO (ARG_1); ++ | K = FOO (ARG_2); ++ | X[I] = J; ++ | Y[I] = K; ++ | A = X[I]; ++ | B = Y[I]; ++ | C = BAR_0 (A); ++ | D = BAR_1 (B); ++ |ENDDO ++ ++ and is then distributed to ++ ++ |DO I = 1, N ++ | J = FOO (ARG_1); ++ | K = FOO (ARG_2); ++ | X[I] = J; ++ | Y[I] = K; ++ |ENDDO ++ ++ |DO I = 1, N ++ | A = X[I]; ++ | B = Y[I]; ++ | C = BAR_0 (A); ++ | D = BAR_1 (B); ++ |ENDDO ++ + Loop distribution is the dual of loop fusion. It separates statements + of a loop (or loop nest) into multiple loops (or loop nests) with the + same loop header. The major goal is to separate statements which may +@@ -44,7 +85,9 @@ along with GCC; see the file COPYING3. If not see + + 1) Seed partitions with specific type statements. For now we support + two types seed statements: statement defining variable used outside +- of loop; statement storing to memory. ++ of loop; statement storing to memory. Moreover, for unvectorizable ++ loops, we try to find isomorphic stmts from grouped load and build ++ temp arrays as new seed statements. + 2) Build reduced dependence graph (RDG) for loop to be distributed. + The vertices (RDG:V) model all statements in the loop and the edges + (RDG:E) model flow and control dependencies between statements. +@@ -643,7 +686,8 @@ class loop_distribution + /* Returns true when PARTITION1 and PARTITION2 access the same memory + object in RDG. */ + bool share_memory_accesses (struct graph *rdg, +- partition *partition1, partition *partition2); ++ partition *partition1, partition *partition2, ++ hash_set<tree> *excluded_arrays); + + /* For each seed statement in STARTING_STMTS, this function builds + partition for it by adding depended statements according to RDG. +@@ -686,8 +730,9 @@ class loop_distribution + + /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution. + ALIAS_DDRS contains ddrs which need runtime alias check. */ +- void finalize_partitions (class loop *loop, vec<struct partition *> +- *partitions, vec<ddr_p> *alias_ddrs); ++ void finalize_partitions (class loop *loop, ++ vec<struct partition *> *partitions, ++ vec<ddr_p> *alias_ddrs, bitmap producers); + + /* Analyze loop form and if it's vectorizable to decide if we need to + insert temp arrays to distribute it. */ +@@ -701,6 +746,28 @@ class loop_distribution + + inline void rebuild_rdg (loop_p loop, struct graph *&rdg, + control_dependences *cd); ++ ++ /* If loop is not distributed, remove inserted temp arrays. */ ++ void remove_insertion (loop_p loop, struct graph *flow_only_rdg, ++ bitmap producers, struct partition *partition); ++ ++ /* Insert temp arrays if isomorphic computation exists. Temp arrays will be ++ regarded as SEED_STMTS for building partitions in succeeding processes. */ ++ bool insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts, ++ hash_set<tree> *tmp_array_vars, bitmap producers); ++ ++ void build_producers (loop_p loop, bitmap producers, ++ vec<gimple *> &transformed); ++ ++ void do_insertion (loop_p loop, struct graph *flow_only_rdg, tree iv, ++ bitmap cut_points, hash_set <tree> *tmp_array_vars, ++ bitmap producers); ++ ++ /* Fuse PARTITIONS built from inserted temp arrays into one partition, ++ fuse the rest into another. */ ++ void merge_remaining_partitions (vec<struct partition *> *partitions, ++ bitmap producers); ++ + /* Distributes the code from LOOP in such a way that producer statements + are placed before consumer statements. Tries to separate only the + statements from STMTS into separate loops. Returns the number of +@@ -1913,7 +1980,8 @@ loop_distribution::classify_partition (loop_p loop, + + bool + loop_distribution::share_memory_accesses (struct graph *rdg, +- partition *partition1, partition *partition2) ++ partition *partition1, partition *partition2, ++ hash_set <tree> *excluded_arrays) + { + unsigned i, j; + bitmap_iterator bi, bj; +@@ -1947,7 +2015,10 @@ loop_distribution::share_memory_accesses (struct graph *rdg, + if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0) + && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0) + && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0) +- && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0)) ++ && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0) ++ /* An exception, if PARTITION1 and PARTITION2 contain the ++ temp array we inserted, do not merge them. */ ++ && !excluded_arrays->contains (DR_REF (dr1))) + return true; + } + } +@@ -2909,13 +2980,47 @@ fuse_memset_builtins (vec<struct partition *> *partitions) + } + } + ++void ++loop_distribution::merge_remaining_partitions ++ (vec<struct partition *> *partitions, ++ bitmap producers) ++{ ++ struct partition *partition = NULL; ++ struct partition *p1 = NULL, *p2 = NULL; ++ for (unsigned i = 0; partitions->iterate (i, &partition); i++) ++ { ++ if (bitmap_intersect_p (producers, partition->stmts)) ++ { ++ if (p1 == NULL) ++ { ++ p1 = partition; ++ continue; ++ } ++ partition_merge_into (NULL, p1, partition, FUSE_FINALIZE); ++ } ++ else ++ { ++ if (p2 == NULL) ++ { ++ p2 = partition; ++ continue; ++ } ++ partition_merge_into (NULL, p2, partition, FUSE_FINALIZE); ++ } ++ partitions->unordered_remove (i); ++ partition_free (partition); ++ i--; ++ } ++} ++ + void + loop_distribution::finalize_partitions (class loop *loop, + vec<struct partition *> *partitions, +- vec<ddr_p> *alias_ddrs) ++ vec<ddr_p> *alias_ddrs, ++ bitmap producers) + { + unsigned i; +- struct partition *partition, *a; ++ struct partition *partition; + + if (partitions->length () == 1 + || alias_ddrs->length () > 0) +@@ -2947,13 +3052,7 @@ loop_distribution::finalize_partitions (class loop *loop, + || (loop->inner == NULL + && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin)) + { +- a = (*partitions)[0]; +- for (i = 1; partitions->iterate (i, &partition); ++i) +- { +- partition_merge_into (NULL, a, partition, FUSE_FINALIZE); +- partition_free (partition); +- } +- partitions->truncate (1); ++ merge_remaining_partitions (partitions, producers); + } + + /* Fuse memset builtins if possible. */ +@@ -3758,6 +3857,404 @@ find_isomorphic_stmts (loop_vec_info vinfo, vec<gimple *> &stmts) + return decide_stmts_by_profit (candi_stmts, stmts); + } + ++/* Get iv from SEED_STMTS and make sure each seed_stmt has only one iv as index ++ and all indices are the same. */ ++ ++static tree ++find_index (vec<gimple *> seed_stmts) ++{ ++ if (seed_stmts.length () == 0) ++ return NULL; ++ bool found_index = false; ++ tree index = NULL; ++ unsigned ui = 0; ++ for (ui = 0; ui < seed_stmts.length (); ui++) ++ { ++ if (!gimple_vdef (seed_stmts[ui])) ++ return NULL; ++ tree lhs = gimple_assign_lhs (seed_stmts[ui]); ++ unsigned num_index = 0; ++ while (TREE_CODE (lhs) == ARRAY_REF) ++ { ++ if (TREE_CODE (TREE_OPERAND (lhs, 1)) == SSA_NAME) ++ { ++ num_index++; ++ if (num_index > 1) ++ return NULL; ++ if (index == NULL) ++ { ++ index = TREE_OPERAND (lhs, 1); ++ found_index = true; ++ } ++ else if (index != TREE_OPERAND (lhs, 1)) ++ return NULL; ++ } ++ lhs = TREE_OPERAND (lhs, 0); ++ } ++ if (!found_index) ++ return NULL; ++ } ++ return index; ++} ++ ++/* Check if expression of phi is an increament of a const. */ ++ ++static void ++check_phi_inc (struct vertex *v_phi, struct graph *rdg, bool &found_inc) ++{ ++ struct graph_edge *e_phi; ++ for (e_phi = v_phi->succ; e_phi; e_phi = e_phi->succ_next) ++ { ++ struct vertex *v_inc = &(rdg->vertices[e_phi->dest]); ++ if (!is_gimple_assign (RDGV_STMT (v_inc)) ++ || gimple_expr_code (RDGV_STMT (v_inc)) != PLUS_EXPR) ++ continue; ++ tree rhs1 = gimple_assign_rhs1 (RDGV_STMT (v_inc)); ++ tree rhs2 = gimple_assign_rhs2 (RDGV_STMT (v_inc)); ++ if (!(integer_onep (rhs1) || integer_onep (rhs2))) ++ continue; ++ struct graph_edge *e_inc; ++ /* find cycle with only two vertices inc and phi: inc <--> phi. */ ++ bool found_cycle = false; ++ for (e_inc = v_inc->succ; e_inc; e_inc = e_inc->succ_next) ++ { ++ if (e_inc->dest == e_phi->src) ++ { ++ found_cycle = true; ++ break; ++ } ++ } ++ if (!found_cycle) ++ continue; ++ found_inc = true; ++ } ++} ++ ++/* Check if phi satisfies form like PHI <0, i>. */ ++ ++static inline bool ++iv_check_phi_stmt (gimple *phi_stmt) ++{ ++ return gimple_phi_num_args (phi_stmt) == 2 ++ && (integer_zerop (gimple_phi_arg_def (phi_stmt, 0)) ++ || integer_zerop (gimple_phi_arg_def (phi_stmt, 1))); ++} ++ ++/* Make sure the iteration varible is a phi. */ ++ ++static tree ++get_iv_from_seed (struct graph *flow_only_rdg, vec<gimple *> seed_stmts) ++{ ++ tree index = find_index (seed_stmts); ++ if (index == NULL) ++ return NULL; ++ for (int i = 0; i < flow_only_rdg->n_vertices; i++) ++ { ++ struct vertex *v = &(flow_only_rdg->vertices[i]); ++ if (RDGV_STMT (v) != seed_stmts[0]) ++ continue; ++ struct graph_edge *e; ++ bool found_phi = false; ++ for (e = v->pred; e; e = e->pred_next) ++ { ++ struct vertex *v_phi = &(flow_only_rdg->vertices[e->src]); ++ gimple *phi_stmt = RDGV_STMT (v_phi); ++ if (gimple_code (phi_stmt) != GIMPLE_PHI ++ || gimple_phi_result (phi_stmt) != index) ++ continue; ++ if (!iv_check_phi_stmt (phi_stmt)) ++ return NULL; ++ /* find inc expr in succ of phi. */ ++ bool found_inc = false; ++ check_phi_inc (v_phi, flow_only_rdg, found_inc); ++ if (!found_inc) ++ return NULL; ++ found_phi = true; ++ break; ++ } ++ if (!found_phi) ++ return NULL; ++ break; ++ } ++ return index; ++} ++ ++/* Do not distribute loop if vertexes in ROOT_MAP have antidependence with in ++ FLOW_ONLY_RDG. */ ++ ++static bool ++check_no_dependency (struct graph *flow_only_rdg, bitmap root_map) ++{ ++ bitmap_iterator bi; ++ unsigned ui; ++ auto_vec<unsigned, 16> visited_nodes; ++ auto_bitmap visited_map; ++ EXECUTE_IF_SET_IN_BITMAP (root_map, 0, ui, bi) ++ visited_nodes.safe_push (ui); ++ for (ui = 0; ui < visited_nodes.length (); ui++) ++ { ++ struct vertex *v = &(flow_only_rdg->vertices[visited_nodes[ui]]); ++ struct graph_edge *e; ++ for (e = v->succ; e; e = e->succ_next) ++ { ++ if (bitmap_bit_p (root_map, e->dest)) ++ return false; ++ if (bitmap_bit_p (visited_map, e->dest)) ++ continue; ++ visited_nodes.safe_push (e->dest); ++ bitmap_set_bit (visited_map, e->dest); ++ } ++ } ++ return true; ++} ++ ++/* Find isomorphic stmts from GROUPED_LOADS in VINFO and make sure ++ there is no dependency among those STMT we found. */ ++ ++static unsigned ++get_cut_points (struct graph *flow_only_rdg, bitmap cut_points, ++ loop_vec_info vinfo) ++{ ++ unsigned n_stmts = 0; ++ ++ /* STMTS that may be CUT_POINTS. */ ++ auto_vec<gimple *> stmts; ++ if (!find_isomorphic_stmts (vinfo, stmts)) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "No temp array insertion: no isomorphic stmts" ++ " were found.\n"); ++ return 0; ++ } ++ ++ for (int i = 0; i < flow_only_rdg->n_vertices; i++) ++ { ++ if (stmts.contains (RDG_STMT (flow_only_rdg, i))) ++ bitmap_set_bit (cut_points, i); ++ } ++ n_stmts = bitmap_count_bits (cut_points); ++ ++ bool succ = check_no_dependency (flow_only_rdg, cut_points); ++ if (!succ) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "No temp array inserted: data dependency" ++ " among isomorphic stmts.\n"); ++ return 0; ++ } ++ return n_stmts; ++} ++ ++static void ++build_temp_array (struct vertex *v, gimple_stmt_iterator &gsi, ++ poly_uint64 array_extent, tree iv, ++ hash_set<tree> *tmp_array_vars, vec<gimple *> *transformed) ++{ ++ gimple *stmt = RDGV_STMT (v); ++ tree lhs = gimple_assign_lhs (stmt); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "original stmt:\t"); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS|TDF_MEMSYMS); ++ } ++ tree var_ssa = duplicate_ssa_name (lhs, stmt); ++ gimple_assign_set_lhs (stmt, var_ssa); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "changed to:\t"); ++ print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS | TDF_MEMSYMS); ++ } ++ gimple_set_uid (gsi_stmt (gsi), -1); ++ tree vect_elt_type = TREE_TYPE (lhs); ++ tree array_type = build_array_type_nelts (vect_elt_type, array_extent); ++ tree array = create_tmp_var (array_type); ++ tree array_ssa = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL); ++ tmp_array_vars->add (array_ssa); ++ gimple *store = gimple_build_assign (array_ssa, var_ssa); ++ tree new_vdef = make_ssa_name (gimple_vop (cfun), store); ++ gsi_insert_after (&gsi, store, GSI_NEW_STMT); ++ gimple_set_vdef (store, new_vdef); ++ transformed->safe_push (store); ++ gimple_set_uid (gsi_stmt (gsi), -1); ++ tree array_ssa2 = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL); ++ tmp_array_vars->add (array_ssa2); ++ gimple *load = gimple_build_assign (lhs, array_ssa2); ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ { ++ fprintf (dump_file, "insert stmt:\t"); ++ print_gimple_stmt (dump_file, store, 0, TDF_VOPS|TDF_MEMSYMS); ++ fprintf (dump_file, " and stmt:\t"); ++ print_gimple_stmt (dump_file, load, 0, TDF_VOPS|TDF_MEMSYMS); ++ } ++ gimple_set_vuse (load, new_vdef); ++ gsi_insert_after (&gsi, load, GSI_NEW_STMT); ++ gimple_set_uid (gsi_stmt (gsi), -1); ++} ++ ++/* Set bitmap PRODUCERS based on vec TRANSFORMED. */ ++ ++void ++loop_distribution::build_producers (loop_p loop, bitmap producers, ++ vec<gimple *> &transformed) ++{ ++ auto_vec<gimple *, 10> stmts; ++ stmts_from_loop (loop, &stmts); ++ int i = 0; ++ gimple *stmt = NULL; ++ ++ FOR_EACH_VEC_ELT (stmts, i, stmt) ++ gimple_set_uid (stmt, i); ++ i = 0; ++ FOR_EACH_VEC_ELT (transformed, i, stmt) ++ bitmap_set_bit (producers, stmt->uid); ++} ++ ++/* Transform stmt ++ ++ A = FOO (ARG_1); ++ ++ to ++ ++ STMT_1: A1 = FOO (ARG_1); ++ STMT_2: X[I] = A1; ++ STMT_3: A = X[I]; ++ ++ Producer is STMT_2 who defines the temp array and consumer is ++ STMT_3 who uses the temp array. */ ++ ++void ++loop_distribution::do_insertion (loop_p loop, struct graph *flow_only_rdg, ++ tree iv, bitmap cut_points, ++ hash_set<tree> *tmp_array_vars, ++ bitmap producers) ++{ ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "=== do insertion ===\n"); ++ ++ auto_vec<gimple *> transformed; ++ ++ /* Execution times of loop. */ ++ poly_uint64 array_extent ++ = tree_to_poly_uint64 (number_of_latch_executions (loop)) + 1; ++ ++ basic_block *bbs = get_loop_body_in_custom_order (loop, this, ++ bb_top_order_cmp_r); ++ ++ for (int i = 0; i < int (loop->num_nodes); i++) ++ { ++ basic_block bb = bbs[i]; ++ ++ /* Find all cut points in bb and transform them. */ ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ unsigned j = gimple_uid (gsi_stmt (gsi)); ++ if (bitmap_bit_p (cut_points, j)) ++ { ++ struct vertex *v = &(flow_only_rdg->vertices[j]); ++ build_temp_array (v, gsi, array_extent, iv, tmp_array_vars, ++ &transformed); ++ } ++ } ++ } ++ build_producers (loop, producers, transformed); ++ update_ssa (TODO_update_ssa); ++ free (bbs); ++} ++ ++/* After temp array insertion, given stmts ++ STMT_1: M = FOO (ARG_1); ++ STMT_2: X[I] = M; ++ STMT_3: A = X[I]; ++ STMT_2 is the producer, STMT_1 is its prev and STMT_3 is its next. ++ Replace M with A, and remove STMT_2 and STMT_3. */ ++ ++static void ++reset_gimple_assign (struct graph *flow_only_rdg, struct partition *partition, ++ gimple_stmt_iterator &gsi, int j) ++{ ++ struct vertex *v = &(flow_only_rdg->vertices[j]); ++ gimple *stmt = RDGV_STMT (v); ++ gimple *prev = stmt->prev; ++ gimple *next = stmt->next; ++ tree n_lhs = gimple_assign_lhs (next); ++ gimple_assign_set_lhs (prev, n_lhs); ++ unlink_stmt_vdef (stmt); ++ if (partition) ++ bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi))); ++ gsi_remove (&gsi, true); ++ release_defs (stmt); ++ if (partition) ++ bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi))); ++ gsi_remove (&gsi, true); ++} ++ ++void ++loop_distribution::remove_insertion (loop_p loop, struct graph *flow_only_rdg, ++ bitmap producers, struct partition *partition) ++{ ++ basic_block *bbs = get_loop_body_in_custom_order (loop, this, ++ bb_top_order_cmp_r); ++ for (int i = 0; i < int (loop->num_nodes); i++) ++ { ++ basic_block bb = bbs[i]; ++ for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); ++ gsi_next (&gsi)) ++ { ++ unsigned j = gimple_uid (gsi_stmt (gsi)); ++ if (bitmap_bit_p (producers, j)) ++ reset_gimple_assign (flow_only_rdg, partition, gsi, j); ++ } ++ } ++ update_ssa (TODO_update_ssa); ++ free (bbs); ++} ++ ++/* Insert temp arrays if isomorphic computation exists. Temp arrays will be ++ regarded as SEED_STMTS for building partitions in succeeding processes. */ ++ ++bool ++loop_distribution::insert_temp_arrays (loop_p loop, vec<gimple *> seed_stmts, ++ hash_set<tree> *tmp_array_vars, bitmap producers) ++{ ++ struct graph *flow_only_rdg = build_rdg (loop, NULL); ++ gcc_checking_assert (flow_only_rdg != NULL); ++ tree iv = get_iv_from_seed (flow_only_rdg, seed_stmts); ++ if (iv == NULL) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Loop %d no temp array insertion: failed to get" ++ " iteration variable.\n", loop->num); ++ free_rdg (flow_only_rdg); ++ return false; ++ } ++ auto_bitmap cut_points; ++ loop_vec_info vinfo = loop_vec_info_for_loop (loop); ++ unsigned n_cut_points = get_cut_points (flow_only_rdg, cut_points, vinfo); ++ delete vinfo; ++ loop->aux = NULL; ++ if (n_cut_points == 0) ++ { ++ if (dump_file && (dump_flags & TDF_DETAILS)) ++ fprintf (dump_file, "Loop %d no temp array insertion: no cut points" ++ " found.\n", loop->num); ++ free_rdg (flow_only_rdg); ++ return false; ++ } ++ do_insertion (loop, flow_only_rdg, iv, cut_points, tmp_array_vars, producers); ++ if (dump_enabled_p ()) ++ { ++ dump_user_location_t loc = find_loop_location (loop); ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion done:" ++ " %d temp arrays inserted in Loop %d.\n", ++ n_cut_points, loop->num); ++ } ++ free_rdg (flow_only_rdg); ++ return true; ++} ++ ++static bool find_seed_stmts_for_distribution (class loop *, vec<gimple *> *); ++ + /* Distributes the code from LOOP in such a way that producer statements + are placed before consumer statements. Tries to separate only the + statements from STMTS into separate loops. Returns the number of +@@ -3814,6 +4311,34 @@ loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts, + return 0; + } + ++ /* Try to distribute LOOP if LOOP is simple enough and unable to vectorize. ++ If LOOP has grouped loads, recursively find isomorphic stmts and insert ++ temp arrays, rebuild RDG and call find_seed_stmts_for_distribution ++ to replace STMTS. */ ++ ++ hash_set<tree> tmp_array_vars; ++ ++ /* STMTs that define those inserted TMP_ARRAYs. */ ++ auto_bitmap producers; ++ ++ /* New SEED_STMTS after insertion. */ ++ auto_vec<gimple *> work_list; ++ bool insert_success = false; ++ if (may_insert_temp_arrays (loop, rdg, cd)) ++ { ++ if (insert_temp_arrays (loop, stmts, &tmp_array_vars, producers)) ++ { ++ if (find_seed_stmts_for_distribution (loop, &work_list)) ++ { ++ insert_success = true; ++ stmts = work_list; ++ } ++ else ++ remove_insertion (loop, rdg, producers, NULL); ++ rebuild_rdg (loop, rdg, cd); ++ } ++ } ++ + data_reference_p dref; + for (i = 0; datarefs_vec.iterate (i, &dref); ++i) + dref->aux = (void *) (uintptr_t) i; +@@ -3894,7 +4419,7 @@ loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts, + for (int j = i + 1; + partitions.iterate (j, &partition); ++j) + { +- if (share_memory_accesses (rdg, into, partition)) ++ if (share_memory_accesses (rdg, into, partition, &tmp_array_vars)) + { + partition_merge_into (rdg, into, partition, FUSE_SHARE_REF); + partitions.unordered_remove (j); +@@ -3944,7 +4469,7 @@ loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts, + } + } + +- finalize_partitions (loop, &partitions, &alias_ddrs); ++ finalize_partitions (loop, &partitions, &alias_ddrs, producers); + + /* If there is a reduction in all partitions make sure the last one + is not classified for builtin code generation. */ +@@ -3962,6 +4487,24 @@ loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts, + } + + nbp = partitions.length (); ++ ++ /* If we have inserted TMP_ARRAYs but there is only one partition left in ++ the succeeding processes, remove those inserted TMP_ARRAYs back to the ++ original version. */ ++ ++ if (nbp == 1 && insert_success) ++ { ++ struct partition *partition = NULL; ++ partitions.iterate (0, &partition); ++ remove_insertion (loop, rdg, producers, partition); ++ if (dump_enabled_p ()) ++ { ++ dump_user_location_t loc = find_loop_location (loop); ++ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion removed:" ++ " unable to distribute loop %d.\n", loop->num); ++ } ++ } ++ + if (nbp == 0 + || (nbp == 1 && !partition_builtin_p (partitions[0])) + || (nbp > 1 && partition_contains_all_rw (rdg, partitions))) +-- +2.27.0.windows.1 + |