From ca2a541ed3425bec64f97fe277c6c02bf4f20049 Mon Sep 17 00:00:00 2001 From: benniaobufeijiushiji Date: Thu, 27 Oct 2022 10:26:34 +0800 Subject: [PATCH 33/35] [Loop-distribution] Insert temp arrays built from isomorphic stmts Use option -ftree-slp-transpose-vectorize Build temp arrays for isomorphic stmt and regard them as new seed_stmts for loop distribution. --- gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c | 67 +++ gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c | 17 + gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c | 19 + gcc/tree-loop-distribution.c | 577 +++++++++++++++++++- 4 files changed, 663 insertions(+), 17 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c new file mode 100644 index 000000000..649463647 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-1.c @@ -0,0 +1,67 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-do run { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details -save-temps" } */ + +#include +#include + +static unsigned inline abs2 (unsigned a) +{ + unsigned s = ((a>>15)&0x10001)*0xffff; + return (a+s)^s; +} + +int foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib) +{ + unsigned tmp[4][4]; + unsigned a0, a1, a2, a3; + int sum = 0; + for (int i = 0; i < 4; i++, oxa += ia, oxb += ib) + { + a0 = (oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16); + a1 = (oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16); + a2 = (oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16); + a3 = (oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16); + int t0 = a0 + a1; + int t1 = a0 - a1; + int t2 = a2 + a3; + int t3 = a2 - a3; + tmp[i][0] = t0 + t2; + tmp[i][2] = t0 - t2; + tmp[i][1] = t1 + t3; + tmp[i][3] = t1 - t3; + } + for (int i = 0; i < 
4; i++) + { + int t0 = tmp[0][i] + tmp[1][i]; + int t1 = tmp[0][i] - tmp[1][i]; + int t2 = tmp[2][i] + tmp[3][i]; + int t3 = tmp[2][i] - tmp[3][i]; + a0 = t0 + t2; + a2 = t0 - t2; + a1 = t1 + t3; + a3 = t1 - t3; + sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3); + } + return (((unsigned short) sum) + ((unsigned) sum >>16)) >> 1; +} + +int main () +{ + unsigned char oxa[128] = {0}; + unsigned char oxb[128] = {0}; + for (int i = 0; i < 128; i++) + { + oxa[i] += i * 3; + oxb[i] = i * 2; + } + int sum = foo (oxa, 16, oxb, 32); + if (sum != 736) + { + abort (); + } + return 0; +} + +/* { dg-final { scan-tree-dump-times "Insertion done: 4 temp arrays inserted" 1 "ldist" } } */ +/* { dg-final { scan-tree-dump-times "distributed: split to 2 loops" 1 "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c new file mode 100644 index 000000000..1b50fd27d --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-2.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details" } */ + +unsigned a0[4], a1[4], a2[4], a3[4]; + +void foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib) +{ + for (int i = 0; i < 4; i++, oxa += ia, oxb += ib) + { + a0[i] = (oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16); + a1[i] = (oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16); + a2[i] = (oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16); + a3[i] = (oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16); + } +} + +/* { dg-final { scan-tree-dump-times "Loop 1 not distributed." 
1 "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c new file mode 100644 index 000000000..94b992b05 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ins-ldist-3.c @@ -0,0 +1,19 @@ +/* { dg-do compile { target { aarch64*-*-linux* } } } */ +/* { dg-options "-O3 -ftree-slp-transpose-vectorize -fdump-tree-ldist-all-details" } */ + +unsigned a0[4], a1[4], a2[4], a3[4]; + +void foo (unsigned char *oxa, int ia, unsigned char *oxb, int ib) +{ + for (int i = 0; i < 4; i++, oxa += ia, oxb += ib) + { + a0[i] = ((oxa[0] - oxb[0]) + ((oxa[4] - oxb[4]) << 16)) + 1; + a1[i] = ((oxa[1] - oxb[1]) + ((oxa[5] - oxb[5]) << 16)) - 2; + a2[i] = ((oxa[2] - oxb[2]) + ((oxa[6] - oxb[6]) << 16)) * 3; + a3[i] = ((oxa[3] - oxb[3]) + ((oxa[7] - oxb[7]) << 16)) / 4; + } +} + +/* { dg-final { scan-tree-dump-times "Insertion done: 4 temp arrays inserted" 1 "ldist" } } */ +/* { dg-final { scan-tree-dump-times "Insertion removed" 1 "ldist" } } */ +/* { dg-final { scan-tree-dump-times "Loop 1 not distributed." 1 "ldist" } } */ \ No newline at end of file diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c index c08af6562..88b56379c 100644 --- a/gcc/tree-loop-distribution.c +++ b/gcc/tree-loop-distribution.c @@ -36,6 +36,47 @@ along with GCC; see the file COPYING3. If not see | D(I) = A(I-1)*E |ENDDO + If an unvectorizable loop has grouped loads, and calculations from grouped + loads are isomorphic, build temp arrays using stmts where isomorphic + calculations end. After distribution, the partition built from temp + arrays can be vectorized in pass SLP after loop unrolling. 
For example, + + |DO I = 1, N + | A = FOO (ARG_1); + | B = FOO (ARG_2); + | C = BAR_0 (A); + | D = BAR_1 (B); + |ENDDO + + is transformed to + + |DO I = 1, N + | J = FOO (ARG_1); + | K = FOO (ARG_2); + | X[I] = J; + | Y[I] = K; + | A = X[I]; + | B = Y[I]; + | C = BAR_0 (A); + | D = BAR_1 (B); + |ENDDO + + and is then distributed to + + |DO I = 1, N + | J = FOO (ARG_1); + | K = FOO (ARG_2); + | X[I] = J; + | Y[I] = K; + |ENDDO + + |DO I = 1, N + | A = X[I]; + | B = Y[I]; + | C = BAR_0 (A); + | D = BAR_1 (B); + |ENDDO + Loop distribution is the dual of loop fusion. It separates statements of a loop (or loop nest) into multiple loops (or loop nests) with the same loop header. The major goal is to separate statements which may @@ -44,7 +85,9 @@ along with GCC; see the file COPYING3. If not see 1) Seed partitions with specific type statements. For now we support two types seed statements: statement defining variable used outside - of loop; statement storing to memory. + of loop; statement storing to memory. Moreover, for unvectorizable + loops, we try to find isomorphic stmts from grouped load and build + temp arrays as new seed statements. 2) Build reduced dependence graph (RDG) for loop to be distributed. The vertices (RDG:V) model all statements in the loop and the edges (RDG:E) model flow and control dependencies between statements. @@ -643,7 +686,8 @@ class loop_distribution /* Returns true when PARTITION1 and PARTITION2 access the same memory object in RDG. */ bool share_memory_accesses (struct graph *rdg, - partition *partition1, partition *partition2); + partition *partition1, partition *partition2, + hash_set *excluded_arrays); /* For each seed statement in STARTING_STMTS, this function builds partition for it by adding depended statements according to RDG. @@ -686,8 +730,9 @@ class loop_distribution /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution. ALIAS_DDRS contains ddrs which need runtime alias check. 
*/ - void finalize_partitions (class loop *loop, vec - *partitions, vec *alias_ddrs); + void finalize_partitions (class loop *loop, + vec *partitions, + vec *alias_ddrs, bitmap producers); /* Analyze loop form and if it's vectorizable to decide if we need to insert temp arrays to distribute it. */ @@ -701,6 +746,28 @@ class loop_distribution inline void rebuild_rdg (loop_p loop, struct graph *&rdg, control_dependences *cd); + + /* If loop is not distributed, remove inserted temp arrays. */ + void remove_insertion (loop_p loop, struct graph *flow_only_rdg, + bitmap producers, struct partition *partition); + + /* Insert temp arrays if isomorphic computation exists. Temp arrays will be + regarded as SEED_STMTS for building partitions in succeeding processes. */ + bool insert_temp_arrays (loop_p loop, vec seed_stmts, + hash_set *tmp_array_vars, bitmap producers); + + void build_producers (loop_p loop, bitmap producers, + vec &transformed); + + void do_insertion (loop_p loop, struct graph *flow_only_rdg, tree iv, + bitmap cut_points, hash_set *tmp_array_vars, + bitmap producers); + + /* Fuse PARTITIONS built from inserted temp arrays into one partition, + fuse the rest into another. */ + void merge_remaining_partitions (vec *partitions, + bitmap producers); + /* Distributes the code from LOOP in such a way that producer statements are placed before consumer statements. Tries to separate only the statements from STMTS into separate loops. 
Returns the number of @@ -1913,7 +1980,8 @@ loop_distribution::classify_partition (loop_p loop, bool loop_distribution::share_memory_accesses (struct graph *rdg, - partition *partition1, partition *partition2) + partition *partition1, partition *partition2, + hash_set *excluded_arrays) { unsigned i, j; bitmap_iterator bi, bj; @@ -1947,7 +2015,10 @@ loop_distribution::share_memory_accesses (struct graph *rdg, if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0) && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0) && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0) - && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0)) + && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0) + /* An exception, if PARTITION1 and PARTITION2 contain the + temp array we inserted, do not merge them. */ + && !excluded_arrays->contains (DR_REF (dr1))) return true; } } @@ -2909,13 +2980,47 @@ fuse_memset_builtins (vec *partitions) } } +void +loop_distribution::merge_remaining_partitions + (vec *partitions, + bitmap producers) +{ + struct partition *partition = NULL; + struct partition *p1 = NULL, *p2 = NULL; + for (unsigned i = 0; partitions->iterate (i, &partition); i++) + { + if (bitmap_intersect_p (producers, partition->stmts)) + { + if (p1 == NULL) + { + p1 = partition; + continue; + } + partition_merge_into (NULL, p1, partition, FUSE_FINALIZE); + } + else + { + if (p2 == NULL) + { + p2 = partition; + continue; + } + partition_merge_into (NULL, p2, partition, FUSE_FINALIZE); + } + partitions->unordered_remove (i); + partition_free (partition); + i--; + } +} + void loop_distribution::finalize_partitions (class loop *loop, vec *partitions, - vec *alias_ddrs) + vec *alias_ddrs, + bitmap producers) { unsigned i; - struct partition *partition, *a; + struct partition *partition; if (partitions->length () == 1 || alias_ddrs->length () > 0) @@ -2947,13 +3052,7 @@ loop_distribution::finalize_partitions (class loop *loop, || (loop->inner == NULL && i >= NUM_PARTITION_THRESHOLD 
&& num_normal > num_builtin)) { - a = (*partitions)[0]; - for (i = 1; partitions->iterate (i, &partition); ++i) - { - partition_merge_into (NULL, a, partition, FUSE_FINALIZE); - partition_free (partition); - } - partitions->truncate (1); + merge_remaining_partitions (partitions, producers); } /* Fuse memset builtins if possible. */ @@ -3758,6 +3857,404 @@ find_isomorphic_stmts (loop_vec_info vinfo, vec &stmts) return decide_stmts_by_profit (candi_stmts, stmts); } +/* Get iv from SEED_STMTS and make sure each seed_stmt has only one iv as index + and all indices are the same. */ + +static tree +find_index (vec seed_stmts) +{ + if (seed_stmts.length () == 0) + return NULL; + bool found_index = false; + tree index = NULL; + unsigned ui = 0; + for (ui = 0; ui < seed_stmts.length (); ui++) + { + if (!gimple_vdef (seed_stmts[ui])) + return NULL; + tree lhs = gimple_assign_lhs (seed_stmts[ui]); + unsigned num_index = 0; + while (TREE_CODE (lhs) == ARRAY_REF) + { + if (TREE_CODE (TREE_OPERAND (lhs, 1)) == SSA_NAME) + { + num_index++; + if (num_index > 1) + return NULL; + if (index == NULL) + { + index = TREE_OPERAND (lhs, 1); + found_index = true; + } + else if (index != TREE_OPERAND (lhs, 1)) + return NULL; + } + lhs = TREE_OPERAND (lhs, 0); + } + if (!found_index) + return NULL; + } + return index; +} + +/* Check if expression of phi is an increment of a const. 
*/ + +static void +check_phi_inc (struct vertex *v_phi, struct graph *rdg, bool &found_inc) +{ + struct graph_edge *e_phi; + for (e_phi = v_phi->succ; e_phi; e_phi = e_phi->succ_next) + { + struct vertex *v_inc = &(rdg->vertices[e_phi->dest]); + if (!is_gimple_assign (RDGV_STMT (v_inc)) + || gimple_expr_code (RDGV_STMT (v_inc)) != PLUS_EXPR) + continue; + tree rhs1 = gimple_assign_rhs1 (RDGV_STMT (v_inc)); + tree rhs2 = gimple_assign_rhs2 (RDGV_STMT (v_inc)); + if (!(integer_onep (rhs1) || integer_onep (rhs2))) + continue; + struct graph_edge *e_inc; + /* find cycle with only two vertices inc and phi: inc <--> phi. */ + bool found_cycle = false; + for (e_inc = v_inc->succ; e_inc; e_inc = e_inc->succ_next) + { + if (e_inc->dest == e_phi->src) + { + found_cycle = true; + break; + } + } + if (!found_cycle) + continue; + found_inc = true; + } +} + +/* Check if phi satisfies form like PHI <0, i>. */ + +static inline bool +iv_check_phi_stmt (gimple *phi_stmt) +{ + return gimple_phi_num_args (phi_stmt) == 2 + && (integer_zerop (gimple_phi_arg_def (phi_stmt, 0)) + || integer_zerop (gimple_phi_arg_def (phi_stmt, 1))); +} + +/* Make sure the iteration variable is a phi. */ + +static tree +get_iv_from_seed (struct graph *flow_only_rdg, vec seed_stmts) +{ + tree index = find_index (seed_stmts); + if (index == NULL) + return NULL; + for (int i = 0; i < flow_only_rdg->n_vertices; i++) + { + struct vertex *v = &(flow_only_rdg->vertices[i]); + if (RDGV_STMT (v) != seed_stmts[0]) + continue; + struct graph_edge *e; + bool found_phi = false; + for (e = v->pred; e; e = e->pred_next) + { + struct vertex *v_phi = &(flow_only_rdg->vertices[e->src]); + gimple *phi_stmt = RDGV_STMT (v_phi); + if (gimple_code (phi_stmt) != GIMPLE_PHI + || gimple_phi_result (phi_stmt) != index) + continue; + if (!iv_check_phi_stmt (phi_stmt)) + return NULL; + /* find inc expr in succ of phi. 
*/ + bool found_inc = false; + check_phi_inc (v_phi, flow_only_rdg, found_inc); + if (!found_inc) + return NULL; + found_phi = true; + break; + } + if (!found_phi) + return NULL; + break; + } + return index; +} + +/* Do not distribute loop if vertexes in ROOT_MAP have antidependence within + FLOW_ONLY_RDG. */ + +static bool +check_no_dependency (struct graph *flow_only_rdg, bitmap root_map) +{ + bitmap_iterator bi; + unsigned ui; + auto_vec visited_nodes; + auto_bitmap visited_map; + EXECUTE_IF_SET_IN_BITMAP (root_map, 0, ui, bi) + visited_nodes.safe_push (ui); + for (ui = 0; ui < visited_nodes.length (); ui++) + { + struct vertex *v = &(flow_only_rdg->vertices[visited_nodes[ui]]); + struct graph_edge *e; + for (e = v->succ; e; e = e->succ_next) + { + if (bitmap_bit_p (root_map, e->dest)) + return false; + if (bitmap_bit_p (visited_map, e->dest)) + continue; + visited_nodes.safe_push (e->dest); + bitmap_set_bit (visited_map, e->dest); + } + } + return true; +} + +/* Find isomorphic stmts from GROUPED_LOADS in VINFO and make sure + there is no dependency among those STMT we found. */ + +static unsigned +get_cut_points (struct graph *flow_only_rdg, bitmap cut_points, + loop_vec_info vinfo) +{ + unsigned n_stmts = 0; + + /* STMTS that may be CUT_POINTS. 
*/ + auto_vec stmts; + if (!find_isomorphic_stmts (vinfo, stmts)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "No temp array insertion: no isomorphic stmts" + " were found.\n"); + return 0; + } + + for (int i = 0; i < flow_only_rdg->n_vertices; i++) + { + if (stmts.contains (RDG_STMT (flow_only_rdg, i))) + bitmap_set_bit (cut_points, i); + } + n_stmts = bitmap_count_bits (cut_points); + + bool succ = check_no_dependency (flow_only_rdg, cut_points); + if (!succ) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "No temp array inserted: data dependency" + " among isomorphic stmts.\n"); + return 0; + } + return n_stmts; +} + +static void +build_temp_array (struct vertex *v, gimple_stmt_iterator &gsi, + poly_uint64 array_extent, tree iv, + hash_set *tmp_array_vars, vec *transformed) +{ + gimple *stmt = RDGV_STMT (v); + tree lhs = gimple_assign_lhs (stmt); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "original stmt:\t"); + print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS|TDF_MEMSYMS); + } + tree var_ssa = duplicate_ssa_name (lhs, stmt); + gimple_assign_set_lhs (stmt, var_ssa); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "changed to:\t"); + print_gimple_stmt (dump_file, stmt, 0, TDF_VOPS | TDF_MEMSYMS); + } + gimple_set_uid (gsi_stmt (gsi), -1); + tree vect_elt_type = TREE_TYPE (lhs); + tree array_type = build_array_type_nelts (vect_elt_type, array_extent); + tree array = create_tmp_var (array_type); + tree array_ssa = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL); + tmp_array_vars->add (array_ssa); + gimple *store = gimple_build_assign (array_ssa, var_ssa); + tree new_vdef = make_ssa_name (gimple_vop (cfun), store); + gsi_insert_after (&gsi, store, GSI_NEW_STMT); + gimple_set_vdef (store, new_vdef); + transformed->safe_push (store); + gimple_set_uid (gsi_stmt (gsi), -1); + tree array_ssa2 = build4 (ARRAY_REF, vect_elt_type, array, iv, NULL, NULL); + 
tmp_array_vars->add (array_ssa2); + gimple *load = gimple_build_assign (lhs, array_ssa2); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "insert stmt:\t"); + print_gimple_stmt (dump_file, store, 0, TDF_VOPS|TDF_MEMSYMS); + fprintf (dump_file, " and stmt:\t"); + print_gimple_stmt (dump_file, load, 0, TDF_VOPS|TDF_MEMSYMS); + } + gimple_set_vuse (load, new_vdef); + gsi_insert_after (&gsi, load, GSI_NEW_STMT); + gimple_set_uid (gsi_stmt (gsi), -1); +} + +/* Set bitmap PRODUCERS based on vec TRANSFORMED. */ + +void +loop_distribution::build_producers (loop_p loop, bitmap producers, + vec &transformed) +{ + auto_vec stmts; + stmts_from_loop (loop, &stmts); + int i = 0; + gimple *stmt = NULL; + + FOR_EACH_VEC_ELT (stmts, i, stmt) + gimple_set_uid (stmt, i); + i = 0; + FOR_EACH_VEC_ELT (transformed, i, stmt) + bitmap_set_bit (producers, stmt->uid); +} + +/* Transform stmt + + A = FOO (ARG_1); + + to + + STMT_1: A1 = FOO (ARG_1); + STMT_2: X[I] = A1; + STMT_3: A = X[I]; + + Producer is STMT_2 who defines the temp array and consumer is + STMT_3 who uses the temp array. */ + +void +loop_distribution::do_insertion (loop_p loop, struct graph *flow_only_rdg, + tree iv, bitmap cut_points, + hash_set *tmp_array_vars, + bitmap producers) +{ + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "=== do insertion ===\n"); + + auto_vec transformed; + + /* Execution times of loop. */ + poly_uint64 array_extent + = tree_to_poly_uint64 (number_of_latch_executions (loop)) + 1; + + basic_block *bbs = get_loop_body_in_custom_order (loop, this, + bb_top_order_cmp_r); + + for (int i = 0; i < int (loop->num_nodes); i++) + { + basic_block bb = bbs[i]; + + /* Find all cut points in bb and transform them. 
*/ + for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + unsigned j = gimple_uid (gsi_stmt (gsi)); + if (bitmap_bit_p (cut_points, j)) + { + struct vertex *v = &(flow_only_rdg->vertices[j]); + build_temp_array (v, gsi, array_extent, iv, tmp_array_vars, + &transformed); + } + } + } + build_producers (loop, producers, transformed); + update_ssa (TODO_update_ssa); + free (bbs); +} + +/* After temp array insertion, given stmts + STMT_1: M = FOO (ARG_1); + STMT_2: X[I] = M; + STMT_3: A = X[I]; + STMT_2 is the producer, STMT_1 is its prev and STMT_3 is its next. + Replace M with A, and remove STMT_2 and STMT_3. */ + +static void +reset_gimple_assign (struct graph *flow_only_rdg, struct partition *partition, + gimple_stmt_iterator &gsi, int j) +{ + struct vertex *v = &(flow_only_rdg->vertices[j]); + gimple *stmt = RDGV_STMT (v); + gimple *prev = stmt->prev; + gimple *next = stmt->next; + tree n_lhs = gimple_assign_lhs (next); + gimple_assign_set_lhs (prev, n_lhs); + unlink_stmt_vdef (stmt); + if (partition) + bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi))); + gsi_remove (&gsi, true); + release_defs (stmt); + if (partition) + bitmap_clear_bit (partition->stmts, gimple_uid (gsi_stmt (gsi))); + gsi_remove (&gsi, true); +} + +void +loop_distribution::remove_insertion (loop_p loop, struct graph *flow_only_rdg, + bitmap producers, struct partition *partition) +{ + basic_block *bbs = get_loop_body_in_custom_order (loop, this, + bb_top_order_cmp_r); + for (int i = 0; i < int (loop->num_nodes); i++) + { + basic_block bb = bbs[i]; + for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + unsigned j = gimple_uid (gsi_stmt (gsi)); + if (bitmap_bit_p (producers, j)) + reset_gimple_assign (flow_only_rdg, partition, gsi, j); + } + } + update_ssa (TODO_update_ssa); + free (bbs); +} + +/* Insert temp arrays if isomorphic computation exists. 
Temp arrays will be + regarded as SEED_STMTS for building partitions in succeeding processes. */ + +bool +loop_distribution::insert_temp_arrays (loop_p loop, vec seed_stmts, + hash_set *tmp_array_vars, bitmap producers) +{ + struct graph *flow_only_rdg = build_rdg (loop, NULL); + gcc_checking_assert (flow_only_rdg != NULL); + tree iv = get_iv_from_seed (flow_only_rdg, seed_stmts); + if (iv == NULL) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Loop %d no temp array insertion: failed to get" + " iteration variable.\n", loop->num); + free_rdg (flow_only_rdg); + return false; + } + auto_bitmap cut_points; + loop_vec_info vinfo = loop_vec_info_for_loop (loop); + unsigned n_cut_points = get_cut_points (flow_only_rdg, cut_points, vinfo); + delete vinfo; + loop->aux = NULL; + if (n_cut_points == 0) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Loop %d no temp array insertion: no cut points" + " found.\n", loop->num); + free_rdg (flow_only_rdg); + return false; + } + do_insertion (loop, flow_only_rdg, iv, cut_points, tmp_array_vars, producers); + if (dump_enabled_p ()) + { + dump_user_location_t loc = find_loop_location (loop); + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion done:" + " %d temp arrays inserted in Loop %d.\n", + n_cut_points, loop->num); + } + free_rdg (flow_only_rdg); + return true; +} + +static bool find_seed_stmts_for_distribution (class loop *, vec *); + /* Distributes the code from LOOP in such a way that producer statements are placed before consumer statements. Tries to separate only the statements from STMTS into separate loops. Returns the number of @@ -3814,6 +4311,34 @@ loop_distribution::distribute_loop (class loop *loop, vec stmts, return 0; } + /* Try to distribute LOOP if LOOP is simple enough and unable to vectorize. 
+ If LOOP has grouped loads, recursively find isomorphic stmts and insert + temp arrays, rebuild RDG and call find_seed_stmts_for_distribution + to replace STMTS. */ + + hash_set tmp_array_vars; + + /* STMTs that define those inserted TMP_ARRAYs. */ + auto_bitmap producers; + + /* New SEED_STMTS after insertion. */ + auto_vec work_list; + bool insert_success = false; + if (may_insert_temp_arrays (loop, rdg, cd)) + { + if (insert_temp_arrays (loop, stmts, &tmp_array_vars, producers)) + { + if (find_seed_stmts_for_distribution (loop, &work_list)) + { + insert_success = true; + stmts = work_list; + } + else + remove_insertion (loop, rdg, producers, NULL); + rebuild_rdg (loop, rdg, cd); + } + } + data_reference_p dref; for (i = 0; datarefs_vec.iterate (i, &dref); ++i) dref->aux = (void *) (uintptr_t) i; @@ -3894,7 +4419,7 @@ loop_distribution::distribute_loop (class loop *loop, vec stmts, for (int j = i + 1; partitions.iterate (j, &partition); ++j) { - if (share_memory_accesses (rdg, into, partition)) + if (share_memory_accesses (rdg, into, partition, &tmp_array_vars)) { partition_merge_into (rdg, into, partition, FUSE_SHARE_REF); partitions.unordered_remove (j); @@ -3944,7 +4469,7 @@ loop_distribution::distribute_loop (class loop *loop, vec stmts, } } - finalize_partitions (loop, &partitions, &alias_ddrs); + finalize_partitions (loop, &partitions, &alias_ddrs, producers); /* If there is a reduction in all partitions make sure the last one is not classified for builtin code generation. */ @@ -3962,6 +4487,24 @@ loop_distribution::distribute_loop (class loop *loop, vec stmts, } nbp = partitions.length (); + + /* If we have inserted TMP_ARRAYs but there is only one partition left in + the succeeding processes, remove those inserted TMP_ARRAYs back to the + original version. 
*/ + + if (nbp == 1 && insert_success) + { + struct partition *partition = NULL; + partitions.iterate (0, &partition); + remove_insertion (loop, rdg, producers, partition); + if (dump_enabled_p ()) + { + dump_user_location_t loc = find_loop_location (loop); + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, "Insertion removed:" + " unable to distribute loop %d.\n", loop->num); + } + } + if (nbp == 0 || (nbp == 1 && !partition_builtin_p (partitions[0])) || (nbp > 1 && partition_contains_all_rw (rdg, partitions))) -- 2.27.0.windows.1