Diffstat (limited to '0027-Autoprefetch-Support-auto-feedback-prefetch.patch')
-rw-r--r--  0027-Autoprefetch-Support-auto-feedback-prefetch.patch | 1000
1 file changed, 1000 insertions(+), 0 deletions(-)
diff --git a/0027-Autoprefetch-Support-auto-feedback-prefetch.patch b/0027-Autoprefetch-Support-auto-feedback-prefetch.patch
new file mode 100644
index 0000000..c3dcf50
--- /dev/null
+++ b/0027-Autoprefetch-Support-auto-feedback-prefetch.patch
@@ -0,0 +1,1000 @@
+From 6b944bed1158d3454b1db27aeab4ec1f2b8e5866 Mon Sep 17 00:00:00 2001
+From: huangxiaoquan <huangxiaoquan1@huawei.com>
+Date: Thu, 27 Jan 2022 18:24:53 +0800
+Subject: [PATCH 27/28] [Autoprefetch] Support auto feedback prefetch
+
+1. Add the option -fprefetch-loop-arrays=[value].
+
+2. Propose a prefetch distance analysis algorithm based on branch
+   weight to improve the accuracy of the prefetch distance.
+
+3. Propose automatic feedback prefetching: use the cache-miss profile
+   information to guide the insertion of prefetch instructions.
+---
+ gcc/auto-profile.c           |   5 +-
+ gcc/common.opt               |   5 +
+ gcc/opts.c                   |   7 +
+ gcc/params.opt               |  16 +
+ gcc/tree-ssa-loop-prefetch.c | 735 ++++++++++++++++++++++++++++++++++-
+ 5 files changed, 748 insertions(+), 20 deletions(-)
+
+diff --git a/gcc/auto-profile.c b/gcc/auto-profile.c
+index e6164b91b..f221978fc 100644
+--- a/gcc/auto-profile.c
++++ b/gcc/auto-profile.c
+@@ -21,6 +21,8 @@ along with GCC; see the file COPYING3.  If not see
+ #include "config.h"
+ #define INCLUDE_MAP
+ #define INCLUDE_SET
++#define INCLUDE_ALGORITHM
++#define INCLUDE_VECTOR
+ #include "system.h"
+ #include "coretypes.h"
+ #include "backend.h"
+@@ -49,9 +51,6 @@ along with GCC; see the file COPYING3.  If not see
+ #include "auto-profile.h"
+ #include "tree-pretty-print.h"
+ #include "gimple-pretty-print.h"
+-#include <map>
+-#include <vector>
+-#include <algorithm>
+ 
+ /* The following routines implements AutoFDO optimization.
+ 
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 37cbbd8c0..9488bd90f 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -2201,6 +2201,11 @@ fprefetch-loop-arrays
+ Common Report Var(flag_prefetch_loop_arrays) Init(-1) Optimization
+ Generate prefetch instructions, if available, for arrays in loops.
+ 
++fprefetch-loop-arrays=
++Common Joined RejectNegative UInteger Var(prefetch_level) Init(0) IntegerRange(0, 3)
++Generate prefetch instructions, if available, for arrays in loops.  The
++prefetch level controls the optimization level of array prefetching.
++
+ fprofile
+ Common Report Var(profile_flag)
+ Enable basic program profiling code.
+diff --git a/gcc/opts.c b/gcc/opts.c
+index 7a39f618b..f49f5ee58 100644
+--- a/gcc/opts.c
++++ b/gcc/opts.c
+@@ -1747,6 +1747,8 @@ set_cache_misses_profile_params (struct gcc_options *opts,
+ 				 struct gcc_options *opts_set)
+ {
+   SET_OPTION_IF_UNSET (opts, opts_set, flag_prefetch_loop_arrays, 1);
++  SET_OPTION_IF_UNSET (opts, opts_set, prefetch_level, 2);
++  SET_OPTION_IF_UNSET (opts, opts_set, param_simultaneous_prefetches, 100);
+ }
+ 
+ /* -f{,no-}sanitize{,-recover}= suboptions.  */
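Note: taken together, these option hunks mean that -fprefetch-loop-arrays=<level> both turns prefetching on and records the level (0-3) in prefetch_level, while a detected cache-miss profile auto-selects level 2 and 100 simultaneous prefetches unless the user already chose values. The sketch below is a standalone illustration, not part of the patch; it borrows the PREFETCH_MODE names the patch introduces further down to show how a level is expected to select the loop-size estimation strategy:

    /* Standalone sketch, not GCC code.  The mode names mirror the
       PREFETCH_MODE enum added in tree-ssa-loop-prefetch.c below.  */
    #include <cstdio>

    enum prefetch_mode { ORIGINAL_MODE = 0, REFINE_BB_AHEAD,
                         BRANCH_WEIGHTED_AHEAD, INDIRECT_MODE };

    static const char *describe (int level)
    {
      if (level >= BRANCH_WEIGHTED_AHEAD)
        return "branch-weighted estimate of loop insns";
      if (level >= REFINE_BB_AHEAD)
        return "estimate that skips bbs of inner loops";
      return "original tree_num_loop_insns estimate";
    }

    int main ()
    {
      for (int level = 0; level <= 3; level++)
        std::printf ("-fprefetch-loop-arrays=%d -> %s\n",
                     level, describe (level));
      return 0;
    }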
+@@ -2645,6 +2647,11 @@ common_handle_option (struct gcc_options *opts,
+       SET_OPTION_IF_UNSET (opts, opts_set, flag_ipa_bit_cp, value);
+       break;
+ 
++    case OPT_fprefetch_loop_arrays_:
++      opts->x_prefetch_level = value;
++      opts->x_flag_prefetch_loop_arrays = true;
++      break;
++
+     case OPT_fpatchable_function_entry_:
+       {
+ 	char *patch_area_arg = xstrdup (arg);
+diff --git a/gcc/params.opt b/gcc/params.opt
+index 2db69cc87..9d1faa7ab 100644
+--- a/gcc/params.opt
++++ b/gcc/params.opt
+@@ -968,4 +968,20 @@ Bound on number of runtime checks inserted by the vectorizer's loop versioning f
+ Common Joined UInteger Var(param_vect_max_version_for_alignment_checks) Init(6) Param Optimization
+ Bound on number of runtime checks inserted by the vectorizer's loop versioning for alignment check.
+ 
++-param=param-prefetch-func-topn=
++Common Joined UInteger Var(param_prefetch_func_topn) Init(3) Param Optimization
++TopN functions of cache-miss counts to be analyzed in prefetching.
++
++-param=param-prefetch-ref-topn=
++Common Joined UInteger Var(param_prefetch_ref_topn) Init(5) Param Optimization
++TopN references of cache-miss counts to be analyzed in prefetching.
++
++-param=param-high-loop-execution-rate=
++Common Joined UInteger Var(param_high_loop_execution_rate) Init(95) IntegerRange(0, 100) Param Optimization
++High execution rate loops to be analyzed in prefetching (in %).
++
++-param=param-prefetch-func-counts-threshold=
++Common Joined UInteger Var(param_prefetch_func_counts_threshold) Init(100) Param Optimization
++Threshold of function cache-miss counts to be analyzed in prefetching.
++
+ ; This comment is to ensure we retain the blank line above.
+diff --git a/gcc/tree-ssa-loop-prefetch.c b/gcc/tree-ssa-loop-prefetch.c
+index d19ece641..3a5aef0fc 100644
+--- a/gcc/tree-ssa-loop-prefetch.c
++++ b/gcc/tree-ssa-loop-prefetch.c
+@@ -18,6 +18,9 @@ along with GCC; see the file COPYING3.  If not see
+ <http://www.gnu.org/licenses/>.  */
+ 
+ #include "config.h"
++#define INCLUDE_ALGORITHM
++#define INCLUDE_MAP
++#define INCLUDE_VECTOR
+ #include "system.h"
+ #include "coretypes.h"
+ #include "backend.h"
+@@ -48,6 +51,11 @@ along with GCC; see the file COPYING3.  If not see
+ #include "tree-data-ref.h"
+ #include "diagnostic-core.h"
+ #include "dbgcnt.h"
++#include "gimple-pretty-print.h"
++#include "tree-cfg.h"
++#include "auto-profile.h"
++#include "cgraph.h"
++#include "print-tree.h"
+ 
+ /* This pass inserts prefetch instructions to optimize cache usage during
+    accesses to arrays in loops.  It processes loops sequentially and:
+@@ -253,6 +261,22 @@ struct mem_ref_group
+ #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
+ #endif
+ 
++#ifndef PREFETCH_FUNC_TOPN
++#define PREFETCH_FUNC_TOPN param_prefetch_func_topn
++#endif
++
++#ifndef PREFETCH_FUNC_COUNTS_THRESHOLD
++#define PREFETCH_FUNC_COUNTS_THRESHOLD param_prefetch_func_counts_threshold
++#endif
++
++#ifndef PREFETCH_REF_TOPN
++#define PREFETCH_REF_TOPN param_prefetch_ref_topn
++#endif
++
++#ifndef LOOP_EXECUTION_RATE
++#define LOOP_EXECUTION_RATE param_high_loop_execution_rate
++#endif
++
+ /* The memory reference.  */
+ 
+ struct mem_ref
+ {
+@@ -279,6 +303,131 @@ struct mem_ref
+      nontemporal one.  */
+ };
+ 
++/* Probability information of basic blocks and branches.  */
++struct bb_bp
++{
++  basic_block bb;
++  basic_block true_edge_bb;
++  basic_block false_edge_bb;
++  float true_edge_prob;
++  float false_edge_prob;
++  float bb_prob;
++};
++
++typedef struct bb_bp bb_bp;
++
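The four --param knobs added above are the tuning surface of the feature; each can be overridden on the command line, e.g. --param=param-prefetch-ref-topn=8. A hypothetical summary of their defaults (illustration only, not GCC code):

    /* Illustration only: the Init() defaults from params.opt above.  */
    struct autoprefetch_params
    {
      unsigned func_topn = 3;               /* param-prefetch-func-topn  */
      unsigned ref_topn = 5;                /* param-prefetch-ref-topn  */
      unsigned high_loop_exec_rate = 95;    /* param-high-loop-execution-rate, in %  */
      unsigned func_counts_threshold = 100; /* param-prefetch-func-counts-threshold  */
    };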
++enum PREFETCH_MODE
++{
++  ORIGINAL_MODE = 0,	  /* Original prefetch method.  */
++  REFINE_BB_AHEAD,	  /* Prefetch distance algorithm that removes
++			     irrelevant bbs.  */
++  BRANCH_WEIGHTED_AHEAD,  /* Branch weighted prefetch distance
++			     algorithm.  */
++  INDIRECT_MODE		  /* Indirect array prefetch mode.  */
++};
++
++typedef std::map <unsigned int, unsigned int> uid_rank_map;
++typedef std::map <location_t, unsigned int> loc_rank_map;
++typedef std::vector <std::pair<location_t, gcov_type> > loc_gcov_type_vec;
++typedef std::map <location_t, std::vector<gimple *> > loc_gimple_vec_map;
++
++static loc_rank_map ref_rank;
++
++/* Callback function for event_count comparison.  */
++
++static bool
++event_count_cmp (std::pair<unsigned int, gcov_type> &a,
++		 std::pair<unsigned int, gcov_type> &b)
++{
++  return a.second > b.second;
++}
++
++/* Prepare the mappings from location to count and from location
++   to stmt list.  */
++
++static void
++prepare_loc_count_info (function *fun, loc_gcov_type_vec &ref_sorted,
++			loc_gimple_vec_map &loc_stmt, event_type event)
++{
++  basic_block bb = NULL;
++  gimple_stmt_iterator bsi;
++  gimple *stmt;
++  tree lhs = NULL_TREE;
++  tree rhs = NULL_TREE;
++
++  FOR_EACH_BB_FN (bb, fun)
++    {
++      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
++	{
++	  stmt = gsi_stmt (bsi);
++	  if (gimple_code (stmt) != GIMPLE_ASSIGN)
++	    {
++	      continue;
++	    }
++	  if (!gimple_vuse (stmt))
++	    {
++	      continue;
++	    }
++	  lhs = gimple_assign_lhs (stmt);
++	  rhs = gimple_assign_rhs1 (stmt);
++	  if (REFERENCE_CLASS_P (rhs) || REFERENCE_CLASS_P (lhs))
++	    {
++	      gcov_type loc_count =
++		event_get_loc_count (gimple_location (stmt), event);
++	      if (loc_count > 0)
++		{
++		  /* Multiple gimple statements may correspond to the
++		     same location.  */
++		  if (loc_stmt.count (gimple_location (stmt)) == 0)
++		    {
++		      ref_sorted.push_back (std::make_pair (gimple_location (stmt),
++					    loc_count));
++		    }
++		  loc_stmt[gimple_location (stmt)].push_back (stmt);
++		}
++	    }
++	}
++    }
++}
++
++/* Sort references by event_count and dump loc count information after
++   sorting.  */
++
++static void
++sort_ref_by_event_count (function *fun, event_type event)
++{
++  loc_gcov_type_vec ref_sorted;
++  loc_gimple_vec_map loc_stmt;
++
++  prepare_loc_count_info (fun, ref_sorted, loc_stmt, event);
++  sort (ref_sorted.begin (), ref_sorted.end (), event_count_cmp);
++
++  for (unsigned i = 0; i < ref_sorted.size (); ++i)
++    {
++      ref_rank[ref_sorted[i].first] = i + 1;
++      /* Print the stmt and count of the topn refs.  */
++      if (i < PREFETCH_REF_TOPN && dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  fprintf (dump_file, "stmt: \n");
++	  for (unsigned j = 0; j < loc_stmt[ref_sorted[i].first].size ();
++	       ++j)
++	    {
++	      print_gimple_stmt (dump_file,
++				 loc_stmt[ref_sorted[i].first][j], 0);
++	    }
++	  gcov_type loc_count =
++	    event_get_loc_count (ref_sorted[i].first, event);
++	  fprintf (dump_file, "stmt loc %u counts is %lu: "
++		   "rank %d in top %d (candidate analysis)\n\n",
++		   ref_sorted[i].first, loc_count,
++		   ref_rank[ref_sorted[i].first], PREFETCH_REF_TOPN);
++	}
++    }
++  return;
++}
++
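The ranking step above boils down to an ordinary descending sort of (location, count) pairs followed by assigning 1-based ranks. A minimal standalone C++ sketch of the same idea (simplified stand-in types, not GCC internals):

    #include <algorithm>
    #include <cstdio>
    #include <map>
    #include <vector>

    typedef unsigned location;
    typedef long long count_t;

    int main ()
    {
      std::vector<std::pair<location, count_t> > refs
        = { {101, 40}, {102, 900}, {103, 250} };
      /* Descending by count, like event_count_cmp.  */
      std::sort (refs.begin (), refs.end (),
                 [] (const std::pair<location, count_t> &a,
                     const std::pair<location, count_t> &b)
                 { return a.second > b.second; });
      std::map<location, unsigned> rank;   /* plays the role of ref_rank  */
      for (unsigned i = 0; i < refs.size (); ++i)
        rank[refs[i].first] = i + 1;
      std::printf ("loc 102 rank %u\n", rank[102]);  /* prints rank 1  */
      return 0;
    }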
+ 
+ /* Dumps information about memory reference */
+ static void
+ dump_mem_details (FILE *file, tree base, tree step,
+@@ -479,6 +628,30 @@ idx_analyze_ref (tree base, tree *index, void *data)
+   return true;
+ }
+ 
++/* Dumps information about the ar_data structure.  */
++
++static void
++dump_ar_data_details (FILE *file, tree ref, struct ar_data &ar_data)
++{
++  print_generic_expr (file, ref, TDF_SLIM);
++  fprintf (file, "\n");
++  if (*(ar_data.step))
++    {
++      fprintf (file, " step ");
++      if (cst_and_fits_in_hwi (*(ar_data.step)))
++	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
++		 int_cst_value (*(ar_data.step)));
++      else
++	print_generic_expr (file, *(ar_data.step), TDF_SLIM);
++    }
++  fprintf (file, "\n");
++  if (*(ar_data.delta))
++    {
++      fprintf (file, " delta " HOST_WIDE_INT_PRINT_DEC "\n",
++	       *(ar_data.delta));
++    }
++}
++
+ /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
+    STEP are integer constants and iter is number of iterations of LOOP.  The
+    reference occurs in statement STMT.  Strips nonaddressable component
+@@ -526,7 +699,17 @@ analyze_ref (class loop *loop, tree *ref_p, tree *base,
+   ar_data.stmt = stmt;
+   ar_data.step = step;
+   ar_data.delta = delta;
+-  return for_each_index (base, idx_analyze_ref, &ar_data);
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      dump_ar_data_details (dump_file, ref, ar_data);
++    }
++  bool idx_flag = for_each_index (base, idx_analyze_ref, &ar_data);
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "idx_flag = %d \n\n", idx_flag);
++    }
++  return idx_flag;
+ }
+ 
+ /* Record a memory reference REF to the list REFS.  The reference occurs in
+@@ -601,6 +784,55 @@ gather_memory_references_ref (class loop *loop, struct mem_ref_group **refs,
+   return true;
+ }
+ 
++/* Determine whether to collect the memory references based on the
++   ranking of ref cache-miss counts.  */
++
++static bool
++should_gather_memory_references (gimple *stmt)
++{
++  if (!(profile_exist (CACHE_MISSES)))
++    {
++      return true;
++    }
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "stmt:");
++      print_gimple_stmt (dump_file, stmt, 0);
++      fprintf (dump_file, "\n");
++    }
++  if (ref_rank.count (gimple_location (stmt)) == 0)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  fprintf (dump_file, "stmt location not found, skip prefetch "
++		   "analysis\n");
++	}
++      return false;
++    }
++  gcov_type loc_count
++    = event_get_loc_count (gimple_location (stmt), CACHE_MISSES);
++  if (ref_rank[gimple_location (stmt)] > PREFETCH_REF_TOPN)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  fprintf (dump_file, "stmt loc %u counts is %lu: "
++		   "rank %d exceeds topn %d, skip prefetch "
++		   "analysis\n",
++		   gimple_location (stmt), loc_count,
++		   ref_rank[gimple_location (stmt)], PREFETCH_REF_TOPN);
++	}
++      return false;
++    }
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "stmt loc %u counts is %lu: rank %d in top %d, "
++	       "continue prefetch analysis\n",
++	       gimple_location (stmt), loc_count,
++	       ref_rank[gimple_location (stmt)], PREFETCH_REF_TOPN);
++    }
++  return true;
++}
++
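A condensed sketch of the gate that should_gather_memory_references implements: a reference is analyzed only if its location was sampled at all and its rank is within PREFETCH_REF_TOPN (standalone illustration with simplified types, not GCC internals):

    #include <map>

    typedef unsigned location;

    static bool
    should_analyze_ref (const std::map<location, unsigned> &rank,
                        location loc, unsigned topn)
    {
      std::map<location, unsigned>::const_iterator it = rank.find (loc);
      if (it == rank.end ())   /* no cache-miss samples for this location  */
        return false;
      return it->second <= topn;
    }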
+ /* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
+    true if there are no other memory references inside the loop.  */
+ 
+@@ -626,6 +858,13 @@ gather_memory_references (class loop *loop, bool *no_other_refs, unsigned *ref_c
+       if (bb->loop_father != loop)
+ 	continue;
+ 
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  fprintf (dump_file, "==== the %dth loop bb body ====\n", i);
++	  gimple_dump_bb (dump_file, bb, 0, dump_flags);
++	  fprintf (dump_file, "\n");
++	}
++
+       for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
+ 	{
+ 	  stmt = gsi_stmt (bsi);
+@@ -642,20 +881,31 @@ gather_memory_references (class loop *loop, bool *no_other_refs, unsigned *ref_c
+ 	  if (! gimple_vuse (stmt))
+ 	    continue;
+ 
++	  if (!should_gather_memory_references (stmt))
++	    continue;
++
+ 	  lhs = gimple_assign_lhs (stmt);
+ 	  rhs = gimple_assign_rhs1 (stmt);
+ 
+ 	  if (REFERENCE_CLASS_P (rhs))
+ 	    {
+-	      *no_other_refs &= gather_memory_references_ref (loop, &refs,
+-							      rhs, false, stmt);
+-	      *ref_count += 1;
++	      if (dump_file && (dump_flags & TDF_DETAILS))
++		{
++		  fprintf (dump_file, "====> the %dth ref \n", *ref_count);
++		}
++	      *no_other_refs &= gather_memory_references_ref (loop, &refs, rhs,
++							      false, stmt);
++	      *ref_count += 1;
+ 	    }
+ 	  if (REFERENCE_CLASS_P (lhs))
+ 	    {
+-	      *no_other_refs &= gather_memory_references_ref (loop, &refs,
+-							      lhs, true, stmt);
+-	      *ref_count += 1;
++	      if (dump_file && (dump_flags & TDF_DETAILS))
++		{
++		  fprintf (dump_file, "====> the %dth ref \n", *ref_count);
++		}
++	      *no_other_refs &= gather_memory_references_ref (loop, &refs, lhs,
++							      true, stmt);
++	      *ref_count += 1;
+ 	    }
+ 	}
+     }
+@@ -1168,9 +1418,9 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
+   bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
+ 
+   if (dump_file && (dump_flags & TDF_DETAILS))
+-    fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n",
+-	     nontemporal ? " nontemporal" : "",
+-	     ref->group->uid, ref->uid);
++    fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n",
++	     nontemporal ? " nontemporal" : "",
++	     ref->group->uid, ref->uid);
+ 
+   bsi = gsi_for_stmt (ref->stmt);
+ 
+@@ -1875,6 +2125,306 @@ insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
+   return false;
+ }
+ 
++/* Obtain the probability of edge E.  */
++
++static float
++get_edge_prob (edge e)
++{
++  /* Limit the minimum probability value.  */
++  const float MINNUM_PROB = 0.00001f;
++  float fvalue = 1;
++
++  profile_probability probability = e->probability;
++  if (probability.initialized_p ())
++    {
++      fvalue = probability.to_reg_br_prob_base () / float (REG_BR_PROB_BASE);
++      if (fvalue < MINNUM_PROB && probability.to_reg_br_prob_base ())
++	{
++	  fvalue = MINNUM_PROB;
++	}
++    }
++  return fvalue;
++}
++
++/* Dump the bb information in a loop.  */
++
++static void
++dump_loop_bb (struct loop *loop)
++{
++  basic_block *body = get_loop_body_in_dom_order (loop);
++  basic_block bb = NULL;
++
++  for (unsigned i = 0; i < loop->num_nodes; i++)
++    {
++      bb = body[i];
++      if (bb->loop_father != loop)
++	{
++	  continue;
++	}
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  fprintf (dump_file, "===== the %dth loop bb body =======\n", i);
++	  gimple_dump_bb (dump_file, bb, 0, dump_flags);
++	  fprintf (dump_file, "\n");
++	}
++    }
++  free (body);
++}
++
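get_edge_prob converts GCC's fixed-point branch probability (expressed against REG_BR_PROB_BASE, which is 10000) into a float, flooring nonzero values at 1e-5 so that later products and divisions never collapse to zero. A standalone sketch of the same clamping, with the fixed-point value passed in directly:

    /* Standalone sketch of the clamping in get_edge_prob.  */
    static float
    edge_prob (int numerator, int base /* e.g. REG_BR_PROB_BASE = 10000 */)
    {
      const float MINNUM_PROB = 0.00001f;
      float f = float (numerator) / float (base);
      if (f < MINNUM_PROB && numerator != 0)
        f = MINNUM_PROB;   /* keep rare-but-taken edges above zero  */
      return f;
    }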
++/* Obtain the branch probability information of each basic block
++   in the loop.  */
++
++static void
++get_bb_branch_prob (hash_map <basic_block, bb_bp> &bb_branch_prob,
++		    struct loop *loop)
++{
++  basic_block *body = get_loop_body (loop);
++  basic_block bb = NULL;
++  for (unsigned i = 0; i < loop->num_nodes; i++)
++    {
++      bb = body[i];
++      if (bb->loop_father != loop)
++	{
++	  continue;
++	}
++      bb_bp &branch_prob = bb_branch_prob.get_or_insert (bb);
++      branch_prob.bb = bb;
++      branch_prob.true_edge_bb = NULL;
++      branch_prob.false_edge_bb = NULL;
++      branch_prob.true_edge_prob = 0;
++      branch_prob.false_edge_prob = 0;
++      branch_prob.bb_prob = 0;
++
++      gimple *stmt = last_stmt (bb);
++      if (stmt && gimple_code (stmt) == GIMPLE_COND)
++	{
++	  if (EDGE_COUNT (bb->succs) != 2)
++	    {
++	      if (dump_file && (dump_flags & TDF_DETAILS))
++		fprintf (dump_file, "The number of successor edges of bb "
++			 "is abnormal\n");
++	      continue;
++	    }
++	  edge true_edge = NULL;
++	  edge false_edge = NULL;
++	  extract_true_false_edges_from_block (bb, &true_edge, &false_edge);
++
++	  /* If this is an exiting bb and the destination bb of the edge
++	     does not belong to the current loop, the information of the
++	     edge is not recorded.  */
++	  if (true_edge->dest->loop_father == loop)
++	    {
++	      branch_prob.true_edge_bb = true_edge->dest;
++	      branch_prob.true_edge_prob = get_edge_prob (true_edge);
++	    }
++	  if (false_edge->dest->loop_father == loop)
++	    {
++	      branch_prob.false_edge_bb = false_edge->dest;
++	      branch_prob.false_edge_prob = get_edge_prob (false_edge);
++	    }
++	}
++
++      edge e = find_fallthru_edge (bb->succs);
++      if (e)
++	{
++	  branch_prob.true_edge_bb = e->dest;
++	  branch_prob.true_edge_prob = get_edge_prob (e);
++	}
++    }
++}
++
++/* Traverse each bb in the loop and prune fake loops.  */
++
++static bool
++traverse_prune_bb_branch (hash_map <basic_block, bb_bp> &bb_branch_prob,
++			  int &max_path, hash_set <basic_block> &path_node,
++			  basic_block current_bb, basic_block latch_bb)
++{
++  /* Limit the maximum number of analysis paths.  */
++  if (max_path <= 0 || current_bb == NULL)
++    return false;
++
++  /* Do not join edges that do not form a complete loop.  */
++  bb_bp *bb_bp_node = bb_branch_prob.get (current_bb);
++  if (bb_bp_node == NULL || (bb_bp_node->true_edge_bb == NULL
++      && bb_bp_node->false_edge_bb == NULL))
++    return false;
++
++  if (current_bb == latch_bb)
++    {
++      max_path--;
++      return true;
++    }
++
++  /* Do not follow edges that return to a node already on the
++     current path.  */
++  if (path_node.contains (bb_bp_node->true_edge_bb)
++      || path_node.contains (bb_bp_node->false_edge_bb))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "fake loop: in bb%d\n", current_bb->index);
++      return false;
++    }
++
++  path_node.add (current_bb);
++  if (bb_bp_node->true_edge_bb)
++    {
++      if (traverse_prune_bb_branch (bb_branch_prob, max_path,
++	    path_node, bb_bp_node->true_edge_bb, latch_bb) == false)
++	return false;
++    }
++  if (bb_bp_node->false_edge_bb)
++    {
++      if (traverse_prune_bb_branch (bb_branch_prob, max_path,
++	    path_node, bb_bp_node->false_edge_bb, latch_bb) == false)
++	return false;
++    }
++  path_node.remove (current_bb);
++
++  max_path--;
++  return true;
++}
++
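traverse_prune_bb_branch is a depth-first walk with a shared path budget: it rejects walks that revisit a node on the current path (a "fake loop") or that exhaust the budget before every path reaches the latch. A standalone sketch of the same walk over a plain adjacency list (simplified; no GCC types):

    #include <set>
    #include <vector>

    static bool
    walk (const std::vector<std::vector<int> > &succ, int cur, int latch,
          int &budget, std::set<int> &on_path)
    {
      if (budget <= 0)
        return false;                  /* path budget exhausted  */
      if (cur == latch)
        {
          budget--;
          return true;                 /* path closed at the latch  */
        }
      for (size_t i = 0; i < succ[cur].size (); i++)
        if (on_path.count (succ[cur][i]))
          return false;                /* back edge to a node on the path  */
      on_path.insert (cur);
      for (size_t i = 0; i < succ[cur].size (); i++)
        if (!walk (succ, succ[cur][i], latch, budget, on_path))
          return false;
      on_path.erase (cur);
      budget--;
      return true;
    }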
++/* Traverse and calculate the probability of each basic block.  */
++
++static void
++traverse_calculate_bb_prob (hash_map <basic_block, bb_bp> &bb_branch_prob,
++			    basic_block current_bb, basic_block latch_bb,
++			    float prob)
++{
++  /* Upper bound for a bb's accumulated probability: at most 100%,
++     plus a small delta for floating-point rounding.  */
++  const float MAX_BB_PROBABILITY = 1.001f;
++
++  if (current_bb == NULL)
++    {
++      return;
++    }
++  bb_bp *bb_bp_node = bb_branch_prob.get (current_bb);
++  if (bb_bp_node == NULL)
++    {
++      return;
++    }
++  bb_bp_node->bb_prob += prob;
++
++  gcc_assert (bb_bp_node->bb_prob <= MAX_BB_PROBABILITY);
++
++  if (bb_bp_node->true_edge_bb == NULL && bb_bp_node->false_edge_bb == NULL)
++    {
++      return;
++    }
++  if (current_bb == latch_bb)
++    {
++      return;
++    }
++
++  bool assign = (bb_bp_node->true_edge_bb && bb_bp_node->false_edge_bb);
++  if (bb_bp_node->true_edge_bb)
++    {
++      float assign_prob = assign ? bb_bp_node->true_edge_prob * prob : prob;
++      traverse_calculate_bb_prob (bb_branch_prob,
++				  bb_bp_node->true_edge_bb, latch_bb,
++				  assign_prob);
++    }
++  if (bb_bp_node->false_edge_bb)
++    {
++      float assign_prob = assign ? bb_bp_node->false_edge_prob * prob : prob;
++      traverse_calculate_bb_prob (bb_branch_prob,
++				  bb_bp_node->false_edge_bb, latch_bb,
++				  assign_prob);
++    }
++  return;
++}
++
++/* Obtain the probability of each basic block in the loop.  */
++
++static bool
++get_bb_prob (hash_map <basic_block, bb_bp> &bb_branch_prob, struct loop *loop)
++{
++  /* The upper limit of the number of branch paths in the loop.  */
++  const int MAX_BB_BRANCH_PATH = 10000;
++
++  if (loop->header == NULL || loop->latch == NULL
++      || loop->header == loop->latch)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "get_bb_prob failed: missing header bb or "
++		 "latch bb\n");
++      return false;
++    }
++
++  bb_bp *latch_branch_prob = bb_branch_prob.get (loop->latch);
++  bb_bp *header_branch_prob = bb_branch_prob.get (loop->header);
++  if (header_branch_prob == NULL || latch_branch_prob == NULL
++      || (latch_branch_prob->true_edge_bb != header_branch_prob->bb
++	  && latch_branch_prob->false_edge_bb != header_branch_prob->bb))
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "get_bb_prob failed: unexpected loop shape\n");
++      return false;
++    }
++
++  hash_set <basic_block> path_node;
++  int max_path = MAX_BB_BRANCH_PATH;
++  if (traverse_prune_bb_branch (bb_branch_prob, max_path, path_node,
++				header_branch_prob->bb, loop->latch) == false)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	fprintf (dump_file, "traverse_prune_bb_branch failed.\n");
++      return false;
++    }
++  traverse_calculate_bb_prob (bb_branch_prob,
++			      header_branch_prob->bb, loop->latch, 1);
++
++  return true;
++}
++
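traverse_calculate_bb_prob seeds the loop header with probability 1 and multiplies each branch probability down every path; at join points the contributions of the incoming paths add back up, so a well-formed loop body keeps the latch at probability 1. A worked example on a diamond CFG (the header branches 0.9/0.1 and both arms rejoin at the latch):

    #include <cstdio>

    int main ()
    {
      float header = 1.0f;
      float arm_a = header * 0.9f;   /* true edge  */
      float arm_b = header * 0.1f;   /* false edge  */
      float latch = arm_a + arm_b;   /* contributions add at the join  */
      std::printf ("A=%.2f B=%.2f latch=%.2f\n", arm_a, arm_b, latch);
      return 0;                      /* prints A=0.90 B=0.10 latch=1.00  */
    }

estimate_num_loop_insns, in the next hunk, then weights each block's instruction count by exactly this per-block probability.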
++/* Computes an estimated number of insns in LOOP, weighted by WEIGHTS.  */
++
++static unsigned
++estimate_num_loop_insns (struct loop *loop, eni_weights *weights)
++{
++  basic_block *body = get_loop_body_in_dom_order (loop);
++  gimple_stmt_iterator gsi;
++  float size = 0;
++  basic_block bb = NULL;
++  hash_map <basic_block, bb_bp> bb_branch_prob;
++
++  if (prefetch_level >= BRANCH_WEIGHTED_AHEAD)
++    {
++      get_bb_branch_prob (bb_branch_prob, loop);
++      if (get_bb_prob (bb_branch_prob, loop) == false)
++	{
++	  dump_loop_bb (loop);
++	  return 0;
++	}
++    }
++
++  for (unsigned i = 0; i < loop->num_nodes; i++)
++    {
++      bb = body[i];
++      /* For nested loops, do not count the bbs of inner loops.  */
++      if (bb->loop_father != loop)
++	{
++	  continue;
++	}
++
++      float size_tmp = 0;
++      for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
++	{
++	  size_tmp += estimate_num_insns (gsi_stmt (gsi), weights);
++	}
++
++      if (prefetch_level >= BRANCH_WEIGHTED_AHEAD)
++	{
++	  float bb_prob = bb_branch_prob.get (bb)->bb_prob;
++	  size += size_tmp * bb_prob;
++	}
++      else
++	{
++	  size += size_tmp;
++	}
++    }
++  free (body);
++
++  return unsigned (size);
++}
+ 
+ /* Issue prefetch instructions for array references in LOOP.  Returns
+    true if the LOOP was unrolled.  */
+@@ -1899,7 +2449,15 @@ loop_prefetch_arrays (class loop *loop)
+ 
+   /* FIXME: the time should be weighted by the probabilities of the blocks in
+      the loop body.  */
+-  time = tree_num_loop_insns (loop, &eni_time_weights);
++
++  if (prefetch_level >= REFINE_BB_AHEAD)
++    {
++      time = estimate_num_loop_insns (loop, &eni_time_weights);
++    }
++  else
++    {
++      time = tree_num_loop_insns (loop, &eni_time_weights);
++    }
+   if (time == 0)
+     return false;
+ 
+@@ -1913,7 +2471,14 @@ loop_prefetch_arrays (class loop *loop)
+   if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
+     return false;
+ 
+-  ninsns = tree_num_loop_insns (loop, &eni_size_weights);
++  if (prefetch_level >= REFINE_BB_AHEAD)
++    {
++      ninsns = estimate_num_loop_insns (loop, &eni_size_weights);
++    }
++  else
++    {
++      ninsns = tree_num_loop_insns (loop, &eni_size_weights);
++    }
+ 
+   /* Step 1: gather the memory references.  */
+   refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
+@@ -1978,10 +2543,49 @@ fail:
+   return unrolled;
+ }
+ 
++/* Determine if LOOP has a high execution rate.  */
++
++static bool
++is_high_exec_rate_loop (struct loop *loop)
++{
++  vec<edge> exit_edges = get_loop_exit_edges (loop);
++  if (exit_edges == vNULL)
++    {
++      return false;
++    }
++
++  unsigned i = 0;
++  gcov_type exit_count = 0;
++  edge e = NULL;
++  float loop_exec_rate = 0;
++  gcov_type header_bb_count = loop->header->count.to_gcov_type ();
++  FOR_EACH_VEC_ELT (exit_edges, i, e)
++    {
++      gcov_type exiting_bb_count = e->src->count.to_gcov_type ();
++      float exit_edge_prob = get_edge_prob (e);
++      exit_count += exit_edge_prob * exiting_bb_count;
++
++      loop_exec_rate = 1.0 - ((double) exit_count / header_bb_count);
++
++      if (loop_exec_rate < (float) LOOP_EXECUTION_RATE / 100.0)
++	{
++	  return false;
++	}
++    }
++
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "loop with high execution rate: %f >= %f\n\n",
++	       loop_exec_rate, (float) LOOP_EXECUTION_RATE / 100.0);
++      dump_loop_bb (loop);
++    }
++  return true;
++}
++
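is_high_exec_rate_loop accepts a loop when 1 - sum(exit_edge_prob * exiting_bb_count) / header_count stays at or above param-high-loop-execution-rate (default 95%). A standalone sketch of the test, simplified to a single exit edge:

    /* Standalone sketch of the acceptance test, single exit edge only.  */
    static bool
    high_exec_rate (double exit_prob, long exiting_count, long header_count,
                    unsigned rate_percent /* e.g. 95 */)
    {
      double exit_count = exit_prob * exiting_count;
      double exec_rate = 1.0 - exit_count / header_count;
      return exec_rate >= rate_percent / 100.0;
    }

For example, a loop whose header runs 1000 times with one exit edge of probability 0.01 from a block that runs 1000 times gives exec_rate = 1 - 10/1000 = 0.99, which passes the default 95% bar.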
+ /* Issue prefetch instructions for array references in loops.  */
+ 
+ unsigned int
+-tree_ssa_prefetch_arrays (void)
++tree_ssa_prefetch_arrays (function *fun)
+ {
+   class loop *loop;
+   bool unrolled = false;
+@@ -2012,6 +2616,12 @@ tree_ssa_prefetch_arrays (void)
+ 	       param_min_insn_to_prefetch_ratio);
+       fprintf (dump_file, "    min insn-to-mem ratio: %d \n",
+ 	       param_prefetch_min_insn_to_mem_ratio);
++      fprintf (dump_file, "    prefetch_func_topn: %d \n",
++	       param_prefetch_func_topn);
++      fprintf (dump_file, "    prefetch_ref_topn: %d \n",
++	       param_prefetch_ref_topn);
++      fprintf (dump_file, "    high_loop_execution_rate: %d \n",
++	       LOOP_EXECUTION_RATE);
+       fprintf (dump_file, "\n");
+     }
+ 
+@@ -2028,13 +2638,42 @@ tree_ssa_prefetch_arrays (void)
+       set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
+     }
+ 
+-  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
++  enum li_flags LI = LI_FROM_INNERMOST;
++
++  if (profile_exist (CACHE_MISSES))
++    {
++      LI = LI_ONLY_INNERMOST;
++    }
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "Loop processing mode %d:\n", LI);
++    }
++
++  if (profile_exist (CACHE_MISSES))
++    {
++      sort_ref_by_event_count (fun, CACHE_MISSES);
++    }
++
++  FOR_EACH_LOOP (loop, LI)
+     {
+       if (dump_file && (dump_flags & TDF_DETAILS))
+-	fprintf (dump_file, "Processing loop %d:\n", loop->num);
++	{
++	  fprintf (dump_file, "======================================\n");
++	  fprintf (dump_file, "Processing loop %d:\n", loop->num);
++	  fprintf (dump_file, "======================================\n");
++	  flow_loop_dump (loop, dump_file, NULL, 1);
++	  fprintf (dump_file, "\n\n");
++	}
+ 
+-      unrolled |= loop_prefetch_arrays (loop);
++      if (profile_exist (CACHE_MISSES))
++	{
++	  if (!is_high_exec_rate_loop (loop))
++	    {
++	      continue;
++	    }
++	}
+ 
++      unrolled |= loop_prefetch_arrays (loop);
+       if (dump_file && (dump_flags & TDF_DETAILS))
+ 	fprintf (dump_file, "\n\n");
+     }
+@@ -2049,6 +2688,56 @@ tree_ssa_prefetch_arrays (void)
+   return todo_flags;
+ }
+ 
++/* Determine whether to analyze the function according to
++   the ranking of functions by cache-miss counts.  */
++
++static bool
++should_analyze_func_p (void)
++{
++  gcov_type decl_uid = DECL_UID (current_function_decl);
++  struct rank_info func_rank_info =
++    event_get_func_rank (decl_uid, CACHE_MISSES);
++  if (func_rank_info.total == 0)
++    {
++      return false;
++    }
++  gcov_type func_count = event_get_func_count (decl_uid, CACHE_MISSES);
++  if (func_count == 0)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  fprintf (dump_file, "no profile data for function uid %d, "
++		   "skip prefetch analysis\n",
++		   decl_uid);
++	}
++      return false;
++    }
++  if (func_rank_info.rank > PREFETCH_FUNC_TOPN
++      || func_count < PREFETCH_FUNC_COUNTS_THRESHOLD)
++    {
++      if (dump_file && (dump_flags & TDF_DETAILS))
++	{
++	  fprintf (dump_file, "function uid %d total counts is %lu: "
++		   "rank %d vs. topn %d, counts %lu vs. threshold %lu, "
++		   "skip prefetch analysis\n",
++		   decl_uid, func_count,
++		   func_rank_info.rank, PREFETCH_FUNC_TOPN,
++		   func_count, PREFETCH_FUNC_COUNTS_THRESHOLD);
++	}
++      return false;
++    }
++  if (dump_file && (dump_flags & TDF_DETAILS))
++    {
++      fprintf (dump_file, "function uid %d total counts is %lu: "
++	       "rank %d in topn %d, counts %lu >= threshold %lu, "
++	       "continue prefetch analysis\n",
++	       decl_uid, func_count,
++	       func_rank_info.rank, PREFETCH_FUNC_TOPN,
++	       func_count, PREFETCH_FUNC_COUNTS_THRESHOLD);
++    }
++  return true;
++}
++
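should_analyze_func_p applies a double gate at function granularity: the function's cache-miss rank must fall within param-prefetch-func-topn and its total miss count must reach param-prefetch-func-counts-threshold. A standalone sketch of that predicate:

    /* Standalone sketch of the function-level gate.  */
    static bool
    func_passes_gate (unsigned rank, long long count,
                      unsigned topn /* default 3 */,
                      long long threshold /* default 100 */)
    {
      if (count == 0)            /* no profile data for this function  */
        return false;
      return rank <= topn && count >= threshold;
    }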
+ /* Prefetching.  */
+ 
+ namespace {
+ 
+@@ -2085,6 +2774,18 @@ pass_loop_prefetch::execute (function *fun)
+   if (number_of_loops (fun) <= 1)
+     return 0;
+ 
++  /* Filter only when cache-miss profile data is present.  If
++     should_analyze_func_p fails (for example, the function has no
++     cache-miss counts), skip even the native prefetch processing so
++     that the accuracy of the profile-guided analysis is preserved.  */
++  if (profile_exist (CACHE_MISSES))
++    {
++      if (!should_analyze_func_p ())
++	{
++	  return 0;
++	}
++    }
++
+   if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0)
+     {
+       static bool warned = false;
+@@ -2099,7 +2800,7 @@ pass_loop_prefetch::execute (function *fun)
+       return 0;
+     }
+ 
+-  return tree_ssa_prefetch_arrays ();
++  return tree_ssa_prefetch_arrays (fun);
+ }
+ 
+ } // anon namespace
+-- 
+2.27.0.windows.1