diff options
Diffstat (limited to '0026-AutoFDO-Enable-discriminator-and-MCF-algorithm-on-Au.patch')
-rw-r--r-- | 0026-AutoFDO-Enable-discriminator-and-MCF-algorithm-on-Au.patch | 353 |
1 files changed, 353 insertions, 0 deletions
diff --git a/0026-AutoFDO-Enable-discriminator-and-MCF-algorithm-on-Au.patch b/0026-AutoFDO-Enable-discriminator-and-MCF-algorithm-on-Au.patch new file mode 100644 index 0000000..bbc98c6 --- /dev/null +++ b/0026-AutoFDO-Enable-discriminator-and-MCF-algorithm-on-Au.patch @@ -0,0 +1,353 @@ +From eb58d920a95696d8d5a7db9a6d640d4494fb023f Mon Sep 17 00:00:00 2001 +From: liyancheng <412998149@qq.com> +Date: Tue, 25 Jan 2022 16:57:28 +0800 +Subject: [PATCH 26/28] [AutoFDO] Enable discriminator and MCF algorithm on + AutoFDO + +1. Support discriminator for distinguishes among several + basic blocks that share a common locus, allowing for + more accurate autofdo. + +2. Using option -fprofile-correction for calling MCF algorithm + to smooth non conservative BB counts. +--- + gcc/auto-profile.c | 172 ++++++++++++++++++++++++++++++++++++++++++++- + gcc/cfghooks.c | 7 ++ + gcc/ipa-cp.c | 21 ++++++ + gcc/opts.c | 5 +- + gcc/tree-inline.c | 14 ++++ + 5 files changed, 215 insertions(+), 4 deletions(-) + +diff --git a/gcc/auto-profile.c b/gcc/auto-profile.c +index aced8fca5..e6164b91b 100644 +--- a/gcc/auto-profile.c ++++ b/gcc/auto-profile.c +@@ -678,6 +678,17 @@ string_table::get_index (const char *name) const + if (name == NULL) + return -1; + string_index_map::const_iterator iter = map_.find (name); ++ /* Function name may be duplicate. Try to distinguish by the ++ #file_name#function_name defined by the autofdo tool chain. 
*/ ++ if (iter == map_.end ()) ++ { ++ char* file_name = get_original_name (lbasename (dump_base_name)); ++ char* file_func_name ++ = concat ("#", file_name, "#", name, NULL); ++ iter = map_.find (file_func_name); ++ free (file_name); ++ free (file_func_name); ++ } + if (iter == map_.end ()) + return -1; + +@@ -866,7 +877,7 @@ function_instance::read_function_instance (function_instance_stack *stack, + + for (unsigned i = 0; i < num_pos_counts; i++) + { +- unsigned offset = gcov_read_unsigned () & 0xffff0000; ++ unsigned offset = gcov_read_unsigned (); + unsigned num_targets = gcov_read_unsigned (); + gcov_type count = gcov_read_counter (); + s->pos_counts[offset].count = count; +@@ -945,6 +956,10 @@ autofdo_source_profile::get_count_info (gimple *stmt, count_info *info) const + function_instance *s = get_function_instance_by_inline_stack (stack); + if (s == NULL) + return false; ++ if (s->get_count_info (stack[0].second + stmt->bb->discriminator, info)) ++ { ++ return true; ++ } + return s->get_count_info (stack[0].second, info); + } + +@@ -1583,6 +1598,68 @@ afdo_propagate (bb_set *annotated_bb) + } + } + ++/* Process the following scene when the branch probability ++ inversion when do function afdo_propagate (). E.g. ++ BB_NUM (sample count) ++ BB1 (1000) ++ / \ ++ BB2 (10) BB3 (0) ++ \ / ++ BB4 ++ In afdo_propagate(), count of BB3 is calculated by ++ COUNT (BB3) = 990 (990 = COUNT (BB1) - COUNT (BB2) = 1000 - 10) ++ ++ In fact, BB3 may be colder than BB2 by sample count. ++ ++ This function allocate source BB count to each succ BB by sample ++ rate, E.g. 
++ BB2_COUNT = BB1_COUNT * (BB2_COUNT / (BB2_COUNT + BB3_COUNT)) */ ++ ++static void ++afdo_preprocess_bb_count () ++{ ++ basic_block bb; ++ FOR_ALL_BB_FN (bb, cfun) ++ { ++ if (bb->count.ipa_p () && EDGE_COUNT (bb->succs) > 1 ++ && bb->count > profile_count::zero ().afdo ()) ++ { ++ basic_block bb1 = EDGE_SUCC (bb, 0)->dest; ++ basic_block bb2 = EDGE_SUCC (bb, 1)->dest; ++ if (single_succ_p (bb1) && single_succ_p (bb2) ++ && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest) ++ { ++ gcov_type max_count = 0; ++ gcov_type total_count = 0; ++ edge e; ++ edge_iterator ei; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (!e->dest->count.ipa_p ()) ++ { ++ continue; ++ } ++ max_count = MAX(max_count, e->dest->count.to_gcov_type ()); ++ total_count += e->dest->count.to_gcov_type (); ++ } ++ /* The branch probability can only be inverted when ++ bb_count > max_count * 2. */ ++ if (max_count > 0 ++ && bb->count.to_gcov_type () > max_count * 2) ++ { ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ gcov_type target_count = bb->count.to_gcov_type () ++ * e->dest->count.to_gcov_type () / total_count; ++ e->dest->count ++ = profile_count::from_gcov_type (target_count).afdo (); ++ } ++ } ++ } ++ } ++ } ++} ++ + /* Propagate counts on control flow graph and calculate branch + probabilities. */ + +@@ -1608,6 +1685,7 @@ afdo_calculate_branch_prob (bb_set *annotated_bb) + } + + afdo_find_equiv_class (annotated_bb); ++ afdo_preprocess_bb_count (); + afdo_propagate (annotated_bb); + + FOR_EACH_BB_FN (bb, cfun) +@@ -1711,6 +1789,82 @@ afdo_vpt_for_early_inline (stmt_set *promoted_stmts) + return false; + } + ++/* Preparation before executing MCF algorithm. */ ++ ++static void ++afdo_init_mcf () ++{ ++ basic_block bb; ++ edge e; ++ edge_iterator ei; ++ ++ if (dump_file) ++ { ++ fprintf (dump_file, "\n init calling mcf_smooth_cfg (). \n"); ++ } ++ ++ /* Step1: when using MCF, BB ids must be continuous, ++ so we need compact_blocks (). 
*/ ++ compact_blocks (); ++ ++ /* Step2: allocate memory for MCF input data. */ ++ bb_gcov_counts.safe_grow_cleared (cfun->cfg->x_last_basic_block); ++ edge_gcov_counts = new hash_map<edge, gcov_type>; ++ ++ /* Step3: init MCF input data from cfg. */ ++ FOR_ALL_BB_FN (bb, cfun) ++ { ++ /* Init BB count for MCF. */ ++ bb_gcov_count (bb) = bb->count.to_gcov_type (); ++ ++ gcov_type total_count = 0; ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ total_count += e->dest->count.to_gcov_type (); ++ } ++ ++ /* If there is no sample in each successor blocks, source ++ BB samples are allocated to each edge by branch static prob. */ ++ ++ FOR_EACH_EDGE (e, ei, bb->succs) ++ { ++ if (total_count == 0) ++ { ++ edge_gcov_count (e) = e->src->count.to_gcov_type () ++ * e->probability.to_reg_br_prob_base () / REG_BR_PROB_BASE; ++ } ++ else ++ { ++ edge_gcov_count (e) = e->src->count.to_gcov_type () ++ * e->dest->count.to_gcov_type () / total_count; ++ } ++ } ++ } ++} ++ ++/* Free the resources used by MCF and reset BB count from MCF result, ++ branch probability has been updated in mcf_smooth_cfg (). */ ++ ++static void ++afdo_process_after_mcf () ++{ ++ basic_block bb; ++ /* Reset BB count from MCF result. */ ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ if (bb_gcov_count (bb)) ++ { ++ bb->count ++ = profile_count::from_gcov_type (bb_gcov_count (bb)).afdo (); ++ } ++ } ++ ++ /* Clean up MCF resource. */ ++ bb_gcov_counts.release (); ++ delete edge_gcov_counts; ++ edge_gcov_counts = NULL; ++} ++ + /* Annotate auto profile to the control flow graph. Do not annotate value + profile for stmts in PROMOTED_STMTS. */ + +@@ -1762,8 +1916,20 @@ afdo_annotate_cfg (const stmt_set &promoted_stmts) + afdo_source_profile->mark_annotated (cfun->function_end_locus); + if (max_count > profile_count::zero ()) + { +- /* Calculate, propagate count and probability information on CFG. 
*/ +- afdo_calculate_branch_prob (&annotated_bb); ++ /* 1 means -fprofile-correction is enabled manually, and MCF ++ algorithm will be used to calculate count and probability. ++ Otherwise, use the default calculate algorithm. */ ++ if (flag_profile_correction == 1) ++ { ++ afdo_init_mcf (); ++ mcf_smooth_cfg (); ++ afdo_process_after_mcf (); ++ } ++ else ++ { ++ /* Calculate, propagate count and probability information on CFG. */ ++ afdo_calculate_branch_prob (&annotated_bb); ++ } + } + update_max_bb_count (); + profile_status_for_fn (cfun) = PROFILE_READ; +diff --git a/gcc/cfghooks.c b/gcc/cfghooks.c +index ea558b469..4ea490a8a 100644 +--- a/gcc/cfghooks.c ++++ b/gcc/cfghooks.c +@@ -526,6 +526,9 @@ split_block_1 (basic_block bb, void *i) + return NULL; + + new_bb->count = bb->count; ++ /* Copy discriminator from original bb for distinguishes among ++ several basic blocks that share a common locus, allowing for ++ more accurate autofdo. */ + new_bb->discriminator = bb->discriminator; + + if (dom_info_available_p (CDI_DOMINATORS)) +@@ -1091,6 +1094,10 @@ duplicate_block (basic_block bb, edge e, basic_block after, copy_bb_data *id) + move_block_after (new_bb, after); + + new_bb->flags = (bb->flags & ~BB_DUPLICATED); ++ /* Copy discriminator from original bb for distinguishes among ++ several basic blocks that share a common locus, allowing for ++ more accurate autofdo. */ ++ new_bb->discriminator = bb->discriminator; + FOR_EACH_EDGE (s, ei, bb->succs) + { + /* Since we are creating edges from a new block to successors +diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c +index b1f0881bd..c208070c9 100644 +--- a/gcc/ipa-cp.c ++++ b/gcc/ipa-cp.c +@@ -4365,6 +4365,27 @@ update_profiling_info (struct cgraph_node *orig_node, + orig_node_count.dump (dump_file); + fprintf (dump_file, "\n"); + } ++ ++ /* When autofdo uses PMU as the sampling unit, the count of ++ cgraph_node->count cannot be obtained directly and will ++ be zero. 
Using it in apply_scale would cause the node ++ count to be incorrectly overestimated. So set orig_new_node_count ++ equal to orig_node_count, which is the same as the known error ++ handling. */ ++ if (orig_node->count == profile_count::zero ().afdo () ++ && new_node->count == profile_count::zero ().global0adjusted ()) ++ { ++ orig_new_node_count = (orig_sum + new_sum).apply_scale (12, 10); ++ ++ if (dump_file) ++ { ++ fprintf (dump_file, " node %s with zero count from afdo ", ++ new_node->dump_name ()); ++ fprintf (dump_file, " proceeding by pretending it was "); ++ orig_new_node_count.dump (dump_file); ++ fprintf (dump_file, "\n"); ++ } ++ } + } + + remainder = orig_node_count.combine_with_ipa_count (orig_node_count.ipa () +diff --git a/gcc/opts.c b/gcc/opts.c +index 642327296..7a39f618b 100644 +--- a/gcc/opts.c ++++ b/gcc/opts.c +@@ -2606,7 +2606,10 @@ common_handle_option (struct gcc_options *opts, + /* FALLTHRU */ + case OPT_fauto_profile: + enable_fdo_optimizations (opts, opts_set, value); +- SET_OPTION_IF_UNSET (opts, opts_set, flag_profile_correction, value); ++ /* 2 is special and means flag_profile_correction is turned on by ++ -fauto-profile. */ ++ SET_OPTION_IF_UNSET (opts, opts_set, flag_profile_correction, ++ (value ? 2 : 0)); + SET_OPTION_IF_UNSET (opts, opts_set, + param_early_inliner_max_iterations, 10); + break; +diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c +index efde5d158..8405a959c 100644 +--- a/gcc/tree-inline.c ++++ b/gcc/tree-inline.c +@@ -2015,6 +2015,10 @@ copy_bb (copy_body_data *id, basic_block bb, + basic_block_info automatically. */ + copy_basic_block = create_basic_block (NULL, (basic_block) prev->aux); + copy_basic_block->count = bb->count.apply_scale (num, den); ++ /* Copy discriminator from original bb for distinguishes among ++ several basic blocks that share a common locus, allowing for ++ more accurate autofdo. 
*/ ++ copy_basic_block->discriminator = bb->discriminator; + + copy_gsi = gsi_start_bb (copy_basic_block); + +@@ -3028,6 +3032,16 @@ copy_cfg_body (copy_body_data * id, + den += e->count (); + ENTRY_BLOCK_PTR_FOR_FN (cfun)->count = den; + } ++ /* When autofdo uses PMU as the sampling unit, the count of ++ ENTRY_BLOCK_PTR_FOR_FN cannot be obtained directly and will ++ be zero. Using it in adjust_for_ipa_scaling would cause the ++ inlined BB count to be incorrectly overestimated. So set den equal ++ to num, which is the source inline BB count, to avoid ++ the overestimation. */ ++ if (den == profile_count::zero ().afdo ()) ++ { ++ den = num; ++ } + + profile_count::adjust_for_ipa_scaling (&num, &den); + +-- +2.27.0.windows.1 + |