1 //===- CallGraphSort.cpp --------------------------------------------------===//
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 /// Implementation of Call-Chain Clustering from: Optimizing Function Placement
11 /// for Large-Scale Data-Center Applications
12 /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
14 /// The goal of this algorithm is to improve runtime performance of the final
15 /// executable by arranging code sections such that page table and i-cache
16 /// misses are minimized.
20 /// * An ordered list of input sections which are layed out as a unit. At the
21 /// beginning of the algorithm each input section has its own cluster and
22 /// the weight of the cluster is the sum of the weight of all incomming
24 /// * Call-Chain Clustering (C³) Heuristic
25 /// * Defines when and how clusters are combined. Pick the highest weighted
26 /// input section then add it to its most likely predecessor if it wouldn't
27 /// penalize it too much.
29 /// * The weight of the cluster divided by the size of the cluster. This is a
30 /// proxy for the ammount of execution time spent per byte of the cluster.
32 /// It does so given a call graph profile by the following:
33 /// * Build a weighted call graph from the call graph profile
34 /// * Sort input sections by weight
35 /// * For each input section starting with the highest weight
36 /// * Find its most likely predecessor cluster
37 /// * Check if the combined cluster would be too large, or would have too low
39 /// * If not, then combine the clusters.
40 /// * Sort non-empty clusters by density
42 //===----------------------------------------------------------------------===//
44 #include "CallGraphSort.h"
45 #include "OutputSections.h"
46 #include "SymbolTable.h"
51 using namespace lld::elf;
60 Cluster(int Sec, size_t S) {
61 Sections.push_back(Sec);
65 double getDensity() const {
68 return double(Weight) / double(Size);
71 std::vector<int> Sections;
74 uint64_t InitialWeight = 0;
75 std::vector<Edge> Preds;
82 DenseMap<const InputSectionBase *, int> run();
85 std::vector<Cluster> Clusters;
86 std::vector<const InputSectionBase *> Sections;
91 // Maximum ammount the combined cluster density can be worse than the original
92 // cluster to consider merging.
93 constexpr int MAX_DENSITY_DEGRADATION = 8;
95 // Maximum cluster size in bytes.
96 constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
97 } // end anonymous namespace
99 // Take the edge list in Config->CallGraphProfile, resolve symbol names to
100 // Symbols, and generate a graph between InputSections with the provided
102 CallGraphSort::CallGraphSort() {
103 llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
104 uint64_t> &Profile = Config->CallGraphProfile;
105 DenseMap<const InputSectionBase *, int> SecToCluster;
107 auto GetOrCreateNode = [&](const InputSectionBase *IS) -> int {
108 auto Res = SecToCluster.insert(std::make_pair(IS, Clusters.size()));
110 Sections.push_back(IS);
111 Clusters.emplace_back(Clusters.size(), IS->getSize());
113 return Res.first->second;
117 for (const auto &C : Profile) {
118 const auto *FromSB = cast<InputSectionBase>(C.first.first->Repl);
119 const auto *ToSB = cast<InputSectionBase>(C.first.second->Repl);
120 uint64_t Weight = C.second;
122 // Ignore edges between input sections belonging to different output
123 // sections. This is done because otherwise we would end up with clusters
124 // containing input sections that can't actually be placed adjacently in the
125 // output. This messes with the cluster size and density calculations. We
126 // would also end up moving input sections in other output sections without
127 // moving them closer to what calls them.
128 if (FromSB->getOutputSection() != ToSB->getOutputSection())
131 int From = GetOrCreateNode(FromSB);
132 int To = GetOrCreateNode(ToSB);
134 Clusters[To].Weight += Weight;
140 Clusters[To].Preds.push_back({From, Weight});
142 for (Cluster &C : Clusters)
143 C.InitialWeight = C.Weight;
146 // It's bad to merge clusters which would degrade the density too much.
147 static bool isNewDensityBad(Cluster &A, Cluster &B) {
148 double NewDensity = double(A.Weight + B.Weight) / double(A.Size + B.Size);
149 if (NewDensity < A.getDensity() / MAX_DENSITY_DEGRADATION)
154 static void mergeClusters(Cluster &Into, Cluster &From) {
155 Into.Sections.insert(Into.Sections.end(), From.Sections.begin(),
156 From.Sections.end());
157 Into.Size += From.Size;
158 Into.Weight += From.Weight;
159 From.Sections.clear();
164 // Group InputSections into clusters using the Call-Chain Clustering heuristic
165 // then sort the clusters by density.
166 void CallGraphSort::groupClusters() {
167 std::vector<int> SortedSecs(Clusters.size());
168 std::vector<Cluster *> SecToCluster(Clusters.size());
170 for (int SI = 0, SE = Clusters.size(); SI != SE; ++SI) {
172 SecToCluster[SI] = &Clusters[SI];
175 std::stable_sort(SortedSecs.begin(), SortedSecs.end(), [&](int A, int B) {
176 return Clusters[B].getDensity() < Clusters[A].getDensity();
179 for (int SI : SortedSecs) {
180 // Clusters[SI] is the same as SecToClusters[SI] here because it has not
181 // been merged into another cluster yet.
182 Cluster &C = Clusters[SI];
185 uint64_t BestWeight = 0;
187 for (Edge &E : C.Preds) {
188 if (BestPred == -1 || E.Weight > BestWeight) {
190 BestWeight = E.Weight;
194 // don't consider merging if the edge is unlikely.
195 if (BestWeight * 10 <= C.InitialWeight)
198 Cluster *PredC = SecToCluster[BestPred];
202 if (C.Size + PredC->Size > MAX_CLUSTER_SIZE)
205 if (isNewDensityBad(*PredC, C))
208 // NOTE: Consider using a disjoint-set to track section -> cluster mapping
209 // if this is ever slow.
210 for (int SI : C.Sections)
211 SecToCluster[SI] = PredC;
213 mergeClusters(*PredC, C);
216 // Remove empty or dead nodes. Invalidates all cluster indices.
217 llvm::erase_if(Clusters, [](const Cluster &C) {
218 return C.Size == 0 || C.Sections.empty();
222 std::stable_sort(Clusters.begin(), Clusters.end(),
223 [](const Cluster &A, const Cluster &B) {
224 return A.getDensity() > B.getDensity();
228 DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
232 llvm::DenseMap<const InputSectionBase *, int> OrderMap;
233 ssize_t CurOrder = 1;
235 for (const Cluster &C : Clusters)
236 for (int SecIndex : C.Sections)
237 OrderMap[Sections[SecIndex]] = CurOrder++;
242 // Sort sections by the profile data provided by -callgraph-profile-file
244 // This first builds a call graph based on the profile data then merges sections
245 // according to the C³ huristic. All clusters are then sorted by a density
246 // metric to further improve locality.
247 DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
248 return CallGraphSort().run();