contrib/llvm/tools/lld/ELF/CallGraphSort.cpp

   1 //===- CallGraphSort.cpp --------------------------------------------------===//
   2 //
   3 //                             The LLVM Linker
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 ///
  10 /// Implementation of Call-Chain Clustering from: Optimizing Function Placement
  11 /// for Large-Scale Data-Center Applications
  12 /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
  13 ///
  14 /// The goal of this algorithm is to improve runtime performance of the final
  15 /// executable by arranging code sections such that page table and i-cache
  16 /// misses are minimized.
  17 ///
  18 /// Definitions:
  19 /// * Cluster
  20 ///   * An ordered list of input sections which are layed out as a unit. At the
  21 ///     beginning of the algorithm each input section has its own cluster and
  22 ///     the weight of the cluster is the sum of the weight of all incomming
  23 ///     edges.
  24 /// * Call-Chain Clustering (C³) Heuristic
  25 ///   * Defines when and how clusters are combined. Pick the highest weighted
  26 ///     input section then add it to its most likely predecessor if it wouldn't
  27 ///     penalize it too much.
  28 /// * Density
  29 ///   * The weight of the cluster divided by the size of the cluster. This is a
  30 ///     proxy for the ammount of execution time spent per byte of the cluster.
  31 ///
  32 /// It does so given a call graph profile by the following:
  33 /// * Build a weighted call graph from the call graph profile
  34 /// * Sort input sections by weight
  35 /// * For each input section starting with the highest weight
  36 ///   * Find its most likely predecessor cluster
  37 ///   * Check if the combined cluster would be too large, or would have too low
  38 ///     a density.
  39 ///   * If not, then combine the clusters.
  40 /// * Sort non-empty clusters by density
  41 ///
  42 //===----------------------------------------------------------------------===//
  43
  44 #include "CallGraphSort.h"
  45 #include "OutputSections.h"
  46 #include "SymbolTable.h"
  47 #include "Symbols.h"
  48
  49 using namespace llvm;
  50 using namespace lld;
  51 using namespace lld::elf;
  52
  53 namespace {
  54 struct Edge {
  55   int From;
  56   uint64_t Weight;
  57 };
  58
  59 struct Cluster {
  60   Cluster(int Sec, size_t S) : Sections{Sec}, Size(S) {}
  61
  62   double getDensity() const {
  63     if (Size == 0)
  64       return 0;
  65     return double(Weight) / double(Size);
  66   }
  67
  68   std::vector<int> Sections;
  69   size_t Size = 0;
  70   uint64_t Weight = 0;
  71   uint64_t InitialWeight = 0;
  72   Edge BestPred = {-1, 0};
  73 };
  74
  75 class CallGraphSort {
  76 public:
  77   CallGraphSort();
  78
  79   DenseMap<const InputSectionBase *, int> run();
  80
  81 private:
  82   std::vector<Cluster> Clusters;
  83   std::vector<const InputSectionBase *> Sections;
  84
  85   void groupClusters();
  86 };
  87
  88 // Maximum ammount the combined cluster density can be worse than the original
  89 // cluster to consider merging.
  90 constexpr int MAX_DENSITY_DEGRADATION = 8;
  91
  92 // Maximum cluster size in bytes.
  93 constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
  94 } // end anonymous namespace
  95
  96 typedef std::pair<const InputSectionBase *, const InputSectionBase *>
  97     SectionPair;
  98
  99 // Take the edge list in Config->CallGraphProfile, resolve symbol names to
 100 // Symbols, and generate a graph between InputSections with the provided
 101 // weights.
 102 CallGraphSort::CallGraphSort() {
 103   MapVector<SectionPair, uint64_t> &Profile = Config->CallGraphProfile;
 104   DenseMap<const InputSectionBase *, int> SecToCluster;
 105
 106   auto GetOrCreateNode = [&](const InputSectionBase *IS) -> int {
 107     auto Res = SecToCluster.insert(std::make_pair(IS, Clusters.size()));
 108     if (Res.second) {
 109       Sections.push_back(IS);
 110       Clusters.emplace_back(Clusters.size(), IS->getSize());
 111     }
 112     return Res.first->second;
 113   };
 114
 115   // Create the graph.
 116   for (std::pair<SectionPair, uint64_t> &C : Profile) {
 117     const auto *FromSB = cast<InputSectionBase>(C.first.first->Repl);
 118     const auto *ToSB = cast<InputSectionBase>(C.first.second->Repl);
 119     uint64_t Weight = C.second;
 120
 121     // Ignore edges between input sections belonging to different output
 122     // sections.  This is done because otherwise we would end up with clusters
 123     // containing input sections that can't actually be placed adjacently in the
 124     // output.  This messes with the cluster size and density calculations.  We
 125     // would also end up moving input sections in other output sections without
 126     // moving them closer to what calls them.
 127     if (FromSB->getOutputSection() != ToSB->getOutputSection())
 128       continue;
 129
 130     int From = GetOrCreateNode(FromSB);
 131     int To = GetOrCreateNode(ToSB);
 132
 133     Clusters[To].Weight += Weight;
 134
 135     if (From == To)
 136       continue;
 137
 138     // Remember the best edge.
 139     Cluster &ToC = Clusters[To];
 140     if (ToC.BestPred.From == -1 || ToC.BestPred.Weight < Weight) {
 141       ToC.BestPred.From = From;
 142       ToC.BestPred.Weight = Weight;
 143     }
 144   }
 145   for (Cluster &C : Clusters)
 146     C.InitialWeight = C.Weight;
 147 }
 148
 149 // It's bad to merge clusters which would degrade the density too much.
 150 static bool isNewDensityBad(Cluster &A, Cluster &B) {
 151   double NewDensity = double(A.Weight + B.Weight) / double(A.Size + B.Size);
 152   return NewDensity < A.getDensity() / MAX_DENSITY_DEGRADATION;
 153 }
 154
 155 static void mergeClusters(Cluster &Into, Cluster &From) {
 156   Into.Sections.insert(Into.Sections.end(), From.Sections.begin(),
 157                        From.Sections.end());
 158   Into.Size += From.Size;
 159   Into.Weight += From.Weight;
 160   From.Sections.clear();
 161   From.Size = 0;
 162   From.Weight = 0;
 163 }
 164
 165 // Group InputSections into clusters using the Call-Chain Clustering heuristic
 166 // then sort the clusters by density.
 167 void CallGraphSort::groupClusters() {
 168   std::vector<int> SortedSecs(Clusters.size());
 169   std::vector<Cluster *> SecToCluster(Clusters.size());
 170
 171   for (size_t I = 0; I < Clusters.size(); ++I) {
 172     SortedSecs[I] = I;
 173     SecToCluster[I] = &Clusters[I];
 174   }
 175
 176   std::stable_sort(SortedSecs.begin(), SortedSecs.end(), [&](int A, int B) {
 177     return Clusters[B].getDensity() < Clusters[A].getDensity();
 178   });
 179
 180   for (int SI : SortedSecs) {
 181     // Clusters[SI] is the same as SecToClusters[SI] here because it has not
 182     // been merged into another cluster yet.
 183     Cluster &C = Clusters[SI];
 184
 185     // Don't consider merging if the edge is unlikely.
 186     if (C.BestPred.From == -1 || C.BestPred.Weight * 10 <= C.InitialWeight)
 187       continue;
 188
 189     Cluster *PredC = SecToCluster[C.BestPred.From];
 190     if (PredC == &C)
 191       continue;
 192
 193     if (C.Size + PredC->Size > MAX_CLUSTER_SIZE)
 194       continue;
 195
 196     if (isNewDensityBad(*PredC, C))
 197       continue;
 198
 199     // NOTE: Consider using a disjoint-set to track section -> cluster mapping
 200     // if this is ever slow.
 201     for (int SI : C.Sections)
 202       SecToCluster[SI] = PredC;
 203
 204     mergeClusters(*PredC, C);
 205   }
 206
 207   // Remove empty or dead nodes. Invalidates all cluster indices.
 208   llvm::erase_if(Clusters, [](const Cluster &C) {
 209     return C.Size == 0 || C.Sections.empty();
 210   });
 211
 212   // Sort by density.
 213   std::stable_sort(Clusters.begin(), Clusters.end(),
 214                    [](const Cluster &A, const Cluster &B) {
 215                      return A.getDensity() > B.getDensity();
 216                    });
 217 }
 218
 219 DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
 220   groupClusters();
 221
 222   // Generate order.
 223   DenseMap<const InputSectionBase *, int> OrderMap;
 224   ssize_t CurOrder = 1;
 225
 226   for (const Cluster &C : Clusters)
 227     for (int SecIndex : C.Sections)
 228       OrderMap[Sections[SecIndex]] = CurOrder++;
 229
 230   return OrderMap;
 231 }
 232
 233 // Sort sections by the profile data provided by -callgraph-profile-file
 234 //
 235 // This first builds a call graph based on the profile data then merges sections
 236 // according to the C³ huristic. All clusters are then sorted by a density
 237 // metric to further improve locality.
 238 DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
 239   return CallGraphSort().run();
 240 }