-
Notifications
You must be signed in to change notification settings - Fork 12.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ELF] Add profile guided section layout
This adds profile guided layout using the Call-Chain Clustering (C³) heuristic from https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf . RFC: [llvm-dev] [RFC] Profile guided section layout http://lists.llvm.org/pipermail/llvm-dev/2017-June/114178.html Pass `--call-graph-ordering-file <file>` to read a call graph profile where each line has the format: <from symbol> <to symbol> <call count> Differential Revision: https://reviews.llvm.org/D36351 llvm-svn: 330234
- Loading branch information
Showing
13 changed files
with
693 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,251 @@ | ||
//===- CallGraphSort.cpp --------------------------------------------------===// | ||
// | ||
// The LLVM Linker | ||
// | ||
// This file is distributed under the University of Illinois Open Source | ||
// License. See LICENSE.TXT for details. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
/// | ||
/// Implementation of Call-Chain Clustering from: Optimizing Function Placement | ||
/// for Large-Scale Data-Center Applications | ||
/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf | ||
/// | ||
/// The goal of this algorithm is to improve runtime performance of the final | ||
/// executable by arranging code sections such that page table and i-cache | ||
/// misses are minimized. | ||
/// | ||
/// Definitions: | ||
/// * Cluster | ||
/// * An ordered list of input sections which are layed out as a unit. At the | ||
/// beginning of the algorithm each input section has its own cluster and | ||
/// the weight of the cluster is the sum of the weight of all incomming | ||
/// edges. | ||
/// * Call-Chain Clustering (C³) Heuristic | ||
/// * Defines when and how clusters are combined. Pick the highest weighted | ||
/// input section then add it to its most likely predecessor if it wouldn't | ||
/// penalize it too much. | ||
/// * Density | ||
/// * The weight of the cluster divided by the size of the cluster. This is a | ||
/// proxy for the ammount of execution time spent per byte of the cluster. | ||
/// | ||
/// It does so given a call graph profile by the following: | ||
/// * Build a weighted call graph from the call graph profile | ||
/// * Sort input sections by weight | ||
/// * For each input section starting with the highest weight | ||
/// * Find its most likely predecessor cluster | ||
/// * Check if the combined cluster would be too large, or would have too low | ||
/// a density. | ||
/// * If not, then combine the clusters. | ||
/// * Sort non-empty clusters by density | ||
/// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#include "CallGraphSort.h" | ||
#include "OutputSections.h" | ||
#include "SymbolTable.h" | ||
#include "Symbols.h" | ||
|
||
using namespace llvm; | ||
using namespace lld; | ||
using namespace lld::elf; | ||
|
||
namespace { | ||
struct Edge { | ||
int From; | ||
uint64_t Weight; | ||
}; | ||
|
||
struct Cluster { | ||
Cluster(int Sec, size_t S) { | ||
Sections.push_back(Sec); | ||
Size = S; | ||
} | ||
|
||
double getDensity() const { | ||
if (Size == 0) | ||
return 0; | ||
return double(Weight) / double(Size); | ||
} | ||
|
||
std::vector<int> Sections; | ||
size_t Size = 0; | ||
uint64_t Weight = 0; | ||
uint64_t InitialWeight = 0; | ||
std::vector<Edge> Preds; | ||
}; | ||
|
||
class CallGraphSort { | ||
public: | ||
CallGraphSort(); | ||
|
||
DenseMap<const InputSectionBase *, int> run(); | ||
|
||
private: | ||
std::vector<Cluster> Clusters; | ||
std::vector<const InputSectionBase *> Sections; | ||
|
||
void groupClusters(); | ||
}; | ||
|
||
// Maximum ammount the combined cluster density can be worse than the original | ||
// cluster to consider merging. | ||
constexpr int MAX_DENSITY_DEGRADATION = 8; | ||
|
||
// Maximum cluster size in bytes. | ||
constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024; | ||
} // end anonymous namespace | ||
|
||
// Take the edge list in Config->CallGraphProfile, resolve symbol names to | ||
// Symbols, and generate a graph between InputSections with the provided | ||
// weights. | ||
CallGraphSort::CallGraphSort() { | ||
llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>, | ||
uint64_t> &Profile = Config->CallGraphProfile; | ||
DenseMap<const InputSectionBase *, int> SecToCluster; | ||
|
||
auto GetOrCreateNode = [&](const InputSectionBase *IS) -> int { | ||
auto Res = SecToCluster.insert(std::make_pair(IS, Clusters.size())); | ||
if (Res.second) { | ||
Sections.push_back(IS); | ||
Clusters.emplace_back(Clusters.size(), IS->getSize()); | ||
} | ||
return Res.first->second; | ||
}; | ||
|
||
// Create the graph. | ||
for (const auto &C : Profile) { | ||
const auto *FromSB = cast<InputSectionBase>(C.first.first->Repl); | ||
const auto *ToSB = cast<InputSectionBase>(C.first.second->Repl); | ||
uint64_t Weight = C.second; | ||
|
||
// Ignore edges between input sections belonging to different output | ||
// sections. This is done because otherwise we would end up with clusters | ||
// containing input sections that can't actually be placed adjacently in the | ||
// output. This messes with the cluster size and density calculations. We | ||
// would also end up moving input sections in other output sections without | ||
// moving them closer to what calls them. | ||
if (FromSB->getOutputSection() != ToSB->getOutputSection()) | ||
continue; | ||
|
||
int From = GetOrCreateNode(FromSB); | ||
int To = GetOrCreateNode(ToSB); | ||
|
||
Clusters[To].Weight += Weight; | ||
|
||
if (From == To) | ||
continue; | ||
|
||
// Add an edge | ||
Clusters[To].Preds.push_back({From, Weight}); | ||
} | ||
for (Cluster &C : Clusters) | ||
C.InitialWeight = C.Weight; | ||
} | ||
|
||
// It's bad to merge clusters which would degrade the density too much. | ||
static bool isNewDensityBad(Cluster &A, Cluster &B) { | ||
double NewDensity = double(A.Weight + B.Weight) / double(A.Size + B.Size); | ||
if (NewDensity < A.getDensity() / MAX_DENSITY_DEGRADATION) | ||
return true; | ||
return false; | ||
} | ||
|
||
static void mergeClusters(Cluster &Into, Cluster &From) { | ||
Into.Sections.insert(Into.Sections.end(), From.Sections.begin(), | ||
From.Sections.end()); | ||
Into.Size += From.Size; | ||
Into.Weight += From.Weight; | ||
From.Sections.clear(); | ||
From.Size = 0; | ||
From.Weight = 0; | ||
} | ||
|
||
// Group InputSections into clusters using the Call-Chain Clustering heuristic | ||
// then sort the clusters by density. | ||
void CallGraphSort::groupClusters() { | ||
std::vector<int> SortedSecs(Clusters.size()); | ||
std::vector<Cluster *> SecToCluster(Clusters.size()); | ||
|
||
for (int SI = 0, SE = Clusters.size(); SI != SE; ++SI) { | ||
SortedSecs[SI] = SI; | ||
SecToCluster[SI] = &Clusters[SI]; | ||
} | ||
|
||
std::stable_sort(SortedSecs.begin(), SortedSecs.end(), [&](int A, int B) { | ||
return Clusters[B].getDensity() < Clusters[A].getDensity(); | ||
}); | ||
|
||
for (int SI : SortedSecs) { | ||
// Clusters[SI] is the same as SecToClusters[SI] here because it has not | ||
// been merged into another cluster yet. | ||
Cluster &C = Clusters[SI]; | ||
|
||
int BestPred = -1; | ||
uint64_t BestWeight = 0; | ||
|
||
for (Edge &E : C.Preds) { | ||
if (BestPred == -1 || E.Weight > BestWeight) { | ||
BestPred = E.From; | ||
BestWeight = E.Weight; | ||
} | ||
} | ||
|
||
// don't consider merging if the edge is unlikely. | ||
if (BestWeight * 10 <= C.InitialWeight) | ||
continue; | ||
|
||
Cluster *PredC = SecToCluster[BestPred]; | ||
if (PredC == &C) | ||
continue; | ||
|
||
if (C.Size + PredC->Size > MAX_CLUSTER_SIZE) | ||
continue; | ||
|
||
if (isNewDensityBad(*PredC, C)) | ||
continue; | ||
|
||
// NOTE: Consider using a disjoint-set to track section -> cluster mapping | ||
// if this is ever slow. | ||
for (int SI : C.Sections) | ||
SecToCluster[SI] = PredC; | ||
|
||
mergeClusters(*PredC, C); | ||
} | ||
|
||
// Remove empty or dead nodes. Invalidates all cluster indices. | ||
Clusters.erase(std::remove_if(Clusters.begin(), Clusters.end(), | ||
[](const Cluster &C) { | ||
return C.Size == 0 || C.Sections.empty(); | ||
}), | ||
Clusters.end()); | ||
|
||
// Sort by density. | ||
std::sort(Clusters.begin(), Clusters.end(), | ||
[](const Cluster &A, const Cluster &B) { | ||
return A.getDensity() > B.getDensity(); | ||
}); | ||
} | ||
|
||
DenseMap<const InputSectionBase *, int> CallGraphSort::run() { | ||
groupClusters(); | ||
|
||
// Generate order. | ||
llvm::DenseMap<const InputSectionBase *, int> OrderMap; | ||
ssize_t CurOrder = 1; | ||
|
||
for (const Cluster &C : Clusters) | ||
for (int SecIndex : C.Sections) | ||
OrderMap[Sections[SecIndex]] = CurOrder++; | ||
|
||
return OrderMap; | ||
} | ||
|
||
// Sort sections by the profile data provided by -callgraph-profile-file | ||
// | ||
// This first builds a call graph based on the profile data then merges sections | ||
// according to the C³ huristic. All clusters are then sorted by a density | ||
// metric to further improve locality. | ||
DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() { | ||
return CallGraphSort().run(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
//===- CallGraphSort.h ------------------------------------------*- C++ -*-===// | ||
// | ||
// The LLVM Linker | ||
// | ||
// This file is distributed under the University of Illinois Open Source | ||
// License. See LICENSE.TXT for details. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef LLD_ELF_CALL_GRAPH_SORT_H | ||
#define LLD_ELF_CALL_GRAPH_SORT_H | ||
|
||
#include "llvm/ADT/DenseMap.h" | ||
|
||
namespace lld { | ||
namespace elf { | ||
class InputSectionBase; | ||
|
||
llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder(); | ||
} // namespace elf | ||
} // namespace lld | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.