1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This pass performs partial inlining, typically by inlining an if statement
11 // that surrounds the body of the function.
13 //===----------------------------------------------------------------------===//
15 #include "llvm/Transforms/IPO/PartialInlining.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/Analysis/BlockFrequencyInfo.h"
18 #include "llvm/Analysis/BranchProbabilityInfo.h"
19 #include "llvm/Analysis/CodeMetrics.h"
20 #include "llvm/Analysis/InlineCost.h"
21 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
23 #include "llvm/Analysis/ProfileSummaryInfo.h"
24 #include "llvm/Analysis/TargetLibraryInfo.h"
25 #include "llvm/Analysis/TargetTransformInfo.h"
26 #include "llvm/IR/CFG.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/Module.h"
31 #include "llvm/Pass.h"
32 #include "llvm/Transforms/IPO.h"
33 #include "llvm/Transforms/Utils/Cloning.h"
34 #include "llvm/Transforms/Utils/CodeExtractor.h"
37 #define DEBUG_TYPE "partial-inlining"
39 STATISTIC(NumPartialInlined,
40 "Number of callsites functions partially inlined into.");
42 // Command line option to disable partial-inlining. The default is false:
44 DisablePartialInlining("disable-partial-inlining", cl::init(false),
45 cl::Hidden, cl::desc("Disable partial ininling"));
46 // This is an option used by testing:
47 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
48 cl::init(false), cl::ZeroOrMore,
50 cl::desc("Skip Cost Analysis"));
52 static cl::opt<unsigned> MaxNumInlineBlocks(
53 "max-num-inline-blocks", cl::init(5), cl::Hidden,
54 cl::desc("Max Number of Blocks To be Partially Inlined"));
56 // Command line option to set the maximum number of partial inlining allowed
57 // for the module. The default value of -1 means no limit.
58 static cl::opt<int> MaxNumPartialInlining(
59 "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
60 cl::desc("Max number of partial inlining. The default is unlimited"));
62 // Used only when PGO or user annotated branch data is absent. It is
63 // the least value that is used to weigh the outline region. If BFI
64 // produces larger value, the BFI value will be used.
66 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
67 cl::Hidden, cl::ZeroOrMore,
68 cl::desc("Relative frequency of outline region to "
71 static cl::opt<unsigned> ExtraOutliningPenalty(
72 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
73 cl::desc("A debug option to add additional penalty to the computed one."));
77 struct FunctionOutliningInfo {
78 FunctionOutliningInfo()
79 : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr),
81 // Returns the number of blocks to be inlined including all blocks
82 // in Entries and one return block.
83 unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
85 // A set of blocks including the function entry that guard
86 // the region to be outlined.
87 SmallVector<BasicBlock *, 4> Entries;
88 // The return block that is not included in the outlined region.
89 BasicBlock *ReturnBlock;
90 // The dominating block of the region to be outlined.
91 BasicBlock *NonReturnBlock;
92 // The set of blocks in Entries that that are predecessors to ReturnBlock
93 SmallVector<BasicBlock *, 4> ReturnBlockPreds;
96 struct PartialInlinerImpl {
98 std::function<AssumptionCache &(Function &)> *GetAC,
99 std::function<TargetTransformInfo &(Function &)> *GTTI,
100 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
101 ProfileSummaryInfo *ProfSI)
102 : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
104 Function *unswitchFunction(Function *F);
107 int NumPartialInlining = 0;
108 std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
109 std::function<TargetTransformInfo &(Function &)> *GetTTI;
110 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
111 ProfileSummaryInfo *PSI;
113 // Return the frequency of the OutlininingBB relative to F's entry point.
114 // The result is no larger than 1 and is represented using BP.
115 // (Note that the outlined region's 'head' block can only have incoming
116 // edges from the guarding entry blocks).
117 BranchProbability getOutliningCallBBRelativeFreq(Function *F,
118 FunctionOutliningInfo *OI,
119 Function *DuplicateFunction,
120 BlockFrequencyInfo *BFI,
121 BasicBlock *OutliningCallBB);
123 // Return true if the callee of CS should be partially inlined with
125 bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
126 BlockFrequencyInfo *CalleeBFI,
127 BasicBlock *OutliningCallBB,
128 int OutliningCallOverhead,
129 OptimizationRemarkEmitter &ORE);
131 // Try to inline DuplicateFunction (cloned from F with call to
132 // the OutlinedFunction into its callers. Return true
133 // if there is any successful inlining.
134 bool tryPartialInline(Function *DuplicateFunction,
135 Function *F, /*orignal function */
136 FunctionOutliningInfo *OI, Function *OutlinedFunction,
137 BlockFrequencyInfo *CalleeBFI);
139 // Compute the mapping from use site of DuplicationFunction to the enclosing
140 // BB's profile count.
141 void computeCallsiteToProfCountMap(Function *DuplicateFunction,
142 DenseMap<User *, uint64_t> &SiteCountMap);
144 bool IsLimitReached() {
145 return (MaxNumPartialInlining != -1 &&
146 NumPartialInlining >= MaxNumPartialInlining);
149 CallSite getCallSite(User *U) {
151 if (CallInst *CI = dyn_cast<CallInst>(U))
153 else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
156 llvm_unreachable("All uses must be calls");
160 CallSite getOneCallSiteTo(Function *F) {
161 User *User = *F->user_begin();
162 return getCallSite(User);
165 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
166 CallSite CS = getOneCallSiteTo(F);
167 DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
168 BasicBlock *Block = CS.getParent();
169 return std::make_tuple(DLoc, Block);
172 // Returns the costs associated with function outlining:
173 // - The first value is the non-weighted runtime cost for making the call
174 // to the outlined function 'OutlinedFunction', including the addtional
175 // setup cost in the outlined function itself;
176 // - The second value is the estimated size of the new call sequence in
177 // basic block 'OutliningCallBB';
178 // - The third value is the estimated size of the original code from
179 // function 'F' that is extracted into the outlined function.
180 std::tuple<int, int, int>
181 computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
182 Function *OutlinedFunction,
183 BasicBlock *OutliningCallBB);
184 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
185 // approximate both the size and runtime cost (Note that in the current
186 // inline cost analysis, there is no clear distinction there either).
187 int computeBBInlineCost(BasicBlock *BB);
189 std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
193 struct PartialInlinerLegacyPass : public ModulePass {
194 static char ID; // Pass identification, replacement for typeid
195 PartialInlinerLegacyPass() : ModulePass(ID) {
196 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
199 void getAnalysisUsage(AnalysisUsage &AU) const override {
200 AU.addRequired<AssumptionCacheTracker>();
201 AU.addRequired<ProfileSummaryInfoWrapperPass>();
202 AU.addRequired<TargetTransformInfoWrapperPass>();
204 bool runOnModule(Module &M) override {
208 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
209 TargetTransformInfoWrapperPass *TTIWP =
210 &getAnalysis<TargetTransformInfoWrapperPass>();
211 ProfileSummaryInfo *PSI =
212 getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
214 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
215 [&ACT](Function &F) -> AssumptionCache & {
216 return ACT->getAssumptionCache(F);
219 std::function<TargetTransformInfo &(Function &)> GetTTI =
220 [&TTIWP](Function &F) -> TargetTransformInfo & {
221 return TTIWP->getTTI(F);
224 return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
229 std::unique_ptr<FunctionOutliningInfo>
230 PartialInlinerImpl::computeOutliningInfo(Function *F) {
231 BasicBlock *EntryBlock = &F->front();
232 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
233 if (!BR || BR->isUnconditional())
234 return std::unique_ptr<FunctionOutliningInfo>();
236 // Returns true if Succ is BB's successor
237 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
238 return is_contained(successors(BB), Succ);
241 auto SuccSize = [](BasicBlock *BB) {
242 return std::distance(succ_begin(BB), succ_end(BB));
245 auto IsReturnBlock = [](BasicBlock *BB) {
246 TerminatorInst *TI = BB->getTerminator();
247 return isa<ReturnInst>(TI);
250 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
251 if (IsReturnBlock(Succ1))
252 return std::make_tuple(Succ1, Succ2);
253 if (IsReturnBlock(Succ2))
254 return std::make_tuple(Succ2, Succ1);
256 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
259 // Detect a triangular shape:
260 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
261 if (IsSuccessor(Succ1, Succ2))
262 return std::make_tuple(Succ1, Succ2);
263 if (IsSuccessor(Succ2, Succ1))
264 return std::make_tuple(Succ2, Succ1);
266 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
269 std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
270 llvm::make_unique<FunctionOutliningInfo>();
272 BasicBlock *CurrEntry = EntryBlock;
273 bool CandidateFound = false;
275 // The number of blocks to be inlined has already reached
276 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
277 // disables partial inlining for the function.
278 if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
281 if (SuccSize(CurrEntry) != 2)
284 BasicBlock *Succ1 = *succ_begin(CurrEntry);
285 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
287 BasicBlock *ReturnBlock, *NonReturnBlock;
288 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
291 OutliningInfo->Entries.push_back(CurrEntry);
292 OutliningInfo->ReturnBlock = ReturnBlock;
293 OutliningInfo->NonReturnBlock = NonReturnBlock;
294 CandidateFound = true;
298 BasicBlock *CommSucc;
299 BasicBlock *OtherSucc;
300 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
305 OutliningInfo->Entries.push_back(CurrEntry);
306 CurrEntry = OtherSucc;
311 return std::unique_ptr<FunctionOutliningInfo>();
313 // Do sanity check of the entries: threre should not
314 // be any successors (not in the entry set) other than
315 // {ReturnBlock, NonReturnBlock}
316 assert(OutliningInfo->Entries[0] == &F->front() &&
317 "Function Entry must be the first in Entries vector");
318 DenseSet<BasicBlock *> Entries;
319 for (BasicBlock *E : OutliningInfo->Entries)
322 // Returns true of BB has Predecessor which is not
324 auto HasNonEntryPred = [Entries](BasicBlock *BB) {
325 for (auto Pred : predecessors(BB)) {
326 if (!Entries.count(Pred))
331 auto CheckAndNormalizeCandidate =
332 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
333 for (BasicBlock *E : OutliningInfo->Entries) {
334 for (auto Succ : successors(E)) {
335 if (Entries.count(Succ))
337 if (Succ == OutliningInfo->ReturnBlock)
338 OutliningInfo->ReturnBlockPreds.push_back(E);
339 else if (Succ != OutliningInfo->NonReturnBlock)
342 // There should not be any outside incoming edges either:
343 if (HasNonEntryPred(E))
349 if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
350 return std::unique_ptr<FunctionOutliningInfo>();
352 // Now further growing the candidate's inlining region by
353 // peeling off dominating blocks from the outlining region:
354 while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
355 BasicBlock *Cand = OutliningInfo->NonReturnBlock;
356 if (SuccSize(Cand) != 2)
359 if (HasNonEntryPred(Cand))
362 BasicBlock *Succ1 = *succ_begin(Cand);
363 BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
365 BasicBlock *ReturnBlock, *NonReturnBlock;
366 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
367 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
370 if (NonReturnBlock->getSinglePredecessor() != Cand)
373 // Now grow and update OutlininigInfo:
374 OutliningInfo->Entries.push_back(Cand);
375 OutliningInfo->NonReturnBlock = NonReturnBlock;
376 OutliningInfo->ReturnBlockPreds.push_back(Cand);
377 Entries.insert(Cand);
380 return OutliningInfo;
383 // Check if there is PGO data or user annoated branch data:
384 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
385 if (F->getEntryCount())
387 // Now check if any of the entry block has MD_prof data:
388 for (auto *E : OI->Entries) {
389 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
390 if (!BR || BR->isUnconditional())
393 if (BR->extractProfMetadata(T, F))
399 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
400 Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
401 BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {
404 BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
405 auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);
407 auto OutlineRegionRelFreq =
408 BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
409 EntryFreq.getFrequency());
411 if (hasProfileData(F, OI))
412 return OutlineRegionRelFreq;
414 // When profile data is not available, we need to be conservative in
415 // estimating the overall savings. Static branch prediction can usually
416 // guess the branch direction right (taken/non-taken), but the guessed
417 // branch probability is usually not biased enough. In case when the
418 // outlined region is predicted to be likely, its probability needs
419 // to be made higher (more biased) to not under-estimate the cost of
420 // function outlining. On the other hand, if the outlined region
421 // is predicted to be less likely, the predicted probablity is usually
422 // higher than the actual. For instance, the actual probability of the
423 // less likely target is only 5%, but the guessed probablity can be
424 // 40%. In the latter case, there is no need for further adjustement.
425 // FIXME: add an option for this.
426 if (OutlineRegionRelFreq < BranchProbability(45, 100))
427 return OutlineRegionRelFreq;
429 OutlineRegionRelFreq = std::max(
430 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
432 return OutlineRegionRelFreq;
435 bool PartialInlinerImpl::shouldPartialInline(
436 CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
437 BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
438 int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
440 if (SkipCostAnalysis)
443 Instruction *Call = CS.getInstruction();
444 Function *Callee = CS.getCalledFunction();
445 Function *Caller = CS.getCaller();
446 auto &CalleeTTI = (*GetTTI)(*Callee);
447 InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
448 *GetAssumptionCache, GetBFI, PSI);
451 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
453 << " should always be fully inlined, not partially");
458 ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
459 << NV("Callee", F) << " not partially inlined into "
460 << NV("Caller", Caller)
461 << " because it should never be inlined (cost=never)");
466 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
467 << NV("Callee", F) << " not partially inlined into "
468 << NV("Caller", Caller) << " because too costly to inline (cost="
469 << NV("Cost", IC.getCost()) << ", threshold="
470 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
473 const DataLayout &DL = Caller->getParent()->getDataLayout();
474 // The savings of eliminating the call:
475 int NonWeightedSavings = getCallsiteCost(CS, DL);
476 BlockFrequency NormWeightedSavings(NonWeightedSavings);
479 getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
480 auto NormWeightedRcost =
481 BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
483 // Weighted saving is smaller than weighted cost, return false
484 if (NormWeightedSavings < NormWeightedRcost) {
486 OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
487 << NV("Callee", F) << " not partially inlined into "
488 << NV("Caller", Caller) << " runtime overhead (overhead="
489 << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
491 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
492 << " of making the outlined call is too high");
497 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
498 << NV("Callee", F) << " can be partially inlined into "
499 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
501 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
505 // TODO: Ideally we should share Inliner's InlineCost Analysis code.
506 // For now use a simplified version. The returned 'InlineCost' will be used
507 // to esimate the size cost as well as runtime cost of the BB.
508 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
510 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
511 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
512 if (isa<DbgInfoIntrinsic>(I))
515 switch (I->getOpcode()) {
516 case Instruction::BitCast:
517 case Instruction::PtrToInt:
518 case Instruction::IntToPtr:
519 case Instruction::Alloca:
521 case Instruction::GetElementPtr:
522 if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
528 IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
530 if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
531 IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
535 if (CallInst *CI = dyn_cast<CallInst>(I)) {
536 InlineCost += getCallsiteCost(CallSite(CI), DL);
540 if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
541 InlineCost += getCallsiteCost(CallSite(II), DL);
545 if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
546 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
549 InlineCost += InlineConstants::InstrCost;
554 std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
555 Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
556 BasicBlock *OutliningCallBB) {
557 // First compute the cost of the outlined region 'OI' in the original
559 // FIXME: The code extractor (outliner) can now do code sinking/hoisting
560 // to reduce outlining cost. The hoisted/sunk code currently do not
561 // incur any runtime cost so it is still OK to compare the outlined
562 // function cost with the outlined region in the original function.
563 // If this ever changes, we will need to introduce new extractor api
564 // to pass the information.
565 int OutlinedRegionCost = 0;
566 for (BasicBlock &BB : *F) {
567 if (&BB != OI->ReturnBlock &&
568 // Assuming Entry set is small -- do a linear search here:
569 std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
571 OutlinedRegionCost += computeBBInlineCost(&BB);
575 // Now compute the cost of the call sequence to the outlined function
576 // 'OutlinedFunction' in BB 'OutliningCallBB':
577 int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
579 // Now compute the cost of the extracted/outlined function itself:
580 int OutlinedFunctionCost = 0;
581 for (BasicBlock &BB : *OutlinedFunction) {
582 OutlinedFunctionCost += computeBBInlineCost(&BB);
585 assert(OutlinedFunctionCost >= OutlinedRegionCost &&
586 "Outlined function cost should be no less than the outlined region");
587 // The code extractor introduces a new root and exit stub blocks with
588 // additional unconditional branches. Those branches will be eliminated
589 // later with bb layout. The cost should be adjusted accordingly:
590 OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;
592 int OutliningRuntimeOverhead = OutliningFuncCallCost +
593 (OutlinedFunctionCost - OutlinedRegionCost) +
594 ExtraOutliningPenalty;
596 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
600 // Create the callsite to profile count map which is
601 // used to update the original function's entry count,
602 // after the function is partially inlined into the callsite.
603 void PartialInlinerImpl::computeCallsiteToProfCountMap(
604 Function *DuplicateFunction,
605 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
606 std::vector<User *> Users(DuplicateFunction->user_begin(),
607 DuplicateFunction->user_end());
608 Function *CurrentCaller = nullptr;
609 std::unique_ptr<BlockFrequencyInfo> TempBFI;
610 BlockFrequencyInfo *CurrentCallerBFI = nullptr;
612 auto ComputeCurrBFI = [&,this](Function *Caller) {
613 // For the old pass manager:
615 DominatorTree DT(*Caller);
617 BranchProbabilityInfo BPI(*Caller, LI);
618 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
619 CurrentCallerBFI = TempBFI.get();
622 CurrentCallerBFI = &(*GetBFI)(*Caller);
626 for (User *User : Users) {
627 CallSite CS = getCallSite(User);
628 Function *Caller = CS.getCaller();
629 if (CurrentCaller != Caller) {
630 CurrentCaller = Caller;
631 ComputeCurrBFI(Caller);
633 assert(CurrentCallerBFI && "CallerBFI is not set");
635 BasicBlock *CallBB = CS.getInstruction()->getParent();
636 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
638 CallSiteToProfCountMap[User] = *Count;
640 CallSiteToProfCountMap[User] = 0;
644 Function *PartialInlinerImpl::unswitchFunction(Function *F) {
646 if (F->hasAddressTaken())
649 // Let inliner handle it
650 if (F->hasFnAttribute(Attribute::AlwaysInline))
653 if (F->hasFnAttribute(Attribute::NoInline))
656 if (PSI->isFunctionEntryCold(F))
659 if (F->user_begin() == F->user_end())
662 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
667 // Clone the function, so that we can hack away on it.
668 ValueToValueMapTy VMap;
669 Function *DuplicateFunction = CloneFunction(F, VMap);
670 BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
671 BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
672 DenseSet<BasicBlock *> NewEntries;
673 for (BasicBlock *BB : OI->Entries) {
674 NewEntries.insert(cast<BasicBlock>(VMap[BB]));
677 // Go ahead and update all uses to the duplicate, so that we can just
678 // use the inliner functionality when we're done hacking.
679 F->replaceAllUsesWith(DuplicateFunction);
681 auto getFirstPHI = [](BasicBlock *BB) {
682 BasicBlock::iterator I = BB->begin();
683 PHINode *FirstPhi = nullptr;
684 while (I != BB->end()) {
685 PHINode *Phi = dyn_cast<PHINode>(I);
695 // Special hackery is needed with PHI nodes that have inputs from more than
696 // one extracted block. For simplicity, just split the PHIs into a two-level
697 // sequence of PHIs, some of which will go in the extracted region, and some
698 // of which will go outside.
699 BasicBlock *PreReturn = NewReturnBlock;
700 // only split block when necessary:
701 PHINode *FirstPhi = getFirstPHI(PreReturn);
702 unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
703 auto IsTrivialPhi = [](PHINode *PN) -> Value * {
704 Value *CommonValue = PN->getIncomingValue(0);
705 if (all_of(PN->incoming_values(),
706 [&](Value *V) { return V == CommonValue; }))
711 if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
713 NewReturnBlock = NewReturnBlock->splitBasicBlock(
714 NewReturnBlock->getFirstNonPHI()->getIterator());
715 BasicBlock::iterator I = PreReturn->begin();
716 Instruction *Ins = &NewReturnBlock->front();
717 SmallVector<Instruction *, 4> DeadPhis;
718 while (I != PreReturn->end()) {
719 PHINode *OldPhi = dyn_cast<PHINode>(I);
724 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
725 OldPhi->replaceAllUsesWith(RetPhi);
726 Ins = NewReturnBlock->getFirstNonPHI();
728 RetPhi->addIncoming(&*I, PreReturn);
729 for (BasicBlock *E : OI->ReturnBlockPreds) {
730 BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
731 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
732 OldPhi->removeIncomingValue(NewE);
735 // After incoming values splitting, the old phi may become trivial.
736 // Keeping the trivial phi can introduce definition inside the outline
737 // region which is live-out, causing necessary overhead (load, store
739 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
740 OldPhi->replaceAllUsesWith(OldPhiVal);
741 DeadPhis.push_back(OldPhi);
747 for (auto *DP : DeadPhis)
748 DP->eraseFromParent();
750 for (auto E : OI->ReturnBlockPreds) {
751 BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
752 NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
756 // Returns true if the block is to be partial inlined into the caller
757 // (i.e. not to be extracted to the out of line function)
758 auto ToBeInlined = [&](BasicBlock *BB) {
759 return BB == NewReturnBlock || NewEntries.count(BB);
761 // Gather up the blocks that we're going to extract.
762 std::vector<BasicBlock *> ToExtract;
763 ToExtract.push_back(NewNonReturnBlock);
764 for (BasicBlock &BB : *DuplicateFunction)
765 if (!ToBeInlined(&BB) && &BB != NewNonReturnBlock)
766 ToExtract.push_back(&BB);
768 // The CodeExtractor needs a dominator tree.
770 DT.recalculate(*DuplicateFunction);
772 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
774 BranchProbabilityInfo BPI(*DuplicateFunction, LI);
775 BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
777 // Extract the body of the if.
778 Function *OutlinedFunction =
779 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
780 .extractCodeRegion();
783 tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
785 // Ditch the duplicate, since we're done with it, and rewrite all remaining
786 // users (function pointers, etc.) back to the original function.
787 DuplicateFunction->replaceAllUsesWith(F);
788 DuplicateFunction->eraseFromParent();
791 return OutlinedFunction;
793 // Remove the function that is speculatively created:
794 if (OutlinedFunction)
795 OutlinedFunction->eraseFromParent();
800 bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
802 FunctionOutliningInfo *OI,
803 Function *OutlinedFunction,
804 BlockFrequencyInfo *CalleeBFI) {
805 if (OutlinedFunction == nullptr)
808 int NonWeightedRcost;
810 int OutlinedRegionSizeCost;
812 auto OutliningCallBB =
813 getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
815 std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
816 computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
818 // The call sequence to the outlined function is larger than the original
819 // outlined region size, it does not increase the chances of inlining
820 // 'F' with outlining (The inliner usies the size increase to model the
821 // the cost of inlining a callee).
822 if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
823 OptimizationRemarkEmitter ORE(F);
826 std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
827 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
829 << ore::NV("Function", F)
830 << " not partially inlined into callers (Original Size = "
831 << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
832 << ", Size of call sequence to outlined function = "
833 << ore::NV("NewSize", SizeCost) << ")");
837 assert(F->user_begin() == F->user_end() &&
838 "F's users should all be replaced!");
839 std::vector<User *> Users(DuplicateFunction->user_begin(),
840 DuplicateFunction->user_end());
842 DenseMap<User *, uint64_t> CallSiteToProfCountMap;
843 if (F->getEntryCount())
844 computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
846 auto CalleeEntryCount = F->getEntryCount();
847 uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
848 bool AnyInline = false;
849 for (User *User : Users) {
850 CallSite CS = getCallSite(User);
852 if (IsLimitReached())
855 OptimizationRemarkEmitter ORE(CS.getCaller());
857 if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
858 NonWeightedRcost, ORE))
862 OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
863 << ore::NV("Callee", F) << " partially inlined into "
864 << ore::NV("Caller", CS.getCaller()));
866 InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
867 InlineFunction(CS, IFI);
869 // Now update the entry count:
870 if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
871 uint64_t CallSiteCount = CallSiteToProfCountMap[User];
872 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
876 NumPartialInlining++;
881 if (AnyInline && CalleeEntryCount)
882 F->setEntryCount(CalleeEntryCountV);
887 bool PartialInlinerImpl::run(Module &M) {
888 if (DisablePartialInlining)
891 std::vector<Function *> Worklist;
892 Worklist.reserve(M.size());
893 for (Function &F : M)
894 if (!F.use_empty() && !F.isDeclaration())
895 Worklist.push_back(&F);
897 bool Changed = false;
898 while (!Worklist.empty()) {
899 Function *CurrFunc = Worklist.back();
902 if (CurrFunc->use_empty())
905 bool Recursive = false;
906 for (User *U : CurrFunc->users())
907 if (Instruction *I = dyn_cast<Instruction>(U))
908 if (I->getParent()->getParent() == CurrFunc) {
915 if (Function *NewFunc = unswitchFunction(CurrFunc)) {
916 Worklist.push_back(NewFunc);
924 char PartialInlinerLegacyPass::ID = 0;
925 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
926 "Partial Inliner", false, false)
927 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
928 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
929 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
930 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
931 "Partial Inliner", false, false)
933 ModulePass *llvm::createPartialInliningPass() {
934 return new PartialInlinerLegacyPass();
937 PreservedAnalyses PartialInlinerPass::run(Module &M,
938 ModuleAnalysisManager &AM) {
939 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
941 std::function<AssumptionCache &(Function &)> GetAssumptionCache =
942 [&FAM](Function &F) -> AssumptionCache & {
943 return FAM.getResult<AssumptionAnalysis>(F);
946 std::function<BlockFrequencyInfo &(Function &)> GetBFI =
947 [&FAM](Function &F) -> BlockFrequencyInfo & {
948 return FAM.getResult<BlockFrequencyAnalysis>(F);
951 std::function<TargetTransformInfo &(Function &)> GetTTI =
952 [&FAM](Function &F) -> TargetTransformInfo & {
953 return FAM.getResult<TargetIRAnalysis>(F);
956 ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
958 if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
959 return PreservedAnalyses::none();
960 return PreservedAnalyses::all();