//===- PartialInlining.cpp - Inline parts of functions --------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass performs partial inlining, typically by inlining an if statement // that surrounds the body of the function. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/PartialInlining.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CFG.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/CodeExtractor.h" using namespace llvm; #define DEBUG_TYPE "partial-inlining" STATISTIC(NumPartialInlined, "Number of callsites functions partially inlined into."); // Command line option to disable partial-inlining. The default is false: static cl::opt DisablePartialInlining("disable-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable partial ininling")); // This is an option used by testing: static cl::opt SkipCostAnalysis("skip-partial-inlining-cost-analysis", cl::init(false), cl::ZeroOrMore, cl::ReallyHidden, cl::desc("Skip Cost Analysis")); static cl::opt MaxNumInlineBlocks( "max-num-inline-blocks", cl::init(5), cl::Hidden, cl::desc("Max Number of Blocks To be Partially Inlined")); // Command line option to set the maximum number of partial inlining allowed // for the module. The default value of -1 means no limit. static cl::opt MaxNumPartialInlining( "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, cl::desc("Max number of partial inlining. The default is unlimited")); // Used only when PGO or user annotated branch data is absent. It is // the least value that is used to weigh the outline region. If BFI // produces larger value, the BFI value will be used. static cl::opt OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), cl::Hidden, cl::ZeroOrMore, cl::desc("Relative frequency of outline region to " "the entry block")); namespace { struct FunctionOutliningInfo { FunctionOutliningInfo() : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr), ReturnBlockPreds() {} // Returns the number of blocks to be inlined including all blocks // in Entries and one return block. unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; } // A set of blocks including the function entry that guard // the region to be outlined. SmallVector Entries; // The return block that is not included in the outlined region. BasicBlock *ReturnBlock; // The dominating block of the region ot be outlined. BasicBlock *NonReturnBlock; // The set of blocks in Entries that that are predecessors to ReturnBlock SmallVector ReturnBlockPreds; }; struct PartialInlinerImpl { PartialInlinerImpl( std::function *GetAC, std::function *GTTI, Optional> GBFI, ProfileSummaryInfo *ProfSI) : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {} bool run(Module &M); Function *unswitchFunction(Function *F); private: int NumPartialInlining = 0; std::function *GetAssumptionCache; std::function *GetTTI; Optional> GetBFI; ProfileSummaryInfo *PSI; // Return the frequency of the OutlininingBB relative to F's entry point. // The result is no larger than 1 and is represented using BP. // (Note that the outlined region's 'head' block can only have incoming // edges from the guarding entry blocks). BranchProbability getOutliningCallBBRelativeFreq(Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction, BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB); // Return true if the callee of CS should be partially inlined with // profit. bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI, BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB, int OutliningCallOverhead, OptimizationRemarkEmitter &ORE); // Try to inline DuplicateFunction (cloned from F with call to // the OutlinedFunction into its callers. Return true // if there is any successful inlining. bool tryPartialInline(Function *DuplicateFunction, Function *F, /*orignal function */ FunctionOutliningInfo *OI, Function *OutlinedFunction, BlockFrequencyInfo *CalleeBFI); // Compute the mapping from use site of DuplicationFunction to the enclosing // BB's profile count. void computeCallsiteToProfCountMap(Function *DuplicateFunction, DenseMap &SiteCountMap); bool IsLimitReached() { return (MaxNumPartialInlining != -1 && NumPartialInlining >= MaxNumPartialInlining); } CallSite getCallSite(User *U) { CallSite CS; if (CallInst *CI = dyn_cast(U)) CS = CallSite(CI); else if (InvokeInst *II = dyn_cast(U)) CS = CallSite(II); else llvm_unreachable("All uses must be calls"); return CS; } CallSite getOneCallSiteTo(Function *F) { User *User = *F->user_begin(); return getCallSite(User); } std::tuple getOneDebugLoc(Function *F) { CallSite CS = getOneCallSiteTo(F); DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); BasicBlock *Block = CS.getParent(); return std::make_tuple(DLoc, Block); } // Returns the costs associated with function outlining: // - The first value is the non-weighted runtime cost for making the call // to the outlined function 'OutlinedFunction', including the addtional // setup cost in the outlined function itself; // - The second value is the estimated size of the new call sequence in // basic block 'OutliningCallBB'; // - The third value is the estimated size of the original code from // function 'F' that is extracted into the outlined function. std::tuple computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo, Function *OutlinedFunction, BasicBlock *OutliningCallBB); // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to // approximate both the size and runtime cost (Note that in the current // inline cost analysis, there is no clear distinction there either). int computeBBInlineCost(BasicBlock *BB); std::unique_ptr computeOutliningInfo(Function *F); }; struct PartialInlinerLegacyPass : public ModulePass { static char ID; // Pass identification, replacement for typeid PartialInlinerLegacyPass() : ModulePass(ID) { initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); AU.addRequired(); } bool runOnModule(Module &M) override { if (skipModule(M)) return false; AssumptionCacheTracker *ACT = &getAnalysis(); TargetTransformInfoWrapperPass *TTIWP = &getAnalysis(); ProfileSummaryInfo *PSI = getAnalysis().getPSI(); std::function GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; std::function GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & { return TTIWP->getTTI(F); }; return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M); } }; } std::unique_ptr PartialInlinerImpl::computeOutliningInfo(Function *F) { BasicBlock *EntryBlock = &F->front(); BranchInst *BR = dyn_cast(EntryBlock->getTerminator()); if (!BR || BR->isUnconditional()) return std::unique_ptr(); // Returns true if Succ is BB's successor auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { return is_contained(successors(BB), Succ); }; auto SuccSize = [](BasicBlock *BB) { return std::distance(succ_begin(BB), succ_end(BB)); }; auto IsReturnBlock = [](BasicBlock *BB) { TerminatorInst *TI = BB->getTerminator(); return isa(TI); }; auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { if (IsReturnBlock(Succ1)) return std::make_tuple(Succ1, Succ2); if (IsReturnBlock(Succ2)) return std::make_tuple(Succ2, Succ1); return std::make_tuple(nullptr, nullptr); }; // Detect a triangular shape: auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { if (IsSuccessor(Succ1, Succ2)) return std::make_tuple(Succ1, Succ2); if (IsSuccessor(Succ2, Succ1)) return std::make_tuple(Succ2, Succ1); return std::make_tuple(nullptr, nullptr); }; std::unique_ptr OutliningInfo = llvm::make_unique(); BasicBlock *CurrEntry = EntryBlock; bool CandidateFound = false; do { // The number of blocks to be inlined has already reached // the limit. When MaxNumInlineBlocks is set to 0 or 1, this // disables partial inlining for the function. if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks) break; if (SuccSize(CurrEntry) != 2) break; BasicBlock *Succ1 = *succ_begin(CurrEntry); BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1); BasicBlock *ReturnBlock, *NonReturnBlock; std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); if (ReturnBlock) { OutliningInfo->Entries.push_back(CurrEntry); OutliningInfo->ReturnBlock = ReturnBlock; OutliningInfo->NonReturnBlock = NonReturnBlock; CandidateFound = true; break; } BasicBlock *CommSucc; BasicBlock *OtherSucc; std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2); if (!CommSucc) break; OutliningInfo->Entries.push_back(CurrEntry); CurrEntry = OtherSucc; } while (true); if (!CandidateFound) return std::unique_ptr(); // Do sanity check of the entries: threre should not // be any successors (not in the entry set) other than // {ReturnBlock, NonReturnBlock} assert(OutliningInfo->Entries[0] == &F->front() && "Function Entry must be the first in Entries vector"); DenseSet Entries; for (BasicBlock *E : OutliningInfo->Entries) Entries.insert(E); // Returns true of BB has Predecessor which is not // in Entries set. auto HasNonEntryPred = [Entries](BasicBlock *BB) { for (auto Pred : predecessors(BB)) { if (!Entries.count(Pred)) return true; } return false; }; auto CheckAndNormalizeCandidate = [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) { for (BasicBlock *E : OutliningInfo->Entries) { for (auto Succ : successors(E)) { if (Entries.count(Succ)) continue; if (Succ == OutliningInfo->ReturnBlock) OutliningInfo->ReturnBlockPreds.push_back(E); else if (Succ != OutliningInfo->NonReturnBlock) return false; } // There should not be any outside incoming edges either: if (HasNonEntryPred(E)) return false; } return true; }; if (!CheckAndNormalizeCandidate(OutliningInfo.get())) return std::unique_ptr(); // Now further growing the candidate's inlining region by // peeling off dominating blocks from the outlining region: while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) { BasicBlock *Cand = OutliningInfo->NonReturnBlock; if (SuccSize(Cand) != 2) break; if (HasNonEntryPred(Cand)) break; BasicBlock *Succ1 = *succ_begin(Cand); BasicBlock *Succ2 = *(succ_begin(Cand) + 1); BasicBlock *ReturnBlock, *NonReturnBlock; std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock) break; if (NonReturnBlock->getSinglePredecessor() != Cand) break; // Now grow and update OutlininigInfo: OutliningInfo->Entries.push_back(Cand); OutliningInfo->NonReturnBlock = NonReturnBlock; OutliningInfo->ReturnBlockPreds.push_back(Cand); Entries.insert(Cand); } return OutliningInfo; } // Check if there is PGO data or user annoated branch data: static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { if (F->getEntryCount()) return true; // Now check if any of the entry block has MD_prof data: for (auto *E : OI->Entries) { BranchInst *BR = dyn_cast(E->getTerminator()); if (!BR || BR->isUnconditional()) continue; uint64_t T, F; if (BR->extractProfMetadata(T, F)) return true; } return false; } BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq( Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction, BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) { auto EntryFreq = BFI->getBlockFreq(&DuplicateFunction->getEntryBlock()); auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB); auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(), EntryFreq.getFrequency()); if (hasProfileData(F, OI)) return OutlineRegionRelFreq; // When profile data is not available, we need to be very // conservative in estimating the overall savings. We need to make sure // the outline region relative frequency is not below the threshold // specified by the option. OutlineRegionRelFreq = std::max(OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); return OutlineRegionRelFreq; } bool PartialInlinerImpl::shouldPartialInline( CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI, BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB, int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) { using namespace ore; if (SkipCostAnalysis) return true; Instruction *Call = CS.getInstruction(); Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); auto &CalleeTTI = (*GetTTI)(*Callee); InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI, *GetAssumptionCache, GetBFI, PSI); if (IC.isAlways()) { ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) << NV("Callee", F) << " should always be fully inlined, not partially"); return false; } if (IC.isNever()) { ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) << NV("Callee", F) << " not partially inlined into " << NV("Caller", Caller) << " because it should never be inlined (cost=never)"); return false; } if (!IC) { ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) << NV("Callee", F) << " not partially inlined into " << NV("Caller", Caller) << " because too costly to inline (cost=" << NV("Cost", IC.getCost()) << ", threshold=" << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); return false; } const DataLayout &DL = Caller->getParent()->getDataLayout(); // The savings of eliminating the call: int NonWeightedSavings = getCallsiteCost(CS, DL); BlockFrequency NormWeightedSavings(NonWeightedSavings); auto RelativeFreq = getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB); auto NormWeightedRcost = BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq; // Weighted saving is smaller than weighted cost, return false if (NormWeightedSavings < NormWeightedRcost) { ORE.emit( OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call) << NV("Callee", F) << " not partially inlined into " << NV("Caller", Caller) << " runtime overhead (overhead=" << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency()) << ", savings=" << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")" << " of making the outlined call is too high"); return false; } ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) << NV("Callee", F) << " can be partially inlined into " << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) << " (threshold=" << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); return true; } // TODO: Ideally we should share Inliner's InlineCost Analysis code. // For now use a simplified version. The returned 'InlineCost' will be used // to esimate the size cost as well as runtime cost of the BB. int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { int InlineCost = 0; const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { if (isa(I)) continue; if (CallInst *CI = dyn_cast(I)) { InlineCost += getCallsiteCost(CallSite(CI), DL); continue; } if (InvokeInst *II = dyn_cast(I)) { InlineCost += getCallsiteCost(CallSite(II), DL); continue; } if (SwitchInst *SI = dyn_cast(I)) { InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; continue; } InlineCost += InlineConstants::InstrCost; } return InlineCost; } std::tuple PartialInlinerImpl::computeOutliningCosts( Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction, BasicBlock *OutliningCallBB) { // First compute the cost of the outlined region 'OI' in the original // function 'F': int OutlinedRegionCost = 0; for (BasicBlock &BB : *F) { if (&BB != OI->ReturnBlock && // Assuming Entry set is small -- do a linear search here: std::find(OI->Entries.begin(), OI->Entries.end(), &BB) == OI->Entries.end()) { OutlinedRegionCost += computeBBInlineCost(&BB); } } // Now compute the cost of the call sequence to the outlined function // 'OutlinedFunction' in BB 'OutliningCallBB': int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB); // Now compute the cost of the extracted/outlined function itself: int OutlinedFunctionCost = 0; for (BasicBlock &BB : *OutlinedFunction) { OutlinedFunctionCost += computeBBInlineCost(&BB); } assert(OutlinedFunctionCost >= OutlinedRegionCost && "Outlined function cost should be no less than the outlined region"); int OutliningRuntimeOverhead = OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost); return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead, OutlinedRegionCost); } // Create the callsite to profile count map which is // used to update the original function's entry count, // after the function is partially inlined into the callsite. void PartialInlinerImpl::computeCallsiteToProfCountMap( Function *DuplicateFunction, DenseMap &CallSiteToProfCountMap) { std::vector Users(DuplicateFunction->user_begin(), DuplicateFunction->user_end()); Function *CurrentCaller = nullptr; std::unique_ptr TempBFI; BlockFrequencyInfo *CurrentCallerBFI = nullptr; auto ComputeCurrBFI = [&,this](Function *Caller) { // For the old pass manager: if (!GetBFI) { DominatorTree DT(*Caller); LoopInfo LI(DT); BranchProbabilityInfo BPI(*Caller, LI); TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); CurrentCallerBFI = TempBFI.get(); } else { // New pass manager: CurrentCallerBFI = &(*GetBFI)(*Caller); } }; for (User *User : Users) { CallSite CS = getCallSite(User); Function *Caller = CS.getCaller(); if (CurrentCaller != Caller) { CurrentCaller = Caller; ComputeCurrBFI(Caller); } else { assert(CurrentCallerBFI && "CallerBFI is not set"); } BasicBlock *CallBB = CS.getInstruction()->getParent(); auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); if (Count) CallSiteToProfCountMap[User] = *Count; else CallSiteToProfCountMap[User] = 0; } } Function *PartialInlinerImpl::unswitchFunction(Function *F) { if (F->hasAddressTaken()) return nullptr; // Let inliner handle it if (F->hasFnAttribute(Attribute::AlwaysInline)) return nullptr; if (F->hasFnAttribute(Attribute::NoInline)) return nullptr; if (PSI->isFunctionEntryCold(F)) return nullptr; if (F->user_begin() == F->user_end()) return nullptr; std::unique_ptr OI = computeOutliningInfo(F); if (!OI) return nullptr; // Clone the function, so that we can hack away on it. ValueToValueMapTy VMap; Function *DuplicateFunction = CloneFunction(F, VMap); BasicBlock *NewReturnBlock = cast(VMap[OI->ReturnBlock]); BasicBlock *NewNonReturnBlock = cast(VMap[OI->NonReturnBlock]); DenseSet NewEntries; for (BasicBlock *BB : OI->Entries) { NewEntries.insert(cast(VMap[BB])); } // Go ahead and update all uses to the duplicate, so that we can just // use the inliner functionality when we're done hacking. F->replaceAllUsesWith(DuplicateFunction); auto getFirstPHI = [](BasicBlock *BB) { BasicBlock::iterator I = BB->begin(); PHINode *FirstPhi = nullptr; while (I != BB->end()) { PHINode *Phi = dyn_cast(I); if (!Phi) break; if (!FirstPhi) { FirstPhi = Phi; break; } } return FirstPhi; }; // Special hackery is needed with PHI nodes that have inputs from more than // one extracted block. For simplicity, just split the PHIs into a two-level // sequence of PHIs, some of which will go in the extracted region, and some // of which will go outside. BasicBlock *PreReturn = NewReturnBlock; // only split block when necessary: PHINode *FirstPhi = getFirstPHI(PreReturn); unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size(); auto IsTrivialPhi = [](PHINode *PN) -> Value * { Value *CommonValue = PN->getIncomingValue(0); if (all_of(PN->incoming_values(), [&](Value *V) { return V == CommonValue; })) return CommonValue; return nullptr; }; if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) { NewReturnBlock = NewReturnBlock->splitBasicBlock( NewReturnBlock->getFirstNonPHI()->getIterator()); BasicBlock::iterator I = PreReturn->begin(); Instruction *Ins = &NewReturnBlock->front(); SmallVector DeadPhis; while (I != PreReturn->end()) { PHINode *OldPhi = dyn_cast(I); if (!OldPhi) break; PHINode *RetPhi = PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins); OldPhi->replaceAllUsesWith(RetPhi); Ins = NewReturnBlock->getFirstNonPHI(); RetPhi->addIncoming(&*I, PreReturn); for (BasicBlock *E : OI->ReturnBlockPreds) { BasicBlock *NewE = cast(VMap[E]); RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE); OldPhi->removeIncomingValue(NewE); } // After incoming values splitting, the old phi may become trivial. // Keeping the trivial phi can introduce definition inside the outline // region which is live-out, causing necessary overhead (load, store // arg passing etc). if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { OldPhi->replaceAllUsesWith(OldPhiVal); DeadPhis.push_back(OldPhi); } ++I; } for (auto *DP : DeadPhis) DP->eraseFromParent(); for (auto E : OI->ReturnBlockPreds) { BasicBlock *NewE = cast(VMap[E]); NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock); } } // Returns true if the block is to be partial inlined into the caller // (i.e. not to be extracted to the out of line function) auto ToBeInlined = [&](BasicBlock *BB) { return BB == NewReturnBlock || NewEntries.count(BB); }; // Gather up the blocks that we're going to extract. std::vector ToExtract; ToExtract.push_back(NewNonReturnBlock); for (BasicBlock &BB : *DuplicateFunction) if (!ToBeInlined(&BB) && &BB != NewNonReturnBlock) ToExtract.push_back(&BB); // The CodeExtractor needs a dominator tree. DominatorTree DT; DT.recalculate(*DuplicateFunction); // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. LoopInfo LI(DT); BranchProbabilityInfo BPI(*DuplicateFunction, LI); BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI); // Extract the body of the if. Function *OutlinedFunction = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI) .extractCodeRegion(); bool AnyInline = tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI); // Ditch the duplicate, since we're done with it, and rewrite all remaining // users (function pointers, etc.) back to the original function. DuplicateFunction->replaceAllUsesWith(F); DuplicateFunction->eraseFromParent(); if (AnyInline) return OutlinedFunction; // Remove the function that is speculatively created: if (OutlinedFunction) OutlinedFunction->eraseFromParent(); return nullptr; } bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction, Function *F, FunctionOutliningInfo *OI, Function *OutlinedFunction, BlockFrequencyInfo *CalleeBFI) { if (OutlinedFunction == nullptr) return false; int NonWeightedRcost; int SizeCost; int OutlinedRegionSizeCost; auto OutliningCallBB = getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent(); std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) = computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB); // The call sequence to the outlined function is larger than the original // outlined region size, it does not increase the chances of inlining // 'F' with outlining (The inliner usies the size increase to model the // the cost of inlining a callee). if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) { OptimizationRemarkEmitter ORE(F); DebugLoc DLoc; BasicBlock *Block; std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction); ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", DLoc, Block) << ore::NV("Function", F) << " not partially inlined into callers (Original Size = " << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost) << ", Size of call sequence to outlined function = " << ore::NV("NewSize", SizeCost) << ")"); return false; } assert(F->user_begin() == F->user_end() && "F's users should all be replaced!"); std::vector Users(DuplicateFunction->user_begin(), DuplicateFunction->user_end()); DenseMap CallSiteToProfCountMap; if (F->getEntryCount()) computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap); auto CalleeEntryCount = F->getEntryCount(); uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0); bool AnyInline = false; for (User *User : Users) { CallSite CS = getCallSite(User); if (IsLimitReached()) continue; OptimizationRemarkEmitter ORE(CS.getCaller()); if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB, NonWeightedRcost, ORE)) continue; ORE.emit( OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction()) << ore::NV("Callee", F) << " partially inlined into " << ore::NV("Caller", CS.getCaller())); InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI); InlineFunction(CS, IFI); // Now update the entry count: if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { uint64_t CallSiteCount = CallSiteToProfCountMap[User]; CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); } AnyInline = true; NumPartialInlining++; // Update the stats NumPartialInlined++; } if (AnyInline && CalleeEntryCount) F->setEntryCount(CalleeEntryCountV); return AnyInline; } bool PartialInlinerImpl::run(Module &M) { if (DisablePartialInlining) return false; std::vector Worklist; Worklist.reserve(M.size()); for (Function &F : M) if (!F.use_empty() && !F.isDeclaration()) Worklist.push_back(&F); bool Changed = false; while (!Worklist.empty()) { Function *CurrFunc = Worklist.back(); Worklist.pop_back(); if (CurrFunc->use_empty()) continue; bool Recursive = false; for (User *U : CurrFunc->users()) if (Instruction *I = dyn_cast(U)) if (I->getParent()->getParent() == CurrFunc) { Recursive = true; break; } if (Recursive) continue; if (Function *NewFunc = unswitchFunction(CurrFunc)) { Worklist.push_back(NewFunc); Changed = true; } } return Changed; } char PartialInlinerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner", false, false) ModulePass *llvm::createPartialInliningPass() { return new PartialInlinerLegacyPass(); } PreservedAnalyses PartialInlinerPass::run(Module &M, ModuleAnalysisManager &AM) { auto &FAM = AM.getResult(M).getManager(); std::function GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & { return FAM.getResult(F); }; std::function GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & { return FAM.getResult(F); }; std::function GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { return FAM.getResult(F); }; ProfileSummaryInfo *PSI = &AM.getResult(M); if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M)) return PreservedAnalyses::none(); return PreservedAnalyses::all(); }