//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45), cl::ZeroOrMore,
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));
static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false),
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  int Threshold;
  int Cost;
  bool ComputeFullInlineCost;
  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int VectorBonus, TenPercentVectorBonus;
  // Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus;
  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
  int LoadEliminationCost;
  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  /// Return true if \p CS is a cold callsite.
  bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p CS is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallSite CS,
                                        BlockFrequencyInfo *CallerBFI);
  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallSite CSArg, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
                                       Params.ComputeFullInlineCost || ORE),
        IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
        EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
  disableLoadElimination();
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

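/// \brief Stop modeling repeated-load elimination and charge back its savings.
///
/// Once load elimination is disabled, the savings accumulated so far are added
/// back into Cost, since the loads we assumed were redundant can no longer be
/// proven so.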
void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    Cost += LoadEliminationCost;
    LoadEliminationCost = 0;
    EnableLoadElimination = false;
  }
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
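///
/// For example, indexing field 1 of a struct { i32, i64 } adds that field's
/// layout offset (8 bytes under typical natural alignment), while an array
/// index adds the index scaled by the element's alloc size.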
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// \brief Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

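/// \brief Model an alloca in the callee.
///
/// Static allocas (including array allocas whose size simplifies to a constant
/// at this callsite) have their bytes accumulated into AllocatedSize; any
/// remaining dynamic alloca currently disables inlining altogether.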
bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

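/// \brief Model a phi node, exploiting CFG edges already proven dead.
///
/// With dead incoming edges skipped, a phi whose live incoming values all
/// agree on one constant (or on one base pointer plus constant offset) can be
/// simplified, and SROA candidacy is propagated through it.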
bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits());
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer with
    // constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// \brief Check whether we can fold GEPs of constant-offset call site argument
/// pointers. This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

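// Example use: the cast visitors below pass a lambda performing the matching
// constant-expression operation, which runs only once every operand is known
// to be a constant.
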
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      Cost += InlineConstants::CallPenalty;
    break;
  default:
    break;
  }

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may be
  // beneficial:
  // entry:
  //   hot_call_1();
  //   ...
  //   hot_call_N();
  //   call @unreachable_function();
  //
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}

bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CS, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
                                      BlockFrequencyInfo *CallerBFI) {

  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  Function *Caller = CS.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // Vector bonuses: We want to more aggressively inline vector-dense kernels
  // and apply this bonus based on the percentage of vector instructions. A
  // bonus is applied if the vector instructions exceed 50% and half that amount
  // is applied if it exceeds 10%. Note that these bonuses are somewhat
  // arbitrary and evolved over time by accident as much as because they are
  // principled bonuses.
  // FIXME: It would be nice to base the bonus values on something more
  // scientific.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = 150;
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->optForMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->optForSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->optForMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
    if (!Caller->optForSize() && HotCallSiteThreshold) {
      DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compilation.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(CS, CallerBFI)) {
      DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;
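  // For example, with the default threshold of 225 and a multiplier of 1,
  // this gives a single-BB bonus of 112 and a vector bonus of 337.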

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }

  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

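/// \brief Fold a binary operator through InstSimplify using any operands
/// already simplified to constants; unsimplified operands disable SROA, and
/// expensive floating-point operations are charged as a call.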
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
                              CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
    Cost += InlineConstants::CallPenalty;

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    LoadEliminationCost += InlineConstants::InstrCost;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(CS, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

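/// \brief Model the cost of a call site.
///
/// Handles intrinsics specially, aborts the analysis on recursion or
/// returns-twice calls, and for indirect calls whose target has been
/// simplified to a known function, speculatively analyzes the target and
/// credits part of the saved threshold as a devirtualization bonus.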
bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
          disableLoadElimination();
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        disableLoadElimination();
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F) {
    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
                  IndirectCallParams);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  if (!F->onlyReadsMemory())
    disableLoadElimination();
  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increased in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  // Exit early for a large switch, assuming one case needs at least one
  // instruction.
  // FIXME: This is not true for a bit test, but ignore such case for now to
  // save compile-time.
  int64_t CostLowerBound =
      std::min((int64_t)CostUpperBound,
               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
    Cost = CostLowerBound;
    return false;
  }

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
    return false;
  }

  // Considering forming a binary search, we should find the number of nodes
  // which is same as the number of comparisons when lowered. For a given
  // number of clusters, n, we can define a recursive function, f(n), to find
  // the number of nodes in the tree. The recursion is:
  //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
  //   f(n) = n, when n <= 3.
  // This will lead to a binary tree where the leaf should be either f(2) or
  // f(3) when n > 3. So, the number of comparisons from leaves should be n,
  // while the number of non-leaf nodes should be:
  //   2^(log2(n) - 1) - 1
  //   = 2^log2(n) * 2^-1 - 1
  //   = n / 2 - 1.
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form:
  //   n + n / 2 - 1 = n * 3 / 2 - 1
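  //
  // For example, 20 case clusters lower to an estimated 3 * 20 / 2 - 1 = 29
  // comparisons, each modeled below as one compare plus one conditional
  // branch.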
  if (NumCaseCluster <= 3) {
    // Suppose a comparison includes one compare and one conditional branch.
    Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
    return false;
  }

  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
  int64_t SwitchCost =
      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

  Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible-to-inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " has uninlinable pattern and cost is not fully computed";
        });
      return false;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " is recursive and allocates too much stack space. Cost is "
                    "not fully computed";
        });
      return false;
    }

    // Check if we're past the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
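///
/// For example, a chain of inbounds GEPs with constant indices rooted at a
/// function argument strips down to that argument plus the summed byte offset.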
1607 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1608 if (!V->getType()->isPointerTy())
1611 unsigned IntPtrWidth = DL.getPointerSizeInBits();
1612 APInt Offset = APInt::getNullValue(IntPtrWidth);
  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

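// Illustrative use (a sketch; the concrete offsets are hypothetical): if V
// names a pointer formed by inbounds GEPs of 4 and then 8 bytes over an
// alloca, the call rewrites V to the alloca and returns the constant 12:
//
// \code
//   Value *V = ...; // pointer passed at the call site
//   if (ConstantInt *Offset = stripAndComputeInBoundsConstantOffsets(V)) {
//     // V is now the underlying base pointer; Offset->getValue() == 12.
//   }
// \endcode
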
/// \brief Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under examination.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead))
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}

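// Worked example: in a diamond CFG where `entry` branches conditionally to
// `then` and `else`, and both branch to `merge`, folding entry's branch to
// `then` records KnownSuccessors[entry] == then. The edge entry->else is then
// dead; as it was else's only incoming edge, `else` is newly dead. `merge`
// remains live through the still-live then->merge edge.
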
/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties.
  updateThreshold(CS, F);

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + VectorBonus);
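  // Worked example (numbers hypothetical, for illustration only): with a
  // threshold of 225, a SingleBBBonus of 112, and a VectorBonus of 337, the
  // speculative threshold becomes 225 + 112 + 337 = 674; both bonuses may be
  // taken back below once the callee's actual shape is known.
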
  // Give out bonuses for the callsite, as the instructions setting them up
  // will be gone after inlining.
  Cost -= getCallsiteCost(CS, DL);

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost >= Threshold && !ComputeFullInlineCost)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

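  // Worked example (hypothetical call site): for a call f(i32 42, i8* %buf)
  // where %buf is an alloca in the caller, the mapping above leaves
  // NumConstantArgs == 1 (the constant 42), NumConstantOffsetPtrArgs == 1
  // (%buf with a zero offset), and NumAllocaArgs == 1.
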
  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  bool SingleBB = true;
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= VectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= VectorBonus / 2;
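  // Worked example: for a callee with 100 counted instructions and a
  // VectorBonus of, say, 150: 10 or fewer vector instructions forfeits the
  // full 150; 11 to 50 forfeits half (75); more than 50 keeps the entire
  // bonus.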

  return Cost < std::max(1, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
  int Cost = 0;
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

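      // Worked example (hypothetical target): a 36-byte (288-bit) byval
      // struct with 64-bit pointers yields NumStores = (288 + 63) / 64 = 5,
      // the ceiling of 288 / 64.
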
      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InlineConstants::InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
  return Cost;
}

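// Illustrative use (mirroring the credit taken in analyzeCall; assumes a
// CallSite CS being analyzed):
//
// \code
//   const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout();
//   Cost -= llvm::getCallsiteCost(CS, DL); // argument setup vanishes if inlined
// \endcode
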
InlineCost llvm::getInlineCost(
    CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetBFI, PSI, ORE);
}

InlineCost llvm::getInlineCost(
    CallSite CS, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = CS.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be interposed at link-time. Don't inline
  // functions marked noinline or call sites marked noinline.
  // Note: inlining non-exact non-interposable functions is fine, since we know
  // we have *a* correct implementation of the source level function.
  if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
      CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                     << "... (caller:" << Caller->getName() << ")\n");

  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
                  Params);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

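// Illustrative use (analysis providers elided): the result converts to true
// when inlining is recommended, including the forced getAlways() case:
//
// \code
//   InlineCost IC = llvm::getInlineCost(CS, Params, CalleeTTI,
//                                       GetAssumptionCache, GetBFI, PSI, ORE);
//   if (IC)
//     ; // inline CS; for variable costs, IC.getCostDelta() is the slack
// \endcode
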
bool llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      // Disallow inlining functions that call @llvm.localescape. Doing this
      // correctly would require major changes to the inliner.
      if (CS.getCalledFunction() &&
          CS.getCalledFunction()->getIntrinsicID() ==
              llvm::Intrinsic::localescape)
        return false;
    }
  }

  return true;
}

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  // * optimization or size-optimization levels,
  // * a value passed to the createFunctionInliningPass function, or
  // * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(InlineThreshold);
}

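// Illustrative behavior (flag value hypothetical): with -inline-threshold=500
// on the command line, both calls below produce Params.DefaultThreshold == 500
// because an explicitly specified flag wins over the argument:
//
// \code
//   InlineParams P0 = llvm::getInlineParams();
//   InlineParams P1 = llvm::getInlineParams(/*Threshold=*/100);
// \endcode
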
// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return InlineThreshold;
}

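// Illustrative mapping of the rules above:
//
// \code
//   computeThresholdFromOptLevels(3, 0); // -O3 -> OptAggressiveThreshold
//   computeThresholdFromOptLevels(2, 1); // -Os -> OptSizeThreshold
//   computeThresholdFromOptLevels(2, 2); // -Oz -> OptMinSizeThreshold
//   computeThresholdFromOptLevels(2, 0); // -O2 -> -inline-threshold default
// \endcode
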
InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  InlineParams Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of -locally-hot-callsite-threshold option to populate
  // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
  // when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}