//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45), cl::ZeroOrMore,
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation-based
// PGO before we actually hook up the inliner with analysis passes such as
// BPI and BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(225),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites"));
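
// A hedged usage sketch (values are hypothetical): these knobs are normally
// driven from the command line, e.g.
//   opt -inline -inline-threshold=500 -inlinehint-threshold=600 foo.ll
// which raises both the default and the inline-hint thresholds for one run.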

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  int Threshold;
  int Cost;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int FiftyPercentVectorBonus, TenPercentVectorBonus;
  int VectorBonus;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPFree(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non-null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis during inlining.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg,
               const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()),
        CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // end anonymous namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}
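
// A hedged illustration (types and values are hypothetical): on a target with
// 64-bit pointers, a constant GEP into "struct { i32 a; i64 b; }" selecting
// field b accumulates the struct-layout offset of 8 bytes, while an array
// index of 3 over i32 elements accumulates 3 * 4 = 12 bytes.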

/// \brief Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Indices;
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Indices.push_back(SimpleOp);
    else
      Indices.push_back(*I);
  return TargetTransformInfo::TCC_Free ==
         TTI.getGEPCost(GEP.getSourceElementType(), GEP.getPointerOperand(),
                        Indices);
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}
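
// A hedged illustration (values are hypothetical): an "alloca i32, i32 %n"
// whose %n was simplified to the constant 100 through an argument is treated
// as a static allocation of 100 * 4 = 400 bytes of AllocatedSize rather than
// a dynamic alloca that would block inlining.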

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We should potentially be tracking values through phi nodes,
  // especially when they collapse to a single value due to deleted CFG edges
  // during inlining.

  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Try to fold GEPs of constant-offset call site argument pointers. This
  // requires target data and inbounds GEPs.
  if (I.isInBounds()) {
    // Check if we have a base + offset for the pointer.
    Value *Ptr = I.getPointerOperand();
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
    if (BaseAndOffset.first) {
      // Check if the offset of this GEP is constant, and if so accumulate it
      // into Offset.
      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
        // Non-constant GEPs aren't folded, and disable SROA.
        if (SROACandidate)
          disableSROA(CostIt);
        return isGEPFree(I);
      }

      // Add the result as a new mapping to Base + Offset.
      ConstantOffsetPtrs[&I] = BaseAndOffset;

      // Also handle SROA candidates here, we already know that the GEP is
      // all-constant indexed.
      if (SROACandidate)
        SROAArgValues[&I] = SROAArg;

      return true;
    }
  }

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if (IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in below scenario inlining hot_call_X() may be
  // beneficial:
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N();
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  Function *Caller = CS.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->optForMinSize())
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
  else if (Caller->optForSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->optForMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    if (PSI->isHotCallSite(CS, CallerBFI)) {
      DEBUG(dbgs() << "Hot callsite.\n");
      Threshold = Params.HotCallSiteThreshold.getValue();
    } else if (PSI->isFunctionEntryHot(&Callee)) {
      DEBUG(dbgs() << "Hot callee.\n");
      // If callsite hotness can not be determined, we may still know
      // that the callee is hot and treat it as a weaker hint for threshold
      // increase.
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);
    } else if (PSI->isColdCallSite(CS, CallerBFI)) {
      DEBUG(dbgs() << "Cold callsite.\n");
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI->isFunctionEntryCold(&Callee)) {
      DEBUG(dbgs() << "Cold callee.\n");
      Threshold = MinIfValid(Threshold, Params.ColdThreshold);
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();
}
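
// A worked example using the flag defaults defined above (hedged): starting
// from DefaultThreshold = 225, a callee carrying the inlinehint attribute is
// raised to max(225, 325) = 325, while a provably cold call site is instead
// clamped down to min(Threshold, ColdCallSiteThreshold).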

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }

  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  auto Evaluate = [&](SmallVectorImpl<Constant *> &COps) {
    Value *SimpleV = nullptr;
    if (auto FI = dyn_cast<FPMathOperator>(&I))
      SimpleV = SimplifyFPBinOp(I.getOpcode(), COps[0], COps[1],
                                FI->getFastMathFlags(), DL);
    else
      SimpleV = SimplifyBinOp(I.getOpcode(), COps[0], COps[1], DL);
    return dyn_cast_or_null<Constant>(SimpleV);
  };

  if (simplifyInstruction(I, Evaluate))
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free when inlined.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, *F, CS,
                  IndirectCallParams);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }
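
  // A hedged illustration: if the nested, pretend-inline analysis of the
  // devirtualized target finishes with getCost() = C and getThreshold() = T
  // where C < T, this call site's running Cost is reduced by T - C as the
  // bonus for exposing a direct target.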

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Otherwise, we need to accumulate a cost proportional to the number of
  // distinct successor blocks. This fan-out in the CFG cannot be represented
  // for free even if we can represent the core switch as a jumptable that
  // takes a single instruction.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.
  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
  SuccessorBlocks.insert(SI.getDefaultDest());
  for (auto Case : SI.cases())
    SuccessorBlocks.insert(Case.getCaseSuccessor());
  // Add cost corresponding to the number of distinct destinations. The first
  // we model as free because of fallthrough.
  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
  return false;
}
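
// A worked example (hedged): a switch with a default and three cases that
// target four distinct blocks reaches four successor blocks, and therefore
// adds (4 - 1) * InlineConstants::InstrCost to the running Cost.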

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation
    // is expensive or the function has the "use-soft-float" attribute, this may
    // eventually become a library call. Treat the cost as such.
    if (I->getType()->isFloatingPointTy()) {
      // If the function has the "use-soft-float" attribute, mark it as
      // expensive.
      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          (F.getFnAttribute("use-soft-float").getValueAsString() == "true"))
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If the visit of this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost > Threshold)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}
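
// A hedged illustration (names are hypothetical): given an argument such as
//   getelementptr inbounds i8, i8* %buf, i64 12
// this returns the ConstantInt 12 and rewrites V to point at %buf, seeding
// the ConstantOffsetPtrs map used throughout the analysis.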

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties.
  updateThreshold(CS, F);

  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // TargetTransformInfo.
      NumStores = std::min(NumStores, 8U);

      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
  }
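
  // A worked example (hedged): a byval struct of 192 bits on a target with
  // 64-bit pointers yields NumStores = (192 + 63) / 64 = 3, so this argument
  // is credited 2 * 3 * InlineConstants::InstrCost, modeling the three loads
  // and three stores that disappear after inlining.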

  // The call instruction also disappears after inlining.
  Cost -= InlineConstants::InstrCost + InlineConstants::CallPenalty;

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost -= InlineConstants::LastCallToStaticBonus;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BBWorklist.insert(SI->findCaseValue(SimpleCond)->getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= FiftyPercentVectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);

  return Cost < std::max(1, Threshold);
}
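
// A worked example (hedged, using the default threshold of 225): with
// Threshold = 225 after updateThreshold, FiftyPercentVectorBonus =
// 3 * 225 / 2 = 337 and TenPercentVectorBonus = 3 * 225 / 4 = 168. If at most
// 10% of the callee's instructions are vector instructions, the full 337 is
// subtracted back out before the final Cost < Threshold comparison.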

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

InlineCost llvm::getInlineCost(
    CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI) {
  return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetBFI, PSI);
}

InlineCost llvm::getInlineCost(
    CallSite CS, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be interposed at link-time. Don't inline
  // functions marked noinline or call sites marked noinline.
  // Note: inlining non-exact non-interposable functions is fine, since we know
  // we have *a* correct implementation of the source level function.
  if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
      CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                     << "...\n");

  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, *Callee, CS,
                  Params);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

bool llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      // Disallow inlining functions that call @llvm.localescape. Doing this
      // correctly would require major changes to the inliner.
      if (CS.getCalledFunction() &&
          CS.getCalledFunction()->getIntrinsicID() ==
              llvm::Intrinsic::localescape)
        return false;
    }
  }

  return true;
}

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  // * optimization or size-optimization levels,
  // * a value passed to the createFunctionInliningPass function, or
  // * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold command-line option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(InlineThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size optimization level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return InlineThreshold;
}
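
// A hedged summary of the mapping above: -O3 selects OptAggressiveThreshold,
// -Os selects OptSizeThreshold, -Oz selects OptMinSizeThreshold, and every
// other configuration falls back to the -inline-threshold default of 225.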

InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  return getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
}