//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false),
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  int Threshold;
  int Cost;
  bool ComputeFullInlineCost;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasUninlineableIntrinsic;
  bool UsesVarArgs;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int VectorBonus, TenPercentVectorBonus;
  // Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be reloaded.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
  int LoadEliminationCost;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  /// Return true if \p CS is a cold callsite.
  bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p CS is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallSite CS,
                                        BlockFrequencyInfo *CallerBFI);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallSite CSArg, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
                                       Params.ComputeFullInlineCost || ORE),
        IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasUninlineableIntrinsic(false), UsesVarArgs(false), AllocatedSize(0),
        NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
        SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
        NumInstructionsSimplified(0), SROACostSavings(0),
        SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
  disableLoadElimination();
}

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    Cost += LoadEliminationCost;
    LoadEliminationCost = 0;
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
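    // E.g. a constant index of 3 into an array of i32 contributes 3 * 4 = 12
    // bytes to Offset, since the alloc size of i32 is 4 bytes.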
  }

  return true;
}

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
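      // The saturating helpers clamp at UINT64_MAX instead of wrapping, so an
      // absurdly large constant array size cannot overflow AllocatedSize.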
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use the correct address space in the call to getPointerSizeInBits
  // here? Or could we skip the getPointerSizeInBits call completely? As far as
  // I can see the ZeroOffset is used as a dummy value, so we can probably use
  // any bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer
    // with constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// Check we can fold GEPs of constant-offset call site argument pointers.
/// This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      Cost += InlineConstants::CallPenalty;
    break;
  default:
    break;
  }

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in below scenario inlining hot_call_X() may be
  // beneficial:
  // entry:
  //   hot_call_1();
  //   ...
  //   hot_call_N();
  //   call unreachable_fn();
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}

bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CS, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
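  // E.g. with the default cold-callsite-rel-freq of 2, ColdProb is 2/100: the
  // callsite block must run less than 2% as often as the caller's entry block
  // to be considered cold.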
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
                                      BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
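  // E.g. with the default hot-callsite-rel-freq of 60, the callsite block must
  // execute at least 60x as often as the caller's entry block to qualify.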
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  Function *Caller = CS.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // Vector bonuses: We want to more aggressively inline vector-dense kernels
  // and apply this bonus based on the percentage of vector instructions. A
  // bonus is applied if the vector instructions exceed 50% and half that amount
  // is applied if it exceeds 10%. Note that these bonuses are somewhat
  // arbitrary and evolved over time by accident as much as because they are
  // principled bonuses.
  // FIXME: It would be nice to base the bonus values on something more
  // scientific.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function, as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = 150;
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->optForMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->optForSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->optForMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
    if (!Caller->optForSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compilation.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(CS, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite, including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase,
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // reduction.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee, including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase,
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

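  // E.g. with the default threshold of 225 and a multiplier of 1, the default
  // percentages yield SingleBBBonus = 112 and VectorBonus = 337.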
  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }

  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
                              CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
    Cost += InlineConstants::CallPenalty;

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    LoadEliminationCost += InlineConstants::InstrCost;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these, but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these, but give them a cost.
  return false;
}

/// Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(CS, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
          disableLoadElimination();
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        disableLoadElimination();
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::icall_branch_funnel:
      case Intrinsic::localescape:
        HasUninlineableIntrinsic = true;
        return false;
      case Intrinsic::vastart:
      case Intrinsic::vaend:
        UsesVarArgs = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F) {
    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
                  IndirectCallParams);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  if (!F->onlyReadsMemory())
    disableLoadElimination();
  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increased in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  // Exit early for a large switch, assuming one case needs at least one
  // instruction.
  // FIXME: This is not true for a bit test, but ignore such case for now to
  // save compile-time.
  int64_t CostLowerBound =
      std::min((int64_t)CostUpperBound,
               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
    Cost = CostLowerBound;
    return false;
  }

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
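    // The first term models the table itself; the extra 4 * InstrCost is a
    // rough allowance for the compare, bounds check, load, and indirect
    // branch used to dispatch through the table.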
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
    return false;
  }

  // Considering forming a binary search, we should find the number of nodes
  // which is same as the number of comparisons when lowered. For a given
  // number of clusters, n, we can define a recursive function, f(n), to find
  // the number of nodes in the tree. The recursion is:
  // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
  // and f(n) = n, when n <= 3.
  // This will lead to a binary tree where the leaf should be either f(2) or
  // f(3) when n > 3. So, the number of comparisons from leaves should be n,
  // while the number of non-leaf nodes should be:
  //   2^(log2(n) - 1) - 1
  //   = 2^log2(n) * 2^-1 - 1
  //   = n / 2 - 1.
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form:
  //   n + n / 2 - 1 = n * 3 / 2 - 1
  if (NumCaseCluster <= 3) {
    // Suppose a comparison includes one compare and one conditional branch.
    Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
    return false;
  }

  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
  int64_t SwitchCost =
      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
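  // E.g. 8 case clusters give 3 * 8 / 2 - 1 = 11 expected compares, each
  // modeled as a compare plus a conditional branch (22 instructions total).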

  Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. Inlining
  // one would be incorrect because all the blockaddress's (in static global
  // initializers for example) would be referring to the original function, and
  // this indirect jump would jump from the inlined copy of the function into
  // the original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}
1537 /// Analyze a basic block for its contribution to the inline cost.
1539 /// This method walks the analyzer over every instruction in the given basic
1540 /// block and accounts for their cost during inlining at this callsite. It
1541 /// aborts early if the threshold has been exceeded or an impossible to inline
1542 /// construct has been detected. It returns false if inlining is no longer
1543 /// viable, and true if inlining remains viable.
1544 bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
1545 SmallPtrSetImpl<const Value *> &EphValues) {
1546 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1547 // FIXME: Currently, the number of instructions in a function regardless of
1548 // our ability to simplify them during inline to constants or dead code,
1549 // are actually used by the vector bonus heuristic. As long as that's true,
1550 // we have to special case debug intrinsics here to prevent differences in
1551 // inlining due to debug symbols. Eventually, the number of unsimplified
1552 // instructions shouldn't factor into the cost computation, but until then,
1553 // hack around it here.
1554 if (isa<DbgInfoIntrinsic>(I))
1557 // Skip ephemeral values.
1558 if (EphValues.count(&*I))
1562 if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1563 ++NumVectorInstructions;
1565 // If the instruction simplified to a constant, there is no cost to this
1566 // instruction. Visit the instructions using our InstVisitor to account for
1567 // all of the per-instruction logic. The visit tree returns true if we
1568 // consumed the instruction in any way, and false if the instruction's base
1569 // cost should count against inlining.
1570 if (Base::visit(&*I))
1571 ++NumInstructionsSimplified;
1573 Cost += InlineConstants::InstrCost;
1575 using namespace ore;
1576 // If the visit this instruction detected an uninlinable pattern, abort.
1577 if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
1578 HasIndirectBr || HasUninlineableIntrinsic || UsesVarArgs) {
1581 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1582 CandidateCS.getInstruction())
1584 << " has uninlinable pattern and cost is not fully computed";
    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " is recursive and allocates too much stack space. Cost is "
                    "not fully computed";
        });
      return false;
    }
    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return false;
  }

  return true;
}
/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns null if V is not a pointer, and returns the constant '0' if there
/// are no constant offsets applied.
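///
/// For example (illustrative), given a pointer formed by
///   %p = getelementptr inbounds [8 x i32], [8 x i32]* %base, i32 0, i32 3
/// this rewrites V to %base and returns the accumulated byte offset 12
/// (3 elements of 4 bytes each) as a ConstantInt.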
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}
/// Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
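///
/// For example (illustrative), if this call site lets us fold
///   br i1 %cond, label %then, label %else
/// into an unconditional branch to %then, then %else is dead here unless it
/// has another live predecessor, and any block reachable only through %else
/// dies with it.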
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under examination.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };
  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };
  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead))
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}
/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;
  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties.
  updateThreshold(CS, F);

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + VectorBonus);
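  // Because Cost only ever grows during the analysis below, checking it
  // against this maximal Threshold is a conservative early exit: once Cost
  // crosses it with every bonus already applied, no later adjustment can make
  // inlining profitable.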
  // Give out bonuses for the callsite, as the instructions setting them up
  // will be gone after inlining.
  Cost -= getCallsiteCost(CS, DL);

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost >= Threshold && !ComputeFullInlineCost)
    return false;

  if (F.empty())
    return true;
  Function *Caller = CS.getInstruction()->getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }
  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
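  // For example (illustrative), a call that passes the address of a local
  // alloca records the corresponding formal argument in SROAArgValues, so
  // loads and stores through that argument in the callee can later be
  // credited as SROA cost savings.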
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  bool SingleBB = true;
  // Note that we *must not* cache the size, as this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;
    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    if (!analyzeBlock(BB, EphValues))
      return false;
    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }
    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }
  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;
  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= VectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= VectorBonus / 2;
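  // Clamping the threshold at 1 keeps the comparison meaningful when bonuses
  // and penalties have driven Threshold to zero or below: call sites whose
  // computed cost is zero or negative can still be inlined.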
  return Cost < std::max(1, Threshold);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif
/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}
int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
  int Cost = 0;
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InlineConstants::InstrCost;
    }
  }
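  // As a worked example (illustrative): a byval argument of 100 bytes on a
  // target with 64-bit pointers needs ceil(800 / 64) = 13 word copies, which
  // is clamped to 8 and contributes 2 * 8 * InstrCost, while a plain argument
  // contributes a single InstrCost. The caller subtracts this total from the
  // inline cost as a call-site bonus.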
  // The call instruction also disappears after inlining.
  Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
  return Cost;
}
InlineCost llvm::getInlineCost(
    CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetBFI, PSI, ORE);
}
InlineCost llvm::getInlineCost(
    CallSite CS, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();
  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
    if (CS.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return llvm::InlineCost::getNever();
    }
  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }
  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = CS.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return llvm::InlineCost::getNever();
  // Don't inline functions which can be interposed at link-time. Don't inline
  // functions marked noinline or call sites marked noinline.
  // Note: inlining non-exact non-interposable functions is fine, since we know
  // we have *a* correct implementation of the source level function.
  if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
      CS.isNoInline())
    return llvm::InlineCost::getNever();
  LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                          << "... (caller:" << Caller->getName() << ")\n");

  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
                  Params);
  bool ShouldInline = CA.analyzeCall(CS);

  LLVM_DEBUG(CA.dump());
  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}
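// A typical invocation from an inliner pass might look like this (an
// illustrative sketch, not a call site that exists in this file):
//   InlineCost IC = getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache,
//                                 GetBFI, PSI, &ORE);
//   if (IC)
//     ... inline the call site ...
// where InlineCost's boolean conversion reports whether the computed cost
// stayed under the threshold.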
bool llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;
      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;
      if (CS.getCalledFunction())
        switch (CS.getCalledFunction()->getIntrinsicID()) {
        default:
          break;
        // Disallow inlining of @llvm.icall.branch.funnel because the current
        // backend can't separate call targets from call arguments.
        case llvm::Intrinsic::icall_branch_funnel:
        // Disallow inlining functions that call @llvm.localescape. Doing this
        // correctly would require major changes to the inliner.
        case llvm::Intrinsic::localescape:
        // Disallow inlining of functions that access VarArgs.
        case llvm::Intrinsic::vastart:
        case llvm::Intrinsic::vaend:
          return false;
        }
    }
  }

  return true;
}
// APIs to create InlineParams based on command line flags and/or other
// factors.
InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;
  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}
InlineParams llvm::getInlineParams() {
  return getInlineParams(InlineThreshold);
}
// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return InlineThreshold;
}
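// For example, computeThresholdFromOptLevels(3, 0) selects
// InlineConstants::OptAggressiveThreshold for -O3, while
// computeThresholdFromOptLevels(2, 2) selects
// InlineConstants::OptMinSizeThreshold for -Oz.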
InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of the -locally-hot-callsite-threshold option to
  // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag has
  // effect only when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}