//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45), cl::ZeroOrMore,
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));
static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false),
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  int Threshold;
  int Cost;
  bool ComputeFullInlineCost;
  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int VectorBonus, TenPercentVectorBonus;
  // Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus;
  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
  int LoadEliminationCost;
  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  /// Return true if \p CS is a cold callsite.
  bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p CS is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallSite CS,
                                        BlockFrequencyInfo *CallerBFI);
  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallSite CSArg, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
                                       Params.ComputeFullInlineCost || ORE),
        IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
        EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
  disableLoadElimination();
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

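/// \brief Stop modeling repeated-load elimination and charge back its savings.
///
/// Once load elimination is disabled, the savings accumulated so far are added
/// back into Cost, since the loads we assumed were redundant can no longer be
/// proven so.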
void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    Cost += LoadEliminationCost;
    LoadEliminationCost = 0;
    EnableLoadElimination = false;
  }
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
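///
/// For example, indexing field 1 of a struct { i32, i64 } adds that field's
/// layout offset (8 bytes under typical natural alignment), while an array
/// index adds the index scaled by the element's alloc size.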
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// \brief Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

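/// \brief Model an alloca in the callee.
///
/// Static allocas (including array allocas whose size simplifies to a constant
/// at this callsite) have their bytes accumulated into AllocatedSize; any
/// remaining dynamic alloca currently disables inlining altogether.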
bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

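/// \brief Model a phi node, exploiting CFG edges already proven dead.
///
/// With dead incoming edges skipped, a phi whose live incoming values all
/// agree on one constant (or on one base pointer plus constant offset) can be
/// simplified, and SROA candidacy is propagated through it.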
bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits());
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer with
    // constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// \brief Check whether we can fold GEPs of constant-offset call site argument
/// pointers. This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

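// Example use: the cast visitors below pass a lambda performing the matching
// constant-expression operation, which runs only once every operand is known
// to be a constant.
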
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      Cost += InlineConstants::CallPenalty;
    break;
  default:
    break;
  }

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may be
  // beneficial:
  // entry:
  //   hot_call_1();
  //   ...
  //   hot_call_N();
  //   call @unreachable_function();
  //
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}

bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CS, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
                                      BlockFrequencyInfo *CallerBFI) {

  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  Function *Caller = CS.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // Vector bonuses: We want to more aggressively inline vector-dense kernels
  // and apply this bonus based on the percentage of vector instructions. A
  // bonus is applied if the vector instructions exceed 50% and half that amount
  // is applied if it exceeds 10%. Note that these bonuses are somewhat
  // arbitrary and evolved over time by accident as much as because they are
  // principled bonuses.
  // FIXME: It would be nice to base the bonus values on something more
  // scientific.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = 150;
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->optForMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->optForSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->optForMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
    if (!Caller->optForSize() && HotCallSiteThreshold) {
      DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compilation.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(CS, CallerBFI)) {
      DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;
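  // For example, with the default threshold of 225 and a multiplier of 1,
  // this gives a single-BB bonus of 112 and a vector bonus of 337.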

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }

  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

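/// \brief Fold a binary operator through InstSimplify using any operands
/// already simplified to constants; unsimplified operands disable SROA, and
/// expensive floating-point operations are charged as a call.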
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
                              CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
    Cost += InlineConstants::CallPenalty;

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    LoadEliminationCost += InlineConstants::InstrCost;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(CS, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

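/// \brief Model the cost of a call site.
///
/// Handles intrinsics specially, aborts the analysis on recursion or
/// returns-twice calls, and for indirect calls whose target has been
/// simplified to a known function, speculatively analyzes the target and
/// credits part of the saved threshold as a devirtualization bonus.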
bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
          disableLoadElimination();
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        disableLoadElimination();
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F) {
    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
                  IndirectCallParams);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  if (!F->onlyReadsMemory())
    disableLoadElimination();
  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increased in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  // Exit early for a large switch, assuming one case needs at least one
  // instruction.
  // FIXME: This is not true for a bit test, but ignore such case for now to
  // save compile-time.
  int64_t CostLowerBound =
      std::min((int64_t)CostUpperBound,
               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
    Cost = CostLowerBound;
    return false;
  }

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
    return false;
  }

  // Considering forming a binary search, we should find the number of nodes
  // which is same as the number of comparisons when lowered. For a given
  // number of clusters, n, we can define a recursive function, f(n), to find
  // the number of nodes in the tree. The recursion is:
  //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
  //   f(n) = n, when n <= 3.
  // This will lead to a binary tree where the leaf should be either f(2) or
  // f(3) when n > 3. So, the number of comparisons from leaves should be n,
  // while the number of non-leaf nodes should be:
  //   2^(log2(n) - 1) - 1
  //   = 2^log2(n) * 2^-1 - 1
  //   = n / 2 - 1.
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form:
  //   n + n / 2 - 1 = n * 3 / 2 - 1
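  //
  // For example, 20 case clusters lower to an estimated 3 * 20 / 2 - 1 = 29
  // comparisons, each modeled below as one compare plus one conditional
  // branch.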
  if (NumCaseCluster <= 3) {
    // Suppose a comparison includes one compare and one conditional branch.
    Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
    return false;
  }

  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
  int64_t SwitchCost =
      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

  Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible-to-inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " has uninlinable pattern and cost is not fully computed";
        });
      return false;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " is recursive and allocates too much stack space. Cost is "
                    "not fully computed";
        });
      return false;
    }

    // Check if we're past the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
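///
/// For example, a chain of inbounds GEPs with constant indices rooted at a
/// function argument strips down to that argument plus the summed byte offset.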
1607 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1608 if (!V->getType()->isPointerTy())
1611 unsigned IntPtrWidth = DL.getPointerSizeInBits();
1612 APInt Offset = APInt::getNullValue(IntPtrWidth);
  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

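// Illustrative use (a sketch; the concrete offsets are hypothetical): if V
// names a pointer formed by inbounds GEPs of 4 and then 8 bytes over an
// alloca, the call rewrites V to the alloca and returns the constant 12:
//
// \code
//   Value *V = ...; // pointer passed at the call site
//   if (ConstantInt *Offset = stripAndComputeInBoundsConstantOffsets(V)) {
//     // V is now the underlying base pointer; Offset->getValue() == 12.
//   }
// \endcode
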
/// \brief Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under examination.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead))
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}

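// Worked example: in a diamond CFG where `entry` branches conditionally to
// `then` and `else`, and both branch to `merge`, folding entry's branch to
// `then` records KnownSuccessors[entry] == then. The edge entry->else is then
// dead; as it was else's only incoming edge, `else` is newly dead. `merge`
// remains live through the still-live then->merge edge.
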
/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties.
  updateThreshold(CS, F);

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + VectorBonus);
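  // Worked example (numbers hypothetical, for illustration only): with a
  // threshold of 225, a SingleBBBonus of 112, and a VectorBonus of 337, the
  // speculative threshold becomes 225 + 112 + 337 = 674; both bonuses may be
  // taken back below once the callee's actual shape is known.
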
  // Give out bonuses for the callsite, as the instructions setting them up
  // will be gone after inlining.
  Cost -= getCallsiteCost(CS, DL);

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost >= Threshold && !ComputeFullInlineCost)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

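  // Worked example (hypothetical call site): for a call f(i32 42, i8* %buf)
  // where %buf is an alloca in the caller, the mapping above leaves
  // NumConstantArgs == 1 (the constant 42), NumConstantOffsetPtrArgs == 1
  // (%buf with a zero offset), and NumAllocaArgs == 1.
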
  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  bool SingleBB = true;
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= VectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= VectorBonus / 2;
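  // Worked example: for a callee with 100 counted instructions and a
  // VectorBonus of, say, 150: 10 or fewer vector instructions forfeits the
  // full 150; 11 to 50 forfeits half (75); more than 50 keeps the entire
  // bonus.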

  return Cost < std::max(1, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
  int Cost = 0;
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

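      // Worked example (hypothetical target): a 36-byte (288-bit) byval
      // struct with 64-bit pointers yields NumStores = (288 + 63) / 64 = 5,
      // the ceiling of 288 / 64.
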
      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InlineConstants::InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
  return Cost;
}

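// Illustrative use (mirroring the credit taken in analyzeCall; assumes a
// CallSite CS being analyzed):
//
// \code
//   const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout();
//   Cost -= llvm::getCallsiteCost(CS, DL); // argument setup vanishes if inlined
// \endcode
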
InlineCost llvm::getInlineCost(
    CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetBFI, PSI, ORE);
}

InlineCost llvm::getInlineCost(
    CallSite CS, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = CS.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be interposed at link-time. Don't inline
  // functions marked noinline or call sites marked noinline.
  // Note: inlining non-exact non-interposable functions is fine, since we know
  // we have *a* correct implementation of the source level function.
  if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
      CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                     << "... (caller:" << Caller->getName() << ")\n");

  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
                  Params);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

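// Illustrative use (analysis providers elided): the result converts to true
// when inlining is recommended, including the forced getAlways() case:
//
// \code
//   InlineCost IC = llvm::getInlineCost(CS, Params, CalleeTTI,
//                                       GetAssumptionCache, GetBFI, PSI, ORE);
//   if (IC)
//     ; // inline CS; for variable costs, IC.getCostDelta() is the slack
// \endcode
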
bool llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      // Disallow inlining functions that call @llvm.localescape. Doing this
      // correctly would require major changes to the inliner.
      if (CS.getCalledFunction() &&
          CS.getCalledFunction()->getIntrinsicID() ==
              llvm::Intrinsic::localescape)
        return false;
    }
  }

  return true;
}

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  // * optimization or size-optimization levels,
  // * a value passed to the createFunctionInliningPass function, or
  // * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(InlineThreshold);
}

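// Illustrative behavior (flag value hypothetical): with -inline-threshold=500
// on the command line, both calls below produce Params.DefaultThreshold == 500
// because an explicitly specified flag wins over the argument:
//
// \code
//   InlineParams P0 = llvm::getInlineParams();
//   InlineParams P1 = llvm::getInlineParams(/*Threshold=*/100);
// \endcode
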
// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return InlineThreshold;
}

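// Illustrative mapping of the rules above:
//
// \code
//   computeThresholdFromOptLevels(3, 0); // -O3 -> OptAggressiveThreshold
//   computeThresholdFromOptLevels(2, 1); // -Os -> OptSizeThreshold
//   computeThresholdFromOptLevels(2, 2); // -Oz -> OptMinSizeThreshold
//   computeThresholdFromOptLevels(2, 0); // -O2 -> -inline-threshold default
// \endcode
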
InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  InlineParams Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of -locally-hot-callsite-threshold option to populate
  // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
  // when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}