//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

// Threshold to use when optsize is specified (and there is no
// -inline-threshold).
const int OptSizeThreshold = 75;

// Threshold to use when -Oz is specified (and there is no -inline-threshold).
const int OptMinSizeThreshold = 25;

// Threshold to use when -O[34] is specified (and there is no
// -inline-threshold).
const int OptAggressiveThreshold = 275;

static cl::opt<int> DefaultInlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(225),
    cl::desc("Threshold for inlining functions with cold attribute"));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// The cache of @llvm.assume intrinsics.
  AssumptionCacheTracker *ACT;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  // The called function.
  Function &F;

  // The candidate callsite being analyzed. Please do not use this to do
  // analysis in the caller function; we want the inline cost query to be
  // easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  int Threshold;
  int Cost;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int FiftyPercentVectorBonus, TenPercentVectorBonus;
  int VectorBonus;

  // While we walk the potentially-inlined instructions, we build up and
  // maintain a mapping of simplified values specific to this callsite. The
  // idea is to propagate any special information we have about arguments to
  // this call through the inlinable section of the function, and account for
  // likely simplifications post-inlining. The most important aspect we track
  // is CFG altering simplifications -- when we prove a basic block dead, that
  // can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  // Keep track of the values which map back (through function arguments) to
  // allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  // The mapping of caller Alloca values to their accumulated cost savings. If
  // we have to disable SROA for one of the allocas, this tells us how much
  // cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  // Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

177 bool visitInstruction(Instruction &I);
179 // Our visit overrides.
180 bool visitAlloca(AllocaInst &I);
181 bool visitPHI(PHINode &I);
182 bool visitGetElementPtr(GetElementPtrInst &I);
183 bool visitBitCast(BitCastInst &I);
184 bool visitPtrToInt(PtrToIntInst &I);
185 bool visitIntToPtr(IntToPtrInst &I);
186 bool visitCastInst(CastInst &I);
187 bool visitUnaryInstruction(UnaryInstruction &I);
188 bool visitCmpInst(CmpInst &I);
189 bool visitSub(BinaryOperator &I);
190 bool visitBinaryOperator(BinaryOperator &I);
191 bool visitLoad(LoadInst &I);
192 bool visitStore(StoreInst &I);
193 bool visitExtractValue(ExtractValueInst &I);
194 bool visitInsertValue(InsertValueInst &I);
195 bool visitCallSite(CallSite CS);
196 bool visitReturnInst(ReturnInst &RI);
197 bool visitBranchInst(BranchInst &BI);
198 bool visitSwitchInst(SwitchInst &SI);
199 bool visitIndirectBrInst(IndirectBrInst &IBI);
200 bool visitResumeInst(ResumeInst &RI);
201 bool visitCleanupReturnInst(CleanupReturnInst &RI);
202 bool visitCatchReturnInst(CatchReturnInst &RI);
203 bool visitUnreachableInst(UnreachableInst &I);
public:
  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
               ProfileSummaryInfo *PSI, Function &Callee, int Threshold,
               CallSite CSArg)
      : TTI(TTI), ACT(ACT), PSI(PSI), F(Callee), CandidateCS(CSArg),
        Threshold(Threshold), Cost(0), IsCallerRecursive(false),
        IsRecursiveCall(false), ExposesReturnsTwice(false),
        HasDynamicAlloca(false), ContainsNoDuplicateCall(false),
        HasReturn(false), HasIndirectBr(false), HasFrameEscape(false),
        AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
        FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
        NumInstructionsSimplified(0), SROACostSavings(0),
        SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

/// \brief Check whether a GEP's indices are all constant.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
      return false;
  return true;
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}
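
// A worked example of the accumulation in accumulateGEPOffset above
// (illustrative, assuming 64-bit pointers and natural struct layout): for
//   %p = getelementptr inbounds {i32, i64}, {i32, i64}* %base, i32 0, i32 1
// the struct-index path adds getElementOffset(1) == 8 to Offset, while an
// array index such as element 3 of [10 x i32] takes the scaled-index path
// and adds 3 * 4 == 12.
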
bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      const DataLayout &DL = F.getParent()->getDataLayout();
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }
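
  // For example (illustrative): a callee containing "alloca [16 x i32]"
  // accumulates 64 bytes here; analyzeBlock later compares AllocatedSize
  // against InlineConstants::TotalAllocaSizeRecursiveCaller when the caller
  // is recursive.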

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We should potentially be tracking values through phi nodes,
  // especially when they collapse to a single value due to deleted CFG edges.

  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Try to fold GEPs of constant-offset call site argument pointers. This
  // requires target data and inbounds GEPs.
  if (I.isInBounds()) {
    // Check if we have a base + offset for the pointer.
    Value *Ptr = I.getPointerOperand();
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
    if (BaseAndOffset.first) {
      // Check if the offset of this GEP is constant, and if so accumulate it
      // into Offset.
      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
        // Non-constant GEPs aren't folded, and disable SROA.
        if (SROACandidate)
          disableSROA(CostIt);
        return false;
      }

      // Add the result as a new mapping to Base + Offset.
      ConstantOffsetPtrs[&I] = BaseAndOffset;

      // Also handle SROA candidates here, we already know that the GEP is
      // all-constant indexed.
      if (SROACandidate)
        SROAArgValues[&I] = SROAArg;

      return true;
    }
  }

  if (isGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return false;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Operand);
  if (!COp)
    COp = SimplifiedValues.lookup(Operand);
  if (COp) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
      SimplifiedValues[&I] = C;
      return true;
    }
  }

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  unsigned ArgNo = A->getArgNo();
  return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
}
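
// (Indexing note for paramHasAttr above: call-site attribute indices are
// 1-based, with index 0 reserved for the return value, so the 0-based ArgNo
// maps to attribute index ArgNo + 1.)
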
bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in below scenario inlining hot_call_X() may be
  // beneficial:
  // entry:
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   call void @llvm.trap()
  //   unreachable
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  Function *Caller = CS.getCaller();
  if (DefaultInlineThreshold.getNumOccurrences() > 0) {
    // Explicitly specified -inline-threshold overrides the threshold passed to
    // CallAnalyzer's constructor.
    Threshold = DefaultInlineThreshold;
  } else {
    // If -inline-threshold is not given, listen to the optsize and minsize
    // attributes when they would decrease the threshold.
    if (Caller->optForMinSize() && OptMinSizeThreshold < Threshold)
      Threshold = OptMinSizeThreshold;
    else if (Caller->optForSize() && OptSizeThreshold < Threshold)
      Threshold = OptSizeThreshold;
  }

  bool HotCallsite = false;
  uint64_t TotalWeight;
  if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
      PSI->isHotCount(TotalWeight))
    HotCallsite = true;

  // Listen to the inlinehint attribute or profile based hotness information
  // when it would increase the threshold and the caller does not need to
  // minimize its size.
  bool InlineHint = Callee.hasFnAttribute(Attribute::InlineHint) ||
                    PSI->isHotFunction(&Callee) || HotCallsite;
  if (InlineHint && HintThreshold > Threshold && !Caller->optForMinSize())
    Threshold = HintThreshold;

  bool ColdCallee = PSI->isColdFunction(&Callee);
  // Command line argument for DefaultInlineThreshold will override the default
  // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold,
  // do not use the default cold threshold even if it is smaller.
  if ((DefaultInlineThreshold.getNumOccurrences() == 0 ||
       ColdThreshold.getNumOccurrences() > 0) &&
      ColdCallee && ColdThreshold < Threshold)
    Threshold = ColdThreshold;

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();
}
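
// A worked example of the adjustments above (illustrative, stock constants):
// with no -inline-threshold given, a callee marked inlinehint called from a
// caller that is neither optsize nor minsize sees the base threshold of 225
// raised to HintThreshold (325). A profile-cold callee is only lowered when
// -inlinecold-threshold is set below the current threshold, since the default
// ColdThreshold equals the 225 default.
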
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
    if (Constant *CRHS = dyn_cast<Constant>(RHS))
      if (Constant *C =
              ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        return true;
      }
  }

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // the same base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
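
  // For example (illustrative): after inlining a call that passes the address
  // of a caller-local alloca, a callee-side "icmp eq %ptr, null" guard folds
  // to false here, which lets the block walk in analyzeCall prove the guarded
  // block dead and skip its cost entirely.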

  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV =
        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
    SimplifiedValues[&I] = C;
    return true;
  }

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
  if (!C)
    C = SimplifiedValues.lookup(I.getAggregateOperand());
  if (C) {
    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
  if (!AggC)
    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
  if (!InsertedC)
    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
  if (AggC && InsertedC) {
    SimplifiedValues[&I] =
        ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}
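
// For example (illustrative): if SimplifiedValues records an argument as the
// constant 4.0, a callee-side call to @llvm.sqrt.f64 on that argument is
// constant folded to 2.0 here and contributes no cost after inlining.
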
bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;
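        // (Accounting note: because this returns false, analyzeBlock still
        // charges the usual InlineConstants::InstrCost for the call itself,
        // bringing the modeled total to the 4 instructions mentioned above.)
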
      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  CallAnalyzer CA(TTI, ACT, PSI, *F, InlineConstants::IndirectCallThreshold,
                  CS);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }
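
  // For example (illustrative): if the nested analysis succeeds with a cost
  // of 40, this call site's cost drops by the remaining headroom,
  // CA.getThreshold() - 40, rewarding likely devirtualization targets.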

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Otherwise, we need to accumulate a cost proportional to the number of
  // distinct successor blocks. This fan-out in the CFG cannot be represented
  // for free even if we can represent the core switch as a jumptable that
  // takes a single instruction.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.
  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
  SuccessorBlocks.insert(SI.getDefaultDest());
  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
    SuccessorBlocks.insert(I.getCaseSuccessor());
  // Add cost corresponding to the number of distinct destinations. The first
  // we model as free because of fallthrough.
  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
  return false;
}
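
// For example (illustrative): a switch whose default and four cases target
// three distinct blocks ends up with SuccessorBlocks.size() == 3 above, so
// the switch is charged 2 * InlineConstants::InstrCost.
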
bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddresses (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them to constants or dead code during
    // inlining, is what feeds the vector bonus heuristic. As long as that's
    // true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation
    // is expensive or the function has the "use-soft-float" attribute, this
    // may eventually become a library call. Treat the cost as such.
    if (I->getType()->isFloatingPointTy()) {
      bool hasSoftFloatAttr = false;

      // If the function has the "use-soft-float" attribute, mark it as
      // expensive.
      if (F.hasFnAttribute("use-soft-float")) {
        Attribute Attr = F.getFnAttribute("use-soft-float");
        StringRef Val = Attr.getValueAsString();
        if (Val == "true")
          hasSoftFloatAttr = true;
      }

      if (TTI.getFPOpCost(I->getType()) ==
              TargetTransformInfo::TCC_Expensive ||
          hasSoftFloatAttr)
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost > Threshold)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns null if V is not a pointer, and returns the constant '0' if there
/// are no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}
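
// For example (illustrative): if V is a bitcast of an inbounds GEP selecting
// element 2 of a caller-side "[4 x i32]" alloca, the loop above strips both
// operators, leaves V pointing at the alloca, and returns the constant 8
// (two 4-byte elements) as the accumulated offset.
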
/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties.
  updateThreshold(CS, F);

  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
    }
  }
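
  // A worked example of the byval case above (illustrative): a 40-byte byval
  // struct on a target with 64-bit pointers gives TypeSize == 320 and
  // NumStores == ceil(320 / 64) == 5, for a bonus of
  // 2 * 5 * InlineConstants::InstrCost.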

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost += InlineConstants::LastCallToStaticBonus;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F),
                                      EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= FiftyPercentVectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
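
  // A worked example of the bonus arithmetic (illustrative): with a threshold
  // of 225, FiftyPercentVectorBonus == 337 and TenPercentVectorBonus == 168.
  // A callee with at most 10% vector instructions loses the full 337, one
  // between 10% and 50% loses 337 - 168 == 169, and a kernel that is more
  // than half vector instructions keeps the entire speculative bonus.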

  return Cost < std::max(1, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// \brief Test that two functions either have or have not the given attribute
///        at the same time.
template <typename AttrKind>
static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
  return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
}

/// \brief Test that there are no attribute conflicts between Caller and Callee
///        that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold,
                               TargetTransformInfo &CalleeTTI,
                               AssumptionCacheTracker *ACT,
                               ProfileSummaryInfo *PSI) {
  return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI,
                       ACT, PSI);
}

int llvm::computeThresholdFromOptLevels(unsigned OptLevel,
                                        unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return OptMinSizeThreshold;
  return DefaultInlineThreshold;
}
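
// Summary of the mapping above (illustrative, stock constants): -O3 and
// higher return 275, -Os returns 75, -Oz returns 25, and everything else
// falls through to the 225 default (or the value of -inline-threshold when
// given).
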
int llvm::getDefaultInlineThreshold() { return DefaultInlineThreshold; }

InlineCost llvm::getInlineCost(CallSite CS, Function *Callee,
                               int DefaultThreshold,
                               TargetTransformInfo &CalleeTTI,
                               AssumptionCacheTracker *ACT,
                               ProfileSummaryInfo *PSI) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be interposed at link-time. Don't inline
  // functions marked noinline or call sites marked noinline.
  // Note: inlining non-exact non-interposable functions is fine, since we know
  // we have *a* correct implementation of the source level function.
  if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
      CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                     << "...\n");

  CallAnalyzer CA(CalleeTTI, ACT, PSI, *Callee, DefaultThreshold, CS);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

bool llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      // Disallow inlining functions that call @llvm.localescape. Doing this
      // correctly would require major changes to the inliner.
      if (CS.getCalledFunction() &&
          CS.getCalledFunction()->getIntrinsicID() ==
              llvm::Intrinsic::localescape)
        return false;
    }
  }

  return true;
}