contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp

   1 //===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This pass munges the code in the input function to better prepare it for
   11 // SelectionDAG-based code generation. This works around limitations in its
  12 // basic-block-at-a-time approach. It should eventually be removed.
  13 //
  14 //===----------------------------------------------------------------------===//
  15
  16 #include "llvm/ADT/DenseMap.h"
  17 #include "llvm/ADT/SetVector.h"
  18 #include "llvm/ADT/SmallSet.h"
  19 #include "llvm/ADT/Statistic.h"
  20 #include "llvm/Analysis/BlockFrequencyInfo.h"
  21 #include "llvm/Analysis/BranchProbabilityInfo.h"
  22 #include "llvm/Analysis/CFG.h"
  23 #include "llvm/Analysis/InstructionSimplify.h"
  24 #include "llvm/Analysis/LoopInfo.h"
  25 #include "llvm/Analysis/MemoryBuiltins.h"
  26 #include "llvm/Analysis/ProfileSummaryInfo.h"
  27 #include "llvm/Analysis/TargetLibraryInfo.h"
  28 #include "llvm/Analysis/TargetTransformInfo.h"
  29 #include "llvm/Analysis/ValueTracking.h"
  30 #include "llvm/CodeGen/Analysis.h"
  31 #include "llvm/CodeGen/Passes.h"
  32 #include "llvm/CodeGen/TargetPassConfig.h"
  33 #include "llvm/IR/CallSite.h"
  34 #include "llvm/IR/Constants.h"
  35 #include "llvm/IR/DataLayout.h"
  36 #include "llvm/IR/DerivedTypes.h"
  37 #include "llvm/IR/Dominators.h"
  38 #include "llvm/IR/Function.h"
  39 #include "llvm/IR/GetElementPtrTypeIterator.h"
  40 #include "llvm/IR/IRBuilder.h"
  41 #include "llvm/IR/InlineAsm.h"
  42 #include "llvm/IR/Instructions.h"
  43 #include "llvm/IR/IntrinsicInst.h"
  44 #include "llvm/IR/MDBuilder.h"
  45 #include "llvm/IR/PatternMatch.h"
  46 #include "llvm/IR/Statepoint.h"
  47 #include "llvm/IR/ValueHandle.h"
  48 #include "llvm/IR/ValueMap.h"
  49 #include "llvm/Pass.h"
  50 #include "llvm/Support/BranchProbability.h"
  51 #include "llvm/Support/CommandLine.h"
  52 #include "llvm/Support/Debug.h"
  53 #include "llvm/Support/raw_ostream.h"
  54 #include "llvm/Target/TargetLowering.h"
  55 #include "llvm/Target/TargetSubtargetInfo.h"
  56 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  57 #include "llvm/Transforms/Utils/BuildLibCalls.h"
  58 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
  59 #include "llvm/Transforms/Utils/Cloning.h"
  60 #include "llvm/Transforms/Utils/Local.h"
  61 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
  62 #include "llvm/Transforms/Utils/ValueMapper.h"
  63
  64 using namespace llvm;
  65 using namespace llvm::PatternMatch;
  66
  67 #define DEBUG_TYPE "codegenprepare"
  68
  69 STATISTIC(NumBlocksElim, "Number of blocks eliminated");
  70 STATISTIC(NumPHIsElim,   "Number of trivial PHIs eliminated");
  71 STATISTIC(NumGEPsElim,   "Number of GEPs converted to casts");
  72 STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
  73                       "sunken Cmps");
  74 STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
  75                        "of sunken Casts");
  76 STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
  77                           "computations were sunk");
  78 STATISTIC(NumExtsMoved,  "Number of [s|z]ext instructions combined with loads");
  79 STATISTIC(NumExtUses,    "Number of uses of [s|z]ext instructions optimized");
  80 STATISTIC(NumAndsAdded,
  81           "Number of and mask instructions added to form ext loads");
  82 STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
  83 STATISTIC(NumRetsDup,    "Number of return instructions duplicated");
  84 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
  85 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
  86 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
  87
  88 STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
  89 STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
  90 STATISTIC(NumMemCmpGreaterThanMax,
  91           "Number of memcmp calls with size greater than max size");
  92 STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
  93
  94 static cl::opt<bool> DisableBranchOpts(
  95   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
  96   cl::desc("Disable branch optimizations in CodeGenPrepare"));
  97
  98 static cl::opt<bool>
  99     DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
 100                   cl::desc("Disable GC optimizations in CodeGenPrepare"));
 101
 102 static cl::opt<bool> DisableSelectToBranch(
 103   "disable-cgp-select2branch", cl::Hidden, cl::init(false),
 104   cl::desc("Disable select to branch conversion."));
 105
 106 static cl::opt<bool> AddrSinkUsingGEPs(
 107   "addr-sink-using-gep", cl::Hidden, cl::init(true),
 108   cl::desc("Address sinking in CGP using GEPs."));
 109
 110 static cl::opt<bool> EnableAndCmpSinking(
 111    "enable-andcmp-sinking", cl::Hidden, cl::init(true),
 112    cl::desc("Enable sinkinig and/cmp into branches."));
 113
 114 static cl::opt<bool> DisableStoreExtract(
 115     "disable-cgp-store-extract", cl::Hidden, cl::init(false),
 116     cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));
 117
 118 static cl::opt<bool> StressStoreExtract(
 119     "stress-cgp-store-extract", cl::Hidden, cl::init(false),
 120     cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
 121
 122 static cl::opt<bool> DisableExtLdPromotion(
 123     "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
 124     cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
 125              "CodeGenPrepare"));
 126
 127 static cl::opt<bool> StressExtLdPromotion(
 128     "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
 129     cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
 130              "optimization in CodeGenPrepare"));
 131
 132 static cl::opt<bool> DisablePreheaderProtect(
 133     "disable-preheader-prot", cl::Hidden, cl::init(false),
 134     cl::desc("Disable protection against removing loop preheaders"));
 135
 136 static cl::opt<bool> ProfileGuidedSectionPrefix(
 137     "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,
 138     cl::desc("Use profile info to add section prefix for hot/cold functions"));
 139
 140 static cl::opt<unsigned> FreqRatioToSkipMerge(
 141     "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
 142     cl::desc("Skip merging empty blocks if (frequency of empty block) / "
 143              "(frequency of destination block) is greater than this ratio"));
 144
 145 static cl::opt<bool> ForceSplitStore(
 146     "force-split-store", cl::Hidden, cl::init(false),
 147     cl::desc("Force store splitting no matter what the target query says."));
 148
 149 static cl::opt<bool>
 150 EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
 151     cl::desc("Enable merging of redundant sexts when one is dominating"
 152     " the other."), cl::init(true));
 153
 154 static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
 155     "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
 156     cl::desc("The number of loads per basic block for inline expansion of "
 157              "memcmp that is only being compared against zero."));
 158
 159 namespace {
 160 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
 161 typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
 162 typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
 163 typedef SmallVector<Instruction *, 16> SExts;
 164 typedef DenseMap<Value *, SExts> ValueToSExts;
 165 class TypePromotionTransaction;
 166
 167   class CodeGenPrepare : public FunctionPass {
 168     const TargetMachine *TM;
 169     const TargetSubtargetInfo *SubtargetInfo;
 170     const TargetLowering *TLI;
 171     const TargetRegisterInfo *TRI;
 172     const TargetTransformInfo *TTI;
 173     const TargetLibraryInfo *TLInfo;
 174     const LoopInfo *LI;
 175     std::unique_ptr<BlockFrequencyInfo> BFI;
 176     std::unique_ptr<BranchProbabilityInfo> BPI;
 177
 178     /// As we scan instructions optimizing them, this is the next instruction
 179     /// to optimize. Transforms that can invalidate this should update it.
 180     BasicBlock::iterator CurInstIterator;
 181
 182     /// Keeps track of non-local addresses that have been sunk into a block.
 183     /// This allows us to avoid inserting duplicate code for blocks with
 184     /// multiple load/stores of the same address.
 185     ValueMap<Value*, Value*> SunkAddrs;
 186
 187     /// Keeps track of all instructions inserted for the current function.
 188     SetOfInstrs InsertedInsts;
 189     /// Keeps track of the type of the related instruction before their
 190     /// promotion for the current function.
 191     InstrToOrigTy PromotedInsts;
 192
 193     /// Keep track of instructions removed during promotion.
 194     SetOfInstrs RemovedInsts;
 195
 196     /// Keep track of sext chains based on their initial value.
 197     DenseMap<Value *, Instruction *> SeenChainsForSExt;
 198
 199     /// Keep track of SExt promoted.
 200     ValueToSExts ValToSExtendedUses;
 201
 202     /// True if CFG is modified in any way.
 203     bool ModifiedDT;
 204
 205     /// True if optimizing for size.
 206     bool OptSize;
 207
 208     /// DataLayout for the Function being processed.
 209     const DataLayout *DL;
 210
 211   public:
 212     static char ID; // Pass identification, replacement for typeid
 213     CodeGenPrepare()
 214         : FunctionPass(ID), TM(nullptr), TLI(nullptr), TTI(nullptr),
 215           DL(nullptr) {
 216       initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
 217     }
 218     bool runOnFunction(Function &F) override;
 219
 220     StringRef getPassName() const override { return "CodeGen Prepare"; }
 221
 222     void getAnalysisUsage(AnalysisUsage &AU) const override {
 223       // FIXME: When we can selectively preserve passes, preserve the domtree.
 224       AU.addRequired<ProfileSummaryInfoWrapperPass>();
 225       AU.addRequired<TargetLibraryInfoWrapperPass>();
 226       AU.addRequired<TargetTransformInfoWrapperPass>();
 227       AU.addRequired<LoopInfoWrapperPass>();
 228     }
 229
 230   private:
 231     bool eliminateFallThrough(Function &F);
 232     bool eliminateMostlyEmptyBlocks(Function &F);
 233     BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
 234     bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
 235     void eliminateMostlyEmptyBlock(BasicBlock *BB);
 236     bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
 237                                        bool isPreheader);
 238     bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
 239     bool optimizeInst(Instruction *I, bool &ModifiedDT);
 240     bool optimizeMemoryInst(Instruction *I, Value *Addr,
 241                             Type *AccessTy, unsigned AS);
 242     bool optimizeInlineAsmInst(CallInst *CS);
 243     bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
 244     bool optimizeExt(Instruction *&I);
 245     bool optimizeExtUses(Instruction *I);
 246     bool optimizeLoadExt(LoadInst *I);
 247     bool optimizeSelectInst(SelectInst *SI);
 248     bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
 249     bool optimizeSwitchInst(SwitchInst *CI);
 250     bool optimizeExtractElementInst(Instruction *Inst);
 251     bool dupRetToEnableTailCallOpts(BasicBlock *BB);
 252     bool placeDbgValues(Function &F);
 253     bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
 254                       LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
 255     bool tryToPromoteExts(TypePromotionTransaction &TPT,
 256                           const SmallVectorImpl<Instruction *> &Exts,
 257                           SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
 258                           unsigned CreatedInstsCost = 0);
 259     bool mergeSExts(Function &F);
 260     bool performAddressTypePromotion(
 261         Instruction *&Inst,
 262         bool AllowPromotionWithoutCommonHeader,
 263         bool HasPromoted, TypePromotionTransaction &TPT,
 264         SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
 265     bool splitBranchCondition(Function &F);
 266     bool simplifyOffsetableRelocate(Instruction &I);
 267     bool splitIndirectCriticalEdges(Function &F);
 268   };
 269 }
 270
 271 char CodeGenPrepare::ID = 0;
 272 INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE,
 273                       "Optimize for code generation", false, false)
 274 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
 275 INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE,
 276                     "Optimize for code generation", false, false)
 277
 278 FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }
 279
 280 bool CodeGenPrepare::runOnFunction(Function &F) {
 281   if (skipFunction(F))
 282     return false;
 283
 284   DL = &F.getParent()->getDataLayout();
 285
 286   bool EverMadeChange = false;
 287   // Clear per function information.
 288   InsertedInsts.clear();
 289   PromotedInsts.clear();
 290   BFI.reset();
 291   BPI.reset();
 292
 293   ModifiedDT = false;
 294   if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
 295     TM = &TPC->getTM<TargetMachine>();
 296     SubtargetInfo = TM->getSubtargetImpl(F);
 297     TLI = SubtargetInfo->getTargetLowering();
 298     TRI = SubtargetInfo->getRegisterInfo();
 299   }
 300   TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 301   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 302   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 303   OptSize = F.optForSize();
 304
 305   if (ProfileGuidedSectionPrefix) {
 306     ProfileSummaryInfo *PSI =
 307         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
 308     if (PSI->isFunctionHotInCallGraph(&F))
 309       F.setSectionPrefix(".hot");
 310     else if (PSI->isFunctionColdInCallGraph(&F))
 311       F.setSectionPrefix(".unlikely");
 312   }
 313
 314   /// This optimization identifies DIV instructions that can be
 315   /// profitably bypassed and carried out with a shorter, faster divide.
 316   if (!OptSize && TLI && TLI->isSlowDivBypassed()) {
 317     const DenseMap<unsigned int, unsigned int> &BypassWidths =
 318        TLI->getBypassSlowDivWidths();
 319     BasicBlock* BB = &*F.begin();
 320     while (BB != nullptr) {
 321       // bypassSlowDivision may create new BBs, but we don't want to reapply the
 322       // optimization to those blocks.
 323       BasicBlock* Next = BB->getNextNode();
 324       EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
 325       BB = Next;
 326     }
 327   }
 328
 329   // Eliminate blocks that contain only PHI nodes and an
 330   // unconditional branch.
 331   EverMadeChange |= eliminateMostlyEmptyBlocks(F);
 332
 333   // llvm.dbg.value is far away from the value then iSel may not be able
 334   // handle it properly. iSel will drop llvm.dbg.value if it can not
 335   // find a node corresponding to the value.
 336   EverMadeChange |= placeDbgValues(F);
 337
 338   if (!DisableBranchOpts)
 339     EverMadeChange |= splitBranchCondition(F);
 340
 341   // Split some critical edges where one of the sources is an indirect branch,
 342   // to help generate sane code for PHIs involving such edges.
 343   EverMadeChange |= splitIndirectCriticalEdges(F);
 344
 345   bool MadeChange = true;
 346   while (MadeChange) {
 347     MadeChange = false;
 348     SeenChainsForSExt.clear();
 349     ValToSExtendedUses.clear();
 350     RemovedInsts.clear();
 351     for (Function::iterator I = F.begin(); I != F.end(); ) {
 352       BasicBlock *BB = &*I++;
 353       bool ModifiedDTOnIteration = false;
 354       MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
 355
 356       // Restart BB iteration if the dominator tree of the Function was changed
 357       if (ModifiedDTOnIteration)
 358         break;
 359     }
 360     if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
 361       MadeChange |= mergeSExts(F);
 362
 363     // Really free removed instructions during promotion.
 364     for (Instruction *I : RemovedInsts)
 365       I->deleteValue();
 366
 367     EverMadeChange |= MadeChange;
 368   }
 369
 370   SunkAddrs.clear();
 371
 372   if (!DisableBranchOpts) {
 373     MadeChange = false;
 374     SmallPtrSet<BasicBlock*, 8> WorkList;
 375     for (BasicBlock &BB : F) {
 376       SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
 377       MadeChange |= ConstantFoldTerminator(&BB, true);
 378       if (!MadeChange) continue;
 379
 380       for (SmallVectorImpl<BasicBlock*>::iterator
 381              II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
 382         if (pred_begin(*II) == pred_end(*II))
 383           WorkList.insert(*II);
 384     }
 385
 386     // Delete the dead blocks and any of their dead successors.
 387     MadeChange |= !WorkList.empty();
 388     while (!WorkList.empty()) {
 389       BasicBlock *BB = *WorkList.begin();
 390       WorkList.erase(BB);
 391       SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
 392
 393       DeleteDeadBlock(BB);
 394
 395       for (SmallVectorImpl<BasicBlock*>::iterator
 396              II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
 397         if (pred_begin(*II) == pred_end(*II))
 398           WorkList.insert(*II);
 399     }
 400
 401     // Merge pairs of basic blocks with unconditional branches, connected by
 402     // a single edge.
 403     if (EverMadeChange || MadeChange)
 404       MadeChange |= eliminateFallThrough(F);
 405
 406     EverMadeChange |= MadeChange;
 407   }
 408
 409   if (!DisableGCOpts) {
 410     SmallVector<Instruction *, 2> Statepoints;
 411     for (BasicBlock &BB : F)
 412       for (Instruction &I : BB)
 413         if (isStatepoint(I))
 414           Statepoints.push_back(&I);
 415     for (auto &I : Statepoints)
 416       EverMadeChange |= simplifyOffsetableRelocate(*I);
 417   }
 418
 419   return EverMadeChange;
 420 }
 421
 422 /// Merge basic blocks which are connected by a single edge, where one of the
 423 /// basic blocks has a single successor pointing to the other basic block,
 424 /// which has a single predecessor.
 425 bool CodeGenPrepare::eliminateFallThrough(Function &F) {
 426   bool Changed = false;
 427   // Scan all of the blocks in the function, except for the entry block.
 428   for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
 429     BasicBlock *BB = &*I++;
 430     // If the destination block has a single pred, then this is a trivial
 431     // edge, just collapse it.
 432     BasicBlock *SinglePred = BB->getSinglePredecessor();
 433
 434     // Don't merge if BB's address is taken.
 435     if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue;
 436
 437     BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
 438     if (Term && !Term->isConditional()) {
 439       Changed = true;
 440       DEBUG(dbgs() << "To merge:\n"<< *SinglePred << "\n\n\n");
 441       // Remember if SinglePred was the entry block of the function.
 442       // If so, we will need to move BB back to the entry position.
 443       bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
 444       MergeBasicBlockIntoOnlyPred(BB, nullptr);
 445
 446       if (isEntry && BB != &BB->getParent()->getEntryBlock())
 447         BB->moveBefore(&BB->getParent()->getEntryBlock());
 448
 449       // We have erased a block. Update the iterator.
 450       I = BB->getIterator();
 451     }
 452   }
 453   return Changed;
 454 }
 455
 456 /// Find a destination block from BB if BB is mergeable empty block.
 457 BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
 458   // If this block doesn't end with an uncond branch, ignore it.
 459   BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
 460   if (!BI || !BI->isUnconditional())
 461     return nullptr;
 462
 463   // If the instruction before the branch (skipping debug info) isn't a phi
 464   // node, then other stuff is happening here.
 465   BasicBlock::iterator BBI = BI->getIterator();
 466   if (BBI != BB->begin()) {
 467     --BBI;
 468     while (isa<DbgInfoIntrinsic>(BBI)) {
 469       if (BBI == BB->begin())
 470         break;
 471       --BBI;
 472     }
 473     if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
 474       return nullptr;
 475   }
 476
 477   // Do not break infinite loops.
 478   BasicBlock *DestBB = BI->getSuccessor(0);
 479   if (DestBB == BB)
 480     return nullptr;
 481
 482   if (!canMergeBlocks(BB, DestBB))
 483     DestBB = nullptr;
 484
 485   return DestBB;
 486 }
 487
 488 // Return the unique indirectbr predecessor of a block. This may return null
 489 // even if such a predecessor exists, if it's not useful for splitting.
 490 // If a predecessor is found, OtherPreds will contain all other (non-indirectbr)
 491 // predecessors of BB.
 492 static BasicBlock *
 493 findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) {
 494   // If the block doesn't have any PHIs, we don't care about it, since there's
 495   // no point in splitting it.
 496   PHINode *PN = dyn_cast<PHINode>(BB->begin());
 497   if (!PN)
 498     return nullptr;
 499
 500   // Verify we have exactly one IBR predecessor.
 501   // Conservatively bail out if one of the other predecessors is not a "regular"
 502   // terminator (that is, not a switch or a br).
 503   BasicBlock *IBB = nullptr;
 504   for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) {
 505     BasicBlock *PredBB = PN->getIncomingBlock(Pred);
 506     TerminatorInst *PredTerm = PredBB->getTerminator();
 507     switch (PredTerm->getOpcode()) {
 508     case Instruction::IndirectBr:
 509       if (IBB)
 510         return nullptr;
 511       IBB = PredBB;
 512       break;
 513     case Instruction::Br:
 514     case Instruction::Switch:
 515       OtherPreds.push_back(PredBB);
 516       continue;
 517     default:
 518       return nullptr;
 519     }
 520   }
 521
 522   return IBB;
 523 }
 524
 525 // Split critical edges where the source of the edge is an indirectbr
 526 // instruction. This isn't always possible, but we can handle some easy cases.
 527 // This is useful because MI is unable to split such critical edges,
 528 // which means it will not be able to sink instructions along those edges.
 529 // This is especially painful for indirect branches with many successors, where
 530 // we end up having to prepare all outgoing values in the origin block.
 531 //
 532 // Our normal algorithm for splitting critical edges requires us to update
 533 // the outgoing edges of the edge origin block, but for an indirectbr this
 534 // is hard, since it would require finding and updating the block addresses
 535 // the indirect branch uses. But if a block only has a single indirectbr
 536 // predecessor, with the others being regular branches, we can do it in a
 537 // different way.
 538 // Say we have A -> D, B -> D, I -> D where only I -> D is an indirectbr.
 539 // We can split D into D0 and D1, where D0 contains only the PHIs from D,
 540 // and D1 is the D block body. We can then duplicate D0 as D0A and D0B, and
 541 // create the following structure:
 542 // A -> D0A, B -> D0A, I -> D0B, D0A -> D1, D0B -> D1
 543 bool CodeGenPrepare::splitIndirectCriticalEdges(Function &F) {
 544   // Check whether the function has any indirectbrs, and collect which blocks
 545   // they may jump to. Since most functions don't have indirect branches,
 546   // this lowers the common case's overhead to O(Blocks) instead of O(Edges).
 547   SmallSetVector<BasicBlock *, 16> Targets;
 548   for (auto &BB : F) {
 549     auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator());
 550     if (!IBI)
 551       continue;
 552
 553     for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ)
 554       Targets.insert(IBI->getSuccessor(Succ));
 555   }
 556
 557   if (Targets.empty())
 558     return false;
 559
 560   bool Changed = false;
 561   for (BasicBlock *Target : Targets) {
 562     SmallVector<BasicBlock *, 16> OtherPreds;
 563     BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds);
 564     // If we did not found an indirectbr, or the indirectbr is the only
 565     // incoming edge, this isn't the kind of edge we're looking for.
 566     if (!IBRPred || OtherPreds.empty())
 567       continue;
 568
 569     // Don't even think about ehpads/landingpads.
 570     Instruction *FirstNonPHI = Target->getFirstNonPHI();
 571     if (FirstNonPHI->isEHPad() || Target->isLandingPad())
 572       continue;
 573
 574     BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split");
 575     // It's possible Target was its own successor through an indirectbr.
 576     // In this case, the indirectbr now comes from BodyBlock.
 577     if (IBRPred == Target)
 578       IBRPred = BodyBlock;
 579
 580     // At this point Target only has PHIs, and BodyBlock has the rest of the
 581     // block's body. Create a copy of Target that will be used by the "direct"
 582     // preds.
 583     ValueToValueMapTy VMap;
 584     BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F);
 585
 586     for (BasicBlock *Pred : OtherPreds) {
 587       // If the target is a loop to itself, then the terminator of the split
 588       // block needs to be updated.
 589       if (Pred == Target)
 590         BodyBlock->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
 591       else
 592         Pred->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
 593     }
 594
 595     // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that
 596     // they are clones, so the number of PHIs are the same.
 597     // (a) Remove the edge coming from IBRPred from the "Direct" PHI
 598     // (b) Leave that as the only edge in the "Indirect" PHI.
 599     // (c) Merge the two in the body block.
 600     BasicBlock::iterator Indirect = Target->begin(),
 601                          End = Target->getFirstNonPHI()->getIterator();
 602     BasicBlock::iterator Direct = DirectSucc->begin();
 603     BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt();
 604
 605     assert(&*End == Target->getTerminator() &&
 606            "Block was expected to only contain PHIs");
 607
 608     while (Indirect != End) {
 609       PHINode *DirPHI = cast<PHINode>(Direct);
 610       PHINode *IndPHI = cast<PHINode>(Indirect);
 611
 612       // Now, clean up - the direct block shouldn't get the indirect value,
 613       // and vice versa.
 614       DirPHI->removeIncomingValue(IBRPred);
 615       Direct++;
 616
 617       // Advance the pointer here, to avoid invalidation issues when the old
 618       // PHI is erased.
 619       Indirect++;
 620
 621       PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI);
 622       NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred),
 623                              IBRPred);
 624
 625       // Create a PHI in the body block, to merge the direct and indirect
 626       // predecessors.
 627       PHINode *MergePHI =
 628           PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert);
 629       MergePHI->addIncoming(NewIndPHI, Target);
 630       MergePHI->addIncoming(DirPHI, DirectSucc);
 631
 632       IndPHI->replaceAllUsesWith(MergePHI);
 633       IndPHI->eraseFromParent();
 634     }
 635
 636     Changed = true;
 637   }
 638
 639   return Changed;
 640 }
 641
 642 /// Eliminate blocks that contain only PHI nodes, debug info directives, and an
 643 /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
 644 /// edges in ways that are non-optimal for isel. Start by eliminating these
 645 /// blocks so we can split them the way we want them.
 646 bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
 647   SmallPtrSet<BasicBlock *, 16> Preheaders;
 648   SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());
 649   while (!LoopList.empty()) {
 650     Loop *L = LoopList.pop_back_val();
 651     LoopList.insert(LoopList.end(), L->begin(), L->end());
 652     if (BasicBlock *Preheader = L->getLoopPreheader())
 653       Preheaders.insert(Preheader);
 654   }
 655
 656   bool MadeChange = false;
 657   // Note that this intentionally skips the entry block.
 658   for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
 659     BasicBlock *BB = &*I++;
 660     BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);
 661     if (!DestBB ||
 662         !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB)))
 663       continue;
 664
 665     eliminateMostlyEmptyBlock(BB);
 666     MadeChange = true;
 667   }
 668   return MadeChange;
 669 }
 670
 671 bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
 672                                                    BasicBlock *DestBB,
 673                                                    bool isPreheader) {
 674   // Do not delete loop preheaders if doing so would create a critical edge.
 675   // Loop preheaders can be good locations to spill registers. If the
 676   // preheader is deleted and we create a critical edge, registers may be
 677   // spilled in the loop body instead.
 678   if (!DisablePreheaderProtect && isPreheader &&
 679       !(BB->getSinglePredecessor() &&
 680         BB->getSinglePredecessor()->getSingleSuccessor()))
 681     return false;
 682
 683   // Try to skip merging if the unique predecessor of BB is terminated by a
 684   // switch or indirect branch instruction, and BB is used as an incoming block
 685   // of PHIs in DestBB. In such case, merging BB and DestBB would cause ISel to
 686   // add COPY instructions in the predecessor of BB instead of BB (if it is not
 687   // merged). Note that the critical edge created by merging such blocks wont be
 688   // split in MachineSink because the jump table is not analyzable. By keeping
 689   // such empty block (BB), ISel will place COPY instructions in BB, not in the
 690   // predecessor of BB.
 691   BasicBlock *Pred = BB->getUniquePredecessor();
 692   if (!Pred ||
 693       !(isa<SwitchInst>(Pred->getTerminator()) ||
 694         isa<IndirectBrInst>(Pred->getTerminator())))
 695     return true;
 696
 697   if (BB->getTerminator() != BB->getFirstNonPHI())
 698     return true;
 699
 700   // We use a simple cost heuristic which determine skipping merging is
 701   // profitable if the cost of skipping merging is less than the cost of
 702   // merging : Cost(skipping merging) < Cost(merging BB), where the
 703   // Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)), and
 704   // the Cost(merging BB) is Freq(Pred) * Cost(Copy).
 705   // Assuming Cost(Copy) == Cost(Branch), we could simplify it to :
 706   //   Freq(Pred) / Freq(BB) > 2.
 707   // Note that if there are multiple empty blocks sharing the same incoming
 708   // value for the PHIs in the DestBB, we consider them together. In such
 709   // case, Cost(merging BB) will be the sum of their frequencies.
 710
 711   if (!isa<PHINode>(DestBB->begin()))
 712     return true;
 713
 714   SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;
 715
 716   // Find all other incoming blocks from which incoming values of all PHIs in
 717   // DestBB are the same as the ones from BB.
 718   for (pred_iterator PI = pred_begin(DestBB), E = pred_end(DestBB); PI != E;
 719        ++PI) {
 720     BasicBlock *DestBBPred = *PI;
 721     if (DestBBPred == BB)
 722       continue;
 723
 724     bool HasAllSameValue = true;
 725     BasicBlock::const_iterator DestBBI = DestBB->begin();
 726     while (const PHINode *DestPN = dyn_cast<PHINode>(DestBBI++)) {
 727       if (DestPN->getIncomingValueForBlock(BB) !=
 728           DestPN->getIncomingValueForBlock(DestBBPred)) {
 729         HasAllSameValue = false;
 730         break;
 731       }
 732     }
 733     if (HasAllSameValue)
 734       SameIncomingValueBBs.insert(DestBBPred);
 735   }
 736
 737   // See if all BB's incoming values are same as the value from Pred. In this
 738   // case, no reason to skip merging because COPYs are expected to be place in
 739   // Pred already.
 740   if (SameIncomingValueBBs.count(Pred))
 741     return true;
 742
 743   if (!BFI) {
 744     Function &F = *BB->getParent();
 745     LoopInfo LI{DominatorTree(F)};
 746     BPI.reset(new BranchProbabilityInfo(F, LI));
 747     BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
 748   }
 749
 750   BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
 751   BlockFrequency BBFreq = BFI->getBlockFreq(BB);
 752
 753   for (auto SameValueBB : SameIncomingValueBBs)
 754     if (SameValueBB->getUniquePredecessor() == Pred &&
 755         DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))
 756       BBFreq += BFI->getBlockFreq(SameValueBB);
 757
 758   return PredFreq.getFrequency() <=
 759          BBFreq.getFrequency() * FreqRatioToSkipMerge;
 760 }
 761
 762 /// Return true if we can merge BB into DestBB if there is a single
 763 /// unconditional branch between them, and BB contains no other non-phi
 764 /// instructions.
 765 bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
 766                                     const BasicBlock *DestBB) const {
 767   // We only want to eliminate blocks whose phi nodes are used by phi nodes in
 768   // the successor.  If there are more complex condition (e.g. preheaders),
 769   // don't mess around with them.
 770   BasicBlock::const_iterator BBI = BB->begin();
 771   while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
 772     for (const User *U : PN->users()) {
 773       const Instruction *UI = cast<Instruction>(U);
 774       if (UI->getParent() != DestBB || !isa<PHINode>(UI))
 775         return false;
 776       // If User is inside DestBB block and it is a PHINode then check
 777       // incoming value. If incoming value is not from BB then this is
 778       // a complex condition (e.g. preheaders) we want to avoid here.
 779       if (UI->getParent() == DestBB) {
 780         if (const PHINode *UPN = dyn_cast<PHINode>(UI))
 781           for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
 782             Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
 783             if (Insn && Insn->getParent() == BB &&
 784                 Insn->getParent() != UPN->getIncomingBlock(I))
 785               return false;
 786           }
 787       }
 788     }
 789   }
 790
 791   // If BB and DestBB contain any common predecessors, then the phi nodes in BB
 792   // and DestBB may have conflicting incoming values for the block.  If so, we
 793   // can't merge the block.
 794   const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
 795   if (!DestBBPN) return true;  // no conflict.
 796
 797   // Collect the preds of BB.
 798   SmallPtrSet<const BasicBlock*, 16> BBPreds;
 799   if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
 800     // It is faster to get preds from a PHI than with pred_iterator.
 801     for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
 802       BBPreds.insert(BBPN->getIncomingBlock(i));
 803   } else {
 804     BBPreds.insert(pred_begin(BB), pred_end(BB));
 805   }
 806
 807   // Walk the preds of DestBB.
 808   for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
 809     BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
 810     if (BBPreds.count(Pred)) {   // Common predecessor?
 811       BBI = DestBB->begin();
 812       while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
 813         const Value *V1 = PN->getIncomingValueForBlock(Pred);
 814         const Value *V2 = PN->getIncomingValueForBlock(BB);
 815
 816         // If V2 is a phi node in BB, look up what the mapped value will be.
 817         if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
 818           if (V2PN->getParent() == BB)
 819             V2 = V2PN->getIncomingValueForBlock(Pred);
 820
 821         // If there is a conflict, bail out.
 822         if (V1 != V2) return false;
 823       }
 824     }
 825   }
 826
 827   return true;
 828 }
 829
 830
 831 /// Eliminate a basic block that has only phi's and an unconditional branch in
 832 /// it.
 833 void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
 834   BranchInst *BI = cast<BranchInst>(BB->getTerminator());
 835   BasicBlock *DestBB = BI->getSuccessor(0);
 836
 837   DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB);
 838
 839   // If the destination block has a single pred, then this is a trivial edge,
 840   // just collapse it.
 841   if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
 842     if (SinglePred != DestBB) {
 843       // Remember if SinglePred was the entry block of the function.  If so, we
 844       // will need to move BB back to the entry position.
 845       bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
 846       MergeBasicBlockIntoOnlyPred(DestBB, nullptr);
 847
 848       if (isEntry && BB != &BB->getParent()->getEntryBlock())
 849         BB->moveBefore(&BB->getParent()->getEntryBlock());
 850
 851       DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
 852       return;
 853     }
 854   }
 855
 856   // Otherwise, we have multiple predecessors of BB.  Update the PHIs in DestBB
 857   // to handle the new incoming edges it is about to have.
 858   PHINode *PN;
 859   for (BasicBlock::iterator BBI = DestBB->begin();
 860        (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
 861     // Remove the incoming value for BB, and remember it.
 862     Value *InVal = PN->removeIncomingValue(BB, false);
 863
 864     // Two options: either the InVal is a phi node defined in BB or it is some
 865     // value that dominates BB.
 866     PHINode *InValPhi = dyn_cast<PHINode>(InVal);
 867     if (InValPhi && InValPhi->getParent() == BB) {
 868       // Add all of the input values of the input PHI as inputs of this phi.
 869       for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
 870         PN->addIncoming(InValPhi->getIncomingValue(i),
 871                         InValPhi->getIncomingBlock(i));
 872     } else {
 873       // Otherwise, add one instance of the dominating value for each edge that
 874       // we will be adding.
 875       if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
 876         for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
 877           PN->addIncoming(InVal, BBPN->getIncomingBlock(i));
 878       } else {
 879         for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
 880           PN->addIncoming(InVal, *PI);
 881       }
 882     }
 883   }
 884
 885   // The PHIs are now updated, change everything that refers to BB to use
 886   // DestBB and remove BB.
 887   BB->replaceAllUsesWith(DestBB);
 888   BB->eraseFromParent();
 889   ++NumBlocksElim;
 890
 891   DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
 892 }
 893
 894 // Computes a map of base pointer relocation instructions to corresponding
 895 // derived pointer relocation instructions given a vector of all relocate calls
 896 static void computeBaseDerivedRelocateMap(
 897     const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
 898     DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
 899         &RelocateInstMap) {
 900   // Collect information in two maps: one primarily for locating the base object
 901   // while filling the second map; the second map is the final structure holding
 902   // a mapping between Base and corresponding Derived relocate calls
 903   DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
 904   for (auto *ThisRelocate : AllRelocateCalls) {
 905     auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
 906                             ThisRelocate->getDerivedPtrIndex());
 907     RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
 908   }
 909   for (auto &Item : RelocateIdxMap) {
 910     std::pair<unsigned, unsigned> Key = Item.first;
 911     if (Key.first == Key.second)
 912       // Base relocation: nothing to insert
 913       continue;
 914
 915     GCRelocateInst *I = Item.second;
 916     auto BaseKey = std::make_pair(Key.first, Key.first);
 917
 918     // We're iterating over RelocateIdxMap so we cannot modify it.
 919     auto MaybeBase = RelocateIdxMap.find(BaseKey);
 920     if (MaybeBase == RelocateIdxMap.end())
 921       // TODO: We might want to insert a new base object relocate and gep off
 922       // that, if there are enough derived object relocates.
 923       continue;
 924
 925     RelocateInstMap[MaybeBase->second].push_back(I);
 926   }
 927 }
 928
 929 // Accepts a GEP and extracts the operands into a vector provided they're all
 930 // small integer constants
 931 static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
 932                                           SmallVectorImpl<Value *> &OffsetV) {
 933   for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
 934     // Only accept small constant integer operands
 935     auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
 936     if (!Op || Op->getZExtValue() > 20)
 937       return false;
 938   }
 939
 940   for (unsigned i = 1; i < GEP->getNumOperands(); i++)
 941     OffsetV.push_back(GEP->getOperand(i));
 942   return true;
 943 }
 944
 945 // Takes a RelocatedBase (base pointer relocation instruction) and Targets to
 946 // replace, computes a replacement, and affects it.
 947 static bool
 948 simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
 949                           const SmallVectorImpl<GCRelocateInst *> &Targets) {
 950   bool MadeChange = false;
 951   for (GCRelocateInst *ToReplace : Targets) {
 952     assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
 953            "Not relocating a derived object of the original base object");
 954     if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
 955       // A duplicate relocate call. TODO: coalesce duplicates.
 956       continue;
 957     }
 958
 959     if (RelocatedBase->getParent() != ToReplace->getParent()) {
 960       // Base and derived relocates are in different basic blocks.
 961       // In this case transform is only valid when base dominates derived
 962       // relocate. However it would be too expensive to check dominance
 963       // for each such relocate, so we skip the whole transformation.
 964       continue;
 965     }
 966
 967     Value *Base = ToReplace->getBasePtr();
 968     auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
 969     if (!Derived || Derived->getPointerOperand() != Base)
 970       continue;
 971
 972     SmallVector<Value *, 2> OffsetV;
 973     if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
 974       continue;
 975
 976     // Create a Builder and replace the target callsite with a gep
 977     assert(RelocatedBase->getNextNode() &&
 978            "Should always have one since it's not a terminator");
 979
 980     // Insert after RelocatedBase
 981     IRBuilder<> Builder(RelocatedBase->getNextNode());
 982     Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
 983
 984     // If gc_relocate does not match the actual type, cast it to the right type.
 985     // In theory, there must be a bitcast after gc_relocate if the type does not
 986     // match, and we should reuse it to get the derived pointer. But it could be
 987     // cases like this:
 988     // bb1:
 989     //  ...
 990     //  %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
 991     //  br label %merge
 992     //
 993     // bb2:
 994     //  ...
 995     //  %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
 996     //  br label %merge
 997     //
 998     // merge:
 999     //  %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
1000     //  %cast = bitcast i8 addrspace(1)* %p1 in to i32 addrspace(1)*
1001     //
1002     // In this case, we can not find the bitcast any more. So we insert a new bitcast
1003     // no matter there is already one or not. In this way, we can handle all cases, and
1004     // the extra bitcast should be optimized away in later passes.
1005     Value *ActualRelocatedBase = RelocatedBase;
1006     if (RelocatedBase->getType() != Base->getType()) {
1007       ActualRelocatedBase =
1008           Builder.CreateBitCast(RelocatedBase, Base->getType());
1009     }
1010     Value *Replacement = Builder.CreateGEP(
1011         Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV));
1012     Replacement->takeName(ToReplace);
1013     // If the newly generated derived pointer's type does not match the original derived
1014     // pointer's type, cast the new derived pointer to match it. Same reasoning as above.
1015     Value *ActualReplacement = Replacement;
1016     if (Replacement->getType() != ToReplace->getType()) {
1017       ActualReplacement =
1018           Builder.CreateBitCast(Replacement, ToReplace->getType());
1019     }
1020     ToReplace->replaceAllUsesWith(ActualReplacement);
1021     ToReplace->eraseFromParent();
1022
1023     MadeChange = true;
1024   }
1025   return MadeChange;
1026 }
1027
1028 // Turns this:
1029 //
1030 // %base = ...
1031 // %ptr = gep %base + 15
1032 // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
1033 // %base' = relocate(%tok, i32 4, i32 4)
1034 // %ptr' = relocate(%tok, i32 4, i32 5)
1035 // %val = load %ptr'
1036 //
1037 // into this:
1038 //
1039 // %base = ...
1040 // %ptr = gep %base + 15
1041 // %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
1042 // %base' = gc.relocate(%tok, i32 4, i32 4)
1043 // %ptr' = gep %base' + 15
1044 // %val = load %ptr'
1045 bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
1046   bool MadeChange = false;
1047   SmallVector<GCRelocateInst *, 2> AllRelocateCalls;
1048
1049   for (auto *U : I.users())
1050     if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
1051       // Collect all the relocate calls associated with a statepoint
1052       AllRelocateCalls.push_back(Relocate);
1053
1054   // We need atleast one base pointer relocation + one derived pointer
1055   // relocation to mangle
1056   if (AllRelocateCalls.size() < 2)
1057     return false;
1058
1059   // RelocateInstMap is a mapping from the base relocate instruction to the
1060   // corresponding derived relocate instructions
1061   DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
1062   computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
1063   if (RelocateInstMap.empty())
1064     return false;
1065
1066   for (auto &Item : RelocateInstMap)
1067     // Item.first is the RelocatedBase to offset against
1068     // Item.second is the vector of Targets to replace
1069     MadeChange = simplifyRelocatesOffABase(Item.first, Item.second);
1070   return MadeChange;
1071 }
1072
1073 /// SinkCast - Sink the specified cast instruction into its user blocks
1074 static bool SinkCast(CastInst *CI) {
1075   BasicBlock *DefBB = CI->getParent();
1076
1077   /// InsertedCasts - Only insert a cast in each block once.
1078   DenseMap<BasicBlock*, CastInst*> InsertedCasts;
1079
1080   bool MadeChange = false;
1081   for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
1082        UI != E; ) {
1083     Use &TheUse = UI.getUse();
1084     Instruction *User = cast<Instruction>(*UI);
1085
1086     // Figure out which BB this cast is used in.  For PHI's this is the
1087     // appropriate predecessor block.
1088     BasicBlock *UserBB = User->getParent();
1089     if (PHINode *PN = dyn_cast<PHINode>(User)) {
1090       UserBB = PN->getIncomingBlock(TheUse);
1091     }
1092
1093     // Preincrement use iterator so we don't invalidate it.
1094     ++UI;
1095
1096     // The first insertion point of a block containing an EH pad is after the
1097     // pad.  If the pad is the user, we cannot sink the cast past the pad.
1098     if (User->isEHPad())
1099       continue;
1100
1101     // If the block selected to receive the cast is an EH pad that does not
1102     // allow non-PHI instructions before the terminator, we can't sink the
1103     // cast.
1104     if (UserBB->getTerminator()->isEHPad())
1105       continue;
1106
1107     // If this user is in the same block as the cast, don't change the cast.
1108     if (UserBB == DefBB) continue;
1109
1110     // If we have already inserted a cast into this block, use it.
1111     CastInst *&InsertedCast = InsertedCasts[UserBB];
1112
1113     if (!InsertedCast) {
1114       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1115       assert(InsertPt != UserBB->end());
1116       InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
1117                                       CI->getType(), "", &*InsertPt);
1118     }
1119
1120     // Replace a use of the cast with a use of the new cast.
1121     TheUse = InsertedCast;
1122     MadeChange = true;
1123     ++NumCastUses;
1124   }
1125
1126   // If we removed all uses, nuke the cast.
1127   if (CI->use_empty()) {
1128     CI->eraseFromParent();
1129     MadeChange = true;
1130   }
1131
1132   return MadeChange;
1133 }
1134
1135 /// If the specified cast instruction is a noop copy (e.g. it's casting from
1136 /// one pointer type to another, i32->i8 on PPC), sink it into user blocks to
1137 /// reduce the number of virtual registers that must be created and coalesced.
1138 ///
1139 /// Return true if any changes are made.
1140 ///
1141 static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
1142                                        const DataLayout &DL) {
1143   // Sink only "cheap" (or nop) address-space casts.  This is a weaker condition
1144   // than sinking only nop casts, but is helpful on some platforms.
1145   if (auto *ASC = dyn_cast<AddrSpaceCastInst>(CI)) {
1146     if (!TLI.isCheapAddrSpaceCast(ASC->getSrcAddressSpace(),
1147                                   ASC->getDestAddressSpace()))
1148       return false;
1149   }
1150
1151   // If this is a noop copy,
1152   EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
1153   EVT DstVT = TLI.getValueType(DL, CI->getType());
1154
1155   // This is an fp<->int conversion?
1156   if (SrcVT.isInteger() != DstVT.isInteger())
1157     return false;
1158
1159   // If this is an extension, it will be a zero or sign extension, which
1160   // isn't a noop.
1161   if (SrcVT.bitsLT(DstVT)) return false;
1162
1163   // If these values will be promoted, find out what they will be promoted
1164   // to.  This helps us consider truncates on PPC as noop copies when they
1165   // are.
1166   if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
1167       TargetLowering::TypePromoteInteger)
1168     SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
1169   if (TLI.getTypeAction(CI->getContext(), DstVT) ==
1170       TargetLowering::TypePromoteInteger)
1171     DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
1172
1173   // If, after promotion, these are the same types, this is a noop copy.
1174   if (SrcVT != DstVT)
1175     return false;
1176
1177   return SinkCast(CI);
1178 }
1179
1180 /// Try to combine CI into a call to the llvm.uadd.with.overflow intrinsic if
1181 /// possible.
1182 ///
1183 /// Return true if any changes were made.
1184 static bool CombineUAddWithOverflow(CmpInst *CI) {
1185   Value *A, *B;
1186   Instruction *AddI;
1187   if (!match(CI,
1188              m_UAddWithOverflow(m_Value(A), m_Value(B), m_Instruction(AddI))))
1189     return false;
1190
1191   Type *Ty = AddI->getType();
1192   if (!isa<IntegerType>(Ty))
1193     return false;
1194
1195   // We don't want to move around uses of condition values this late, so we we
1196   // check if it is legal to create the call to the intrinsic in the basic
1197   // block containing the icmp:
1198
1199   if (AddI->getParent() != CI->getParent() && !AddI->hasOneUse())
1200     return false;
1201
1202 #ifndef NDEBUG
1203   // Someday m_UAddWithOverflow may get smarter, but this is a safe assumption
1204   // for now:
1205   if (AddI->hasOneUse())
1206     assert(*AddI->user_begin() == CI && "expected!");
1207 #endif
1208
1209   Module *M = CI->getModule();
1210   Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty);
1211
1212   auto *InsertPt = AddI->hasOneUse() ? CI : AddI;
1213
1214   auto *UAddWithOverflow =
1215       CallInst::Create(F, {A, B}, "uadd.overflow", InsertPt);
1216   auto *UAdd = ExtractValueInst::Create(UAddWithOverflow, 0, "uadd", InsertPt);
1217   auto *Overflow =
1218       ExtractValueInst::Create(UAddWithOverflow, 1, "overflow", InsertPt);
1219
1220   CI->replaceAllUsesWith(Overflow);
1221   AddI->replaceAllUsesWith(UAdd);
1222   CI->eraseFromParent();
1223   AddI->eraseFromParent();
1224   return true;
1225 }
1226
1227 /// Sink the given CmpInst into user blocks to reduce the number of virtual
1228 /// registers that must be created and coalesced. This is a clear win except on
1229 /// targets with multiple condition code registers (PowerPC), where it might
1230 /// lose; some adjustment may be wanted there.
1231 ///
1232 /// Return true if any changes are made.
1233 static bool SinkCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
1234   BasicBlock *DefBB = CI->getParent();
1235
1236   // Avoid sinking soft-FP comparisons, since this can move them into a loop.
1237   if (TLI && TLI->useSoftFloat() && isa<FCmpInst>(CI))
1238     return false;
1239
1240   // Only insert a cmp in each block once.
1241   DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
1242
1243   bool MadeChange = false;
1244   for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
1245        UI != E; ) {
1246     Use &TheUse = UI.getUse();
1247     Instruction *User = cast<Instruction>(*UI);
1248
1249     // Preincrement use iterator so we don't invalidate it.
1250     ++UI;
1251
1252     // Don't bother for PHI nodes.
1253     if (isa<PHINode>(User))
1254       continue;
1255
1256     // Figure out which BB this cmp is used in.
1257     BasicBlock *UserBB = User->getParent();
1258
1259     // If this user is in the same block as the cmp, don't change the cmp.
1260     if (UserBB == DefBB) continue;
1261
1262     // If we have already inserted a cmp into this block, use it.
1263     CmpInst *&InsertedCmp = InsertedCmps[UserBB];
1264
1265     if (!InsertedCmp) {
1266       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1267       assert(InsertPt != UserBB->end());
1268       InsertedCmp =
1269           CmpInst::Create(CI->getOpcode(), CI->getPredicate(),
1270                           CI->getOperand(0), CI->getOperand(1), "", &*InsertPt);
1271       // Propagate the debug info.
1272       InsertedCmp->setDebugLoc(CI->getDebugLoc());
1273     }
1274
1275     // Replace a use of the cmp with a use of the new cmp.
1276     TheUse = InsertedCmp;
1277     MadeChange = true;
1278     ++NumCmpUses;
1279   }
1280
1281   // If we removed all uses, nuke the cmp.
1282   if (CI->use_empty()) {
1283     CI->eraseFromParent();
1284     MadeChange = true;
1285   }
1286
1287   return MadeChange;
1288 }
1289
1290 static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
1291   if (SinkCmpExpression(CI, TLI))
1292     return true;
1293
1294   if (CombineUAddWithOverflow(CI))
1295     return true;
1296
1297   return false;
1298 }
1299
1300 /// Duplicate and sink the given 'and' instruction into user blocks where it is
1301 /// used in a compare to allow isel to generate better code for targets where
1302 /// this operation can be combined.
1303 ///
1304 /// Return true if any changes are made.
1305 static bool sinkAndCmp0Expression(Instruction *AndI,
1306                                   const TargetLowering &TLI,
1307                                   SetOfInstrs &InsertedInsts) {
1308   // Double-check that we're not trying to optimize an instruction that was
1309   // already optimized by some other part of this pass.
1310   assert(!InsertedInsts.count(AndI) &&
1311          "Attempting to optimize already optimized and instruction");
1312   (void) InsertedInsts;
1313
1314   // Nothing to do for single use in same basic block.
1315   if (AndI->hasOneUse() &&
1316       AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
1317     return false;
1318
1319   // Try to avoid cases where sinking/duplicating is likely to increase register
1320   // pressure.
1321   if (!isa<ConstantInt>(AndI->getOperand(0)) &&
1322       !isa<ConstantInt>(AndI->getOperand(1)) &&
1323       AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
1324     return false;
1325
1326   for (auto *U : AndI->users()) {
1327     Instruction *User = cast<Instruction>(U);
1328
1329     // Only sink for and mask feeding icmp with 0.
1330     if (!isa<ICmpInst>(User))
1331       return false;
1332
1333     auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
1334     if (!CmpC || !CmpC->isZero())
1335       return false;
1336   }
1337
1338   if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
1339     return false;
1340
1341   DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
1342   DEBUG(AndI->getParent()->dump());
1343
1344   // Push the 'and' into the same block as the icmp 0.  There should only be
1345   // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
1346   // others, so we don't need to keep track of which BBs we insert into.
1347   for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
1348        UI != E; ) {
1349     Use &TheUse = UI.getUse();
1350     Instruction *User = cast<Instruction>(*UI);
1351
1352     // Preincrement use iterator so we don't invalidate it.
1353     ++UI;
1354
1355     DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
1356
1357     // Keep the 'and' in the same place if the use is already in the same block.
1358     Instruction *InsertPt =
1359         User->getParent() == AndI->getParent() ? AndI : User;
1360     Instruction *InsertedAnd =
1361         BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
1362                                AndI->getOperand(1), "", InsertPt);
1363     // Propagate the debug info.
1364     InsertedAnd->setDebugLoc(AndI->getDebugLoc());
1365
1366     // Replace a use of the 'and' with a use of the new 'and'.
1367     TheUse = InsertedAnd;
1368     ++NumAndUses;
1369     DEBUG(User->getParent()->dump());
1370   }
1371
1372   // We removed all uses, nuke the and.
1373   AndI->eraseFromParent();
1374   return true;
1375 }
1376
1377 /// Check if the candidates could be combined with a shift instruction, which
1378 /// includes:
1379 /// 1. Truncate instruction
1380 /// 2. And instruction and the imm is a mask of the low bits:
1381 /// imm & (imm+1) == 0
1382 static bool isExtractBitsCandidateUse(Instruction *User) {
1383   if (!isa<TruncInst>(User)) {
1384     if (User->getOpcode() != Instruction::And ||
1385         !isa<ConstantInt>(User->getOperand(1)))
1386       return false;
1387
1388     const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue();
1389
1390     if ((Cimm & (Cimm + 1)).getBoolValue())
1391       return false;
1392   }
1393   return true;
1394 }
1395
1396 /// Sink both shift and truncate instruction to the use of truncate's BB.
1397 static bool
1398 SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
1399                      DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
1400                      const TargetLowering &TLI, const DataLayout &DL) {
1401   BasicBlock *UserBB = User->getParent();
1402   DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
1403   TruncInst *TruncI = dyn_cast<TruncInst>(User);
1404   bool MadeChange = false;
1405
1406   for (Value::user_iterator TruncUI = TruncI->user_begin(),
1407                             TruncE = TruncI->user_end();
1408        TruncUI != TruncE;) {
1409
1410     Use &TruncTheUse = TruncUI.getUse();
1411     Instruction *TruncUser = cast<Instruction>(*TruncUI);
1412     // Preincrement use iterator so we don't invalidate it.
1413
1414     ++TruncUI;
1415
1416     int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
1417     if (!ISDOpcode)
1418       continue;
1419
1420     // If the use is actually a legal node, there will not be an
1421     // implicit truncate.
1422     // FIXME: always querying the result type is just an
1423     // approximation; some nodes' legality is determined by the
1424     // operand or other means. There's no good way to find out though.
1425     if (TLI.isOperationLegalOrCustom(
1426             ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
1427       continue;
1428
1429     // Don't bother for PHI nodes.
1430     if (isa<PHINode>(TruncUser))
1431       continue;
1432
1433     BasicBlock *TruncUserBB = TruncUser->getParent();
1434
1435     if (UserBB == TruncUserBB)
1436       continue;
1437
1438     BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
1439     CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
1440
1441     if (!InsertedShift && !InsertedTrunc) {
1442       BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
1443       assert(InsertPt != TruncUserBB->end());
1444       // Sink the shift
1445       if (ShiftI->getOpcode() == Instruction::AShr)
1446         InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
1447                                                    "", &*InsertPt);
1448       else
1449         InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
1450                                                    "", &*InsertPt);
1451
1452       // Sink the trunc
1453       BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
1454       TruncInsertPt++;
1455       assert(TruncInsertPt != TruncUserBB->end());
1456
1457       InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
1458                                        TruncI->getType(), "", &*TruncInsertPt);
1459
1460       MadeChange = true;
1461
1462       TruncTheUse = InsertedTrunc;
1463     }
1464   }
1465   return MadeChange;
1466 }
1467
1468 /// Sink the shift *right* instruction into user blocks if the uses could
1469 /// potentially be combined with this shift instruction and generate BitExtract
1470 /// instruction. It will only be applied if the architecture supports BitExtract
1471 /// instruction. Here is an example:
1472 /// BB1:
1473 ///   %x.extract.shift = lshr i64 %arg1, 32
1474 /// BB2:
1475 ///   %x.extract.trunc = trunc i64 %x.extract.shift to i16
1476 /// ==>
1477 ///
1478 /// BB2:
1479 ///   %x.extract.shift.1 = lshr i64 %arg1, 32
1480 ///   %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
1481 ///
1482 /// CodeGen will recoginze the pattern in BB2 and generate BitExtract
1483 /// instruction.
1484 /// Return true if any changes are made.
1485 static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
1486                                 const TargetLowering &TLI,
1487                                 const DataLayout &DL) {
1488   BasicBlock *DefBB = ShiftI->getParent();
1489
1490   /// Only insert instructions in each block once.
1491   DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
1492
1493   bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));
1494
1495   bool MadeChange = false;
1496   for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
1497        UI != E;) {
1498     Use &TheUse = UI.getUse();
1499     Instruction *User = cast<Instruction>(*UI);
1500     // Preincrement use iterator so we don't invalidate it.
1501     ++UI;
1502
1503     // Don't bother for PHI nodes.
1504     if (isa<PHINode>(User))
1505       continue;
1506
1507     if (!isExtractBitsCandidateUse(User))
1508       continue;
1509
1510     BasicBlock *UserBB = User->getParent();
1511
1512     if (UserBB == DefBB) {
1513       // If the shift and truncate instruction are in the same BB. The use of
1514       // the truncate(TruncUse) may still introduce another truncate if not
1515       // legal. In this case, we would like to sink both shift and truncate
1516       // instruction to the BB of TruncUse.
1517       // for example:
1518       // BB1:
1519       // i64 shift.result = lshr i64 opnd, imm
1520       // trunc.result = trunc shift.result to i16
1521       //
1522       // BB2:
1523       //   ----> We will have an implicit truncate here if the architecture does
1524       //   not have i16 compare.
1525       // cmp i16 trunc.result, opnd2
1526       //
1527       if (isa<TruncInst>(User) && shiftIsLegal
1528           // If the type of the truncate is legal, no trucate will be
1529           // introduced in other basic blocks.
1530           &&
1531           (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType()))))
1532         MadeChange =
1533             SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);
1534
1535       continue;
1536     }
1537     // If we have already inserted a shift into this block, use it.
1538     BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
1539
1540     if (!InsertedShift) {
1541       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1542       assert(InsertPt != UserBB->end());
1543
1544       if (ShiftI->getOpcode() == Instruction::AShr)
1545         InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
1546                                                    "", &*InsertPt);
1547       else
1548         InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
1549                                                    "", &*InsertPt);
1550
1551       MadeChange = true;
1552     }
1553
1554     // Replace a use of the shift with a use of the new shift.
1555     TheUse = InsertedShift;
1556   }
1557
1558   // If we removed all uses, nuke the shift.
1559   if (ShiftI->use_empty())
1560     ShiftI->eraseFromParent();
1561
1562   return MadeChange;
1563 }
1564
1565 /// If counting leading or trailing zeros is an expensive operation and a zero
1566 /// input is defined, add a check for zero to avoid calling the intrinsic.
1567 ///
1568 /// We want to transform:
1569 ///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
1570 ///
1571 /// into:
1572 ///   entry:
1573 ///     %cmpz = icmp eq i64 %A, 0
1574 ///     br i1 %cmpz, label %cond.end, label %cond.false
1575 ///   cond.false:
1576 ///     %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
1577 ///     br label %cond.end
1578 ///   cond.end:
1579 ///     %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
1580 ///
1581 /// If the transform is performed, return true and set ModifiedDT to true.
1582 static bool despeculateCountZeros(IntrinsicInst *CountZeros,
1583                                   const TargetLowering *TLI,
1584                                   const DataLayout *DL,
1585                                   bool &ModifiedDT) {
1586   if (!TLI || !DL)
1587     return false;
1588
1589   // If a zero input is undefined, it doesn't make sense to despeculate that.
1590   if (match(CountZeros->getOperand(1), m_One()))
1591     return false;
1592
1593   // If it's cheap to speculate, there's nothing to do.
1594   auto IntrinsicID = CountZeros->getIntrinsicID();
1595   if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
1596       (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
1597     return false;
1598
1599   // Only handle legal scalar cases. Anything else requires too much work.
1600   Type *Ty = CountZeros->getType();
1601   unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
1602   if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
1603     return false;
1604
1605   // The intrinsic will be sunk behind a compare against zero and branch.
1606   BasicBlock *StartBlock = CountZeros->getParent();
1607   BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
1608
1609   // Create another block after the count zero intrinsic. A PHI will be added
1610   // in this block to select the result of the intrinsic or the bit-width
1611   // constant if the input to the intrinsic is zero.
1612   BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
1613   BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
1614
1615   // Set up a builder to create a compare, conditional branch, and PHI.
1616   IRBuilder<> Builder(CountZeros->getContext());
1617   Builder.SetInsertPoint(StartBlock->getTerminator());
1618   Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());
1619
1620   // Replace the unconditional branch that was created by the first split with
1621   // a compare against zero and a conditional branch.
1622   Value *Zero = Constant::getNullValue(Ty);
1623   Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
1624   Builder.CreateCondBr(Cmp, EndBlock, CallBlock);
1625   StartBlock->getTerminator()->eraseFromParent();
1626
1627   // Create a PHI in the end block to select either the output of the intrinsic
1628   // or the bit width of the operand.
1629   Builder.SetInsertPoint(&EndBlock->front());
1630   PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
1631   CountZeros->replaceAllUsesWith(PN);
1632   Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
1633   PN->addIncoming(BitWidth, StartBlock);
1634   PN->addIncoming(CountZeros, CallBlock);
1635
1636   // We are explicitly handling the zero case, so we can set the intrinsic's
1637   // undefined zero argument to 'true'. This will also prevent reprocessing the
1638   // intrinsic; we only despeculate when a zero input is defined.
1639   CountZeros->setArgOperand(1, Builder.getTrue());
1640   ModifiedDT = true;
1641   return true;
1642 }
1643
1644 // This class provides helper functions to expand a memcmp library call into an
1645 // inline expansion.
1646 class MemCmpExpansion {
1647   struct ResultBlock {
1648     BasicBlock *BB;
1649     PHINode *PhiSrc1;
1650     PHINode *PhiSrc2;
1651     ResultBlock();
1652   };
1653
1654   CallInst *CI;
1655   ResultBlock ResBlock;
1656   unsigned MaxLoadSize;
1657   unsigned NumBlocks;
1658   unsigned NumBlocksNonOneByte;
1659   unsigned NumLoadsPerBlock;
1660   std::vector<BasicBlock *> LoadCmpBlocks;
1661   BasicBlock *EndBlock;
1662   PHINode *PhiRes;
1663   bool IsUsedForZeroCmp;
1664   const DataLayout &DL;
1665   IRBuilder<> Builder;
1666
1667   unsigned calculateNumBlocks(unsigned Size);
1668   void createLoadCmpBlocks();
1669   void createResultBlock();
1670   void setupResultBlockPHINodes();
1671   void setupEndBlockPHINodes();
1672   void emitLoadCompareBlock(unsigned Index, unsigned LoadSize,
1673                             unsigned GEPIndex);
1674   Value *getCompareLoadPairs(unsigned Index, unsigned Size,
1675                              unsigned &NumBytesProcessed);
1676   void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size,
1677                                          unsigned &NumBytesProcessed);
1678   void emitLoadCompareByteBlock(unsigned Index, unsigned GEPIndex);
1679   void emitMemCmpResultBlock();
1680   Value *getMemCmpExpansionZeroCase(unsigned Size);
1681   Value *getMemCmpEqZeroOneBlock(unsigned Size);
1682   Value *getMemCmpOneBlock(unsigned Size);
1683   unsigned getLoadSize(unsigned Size);
1684   unsigned getNumLoads(unsigned Size);
1685
1686 public:
1687   MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize,
1688                   unsigned NumLoadsPerBlock, const DataLayout &DL);
1689   Value *getMemCmpExpansion(uint64_t Size);
1690 };
1691
1692 MemCmpExpansion::ResultBlock::ResultBlock()
1693     : BB(nullptr), PhiSrc1(nullptr), PhiSrc2(nullptr) {}
1694
1695 // Initialize the basic block structure required for expansion of memcmp call
1696 // with given maximum load size and memcmp size parameter.
1697 // This structure includes:
1698 // 1. A list of load compare blocks - LoadCmpBlocks.
1699 // 2. An EndBlock, split from original instruction point, which is the block to
1700 // return from.
1701 // 3. ResultBlock, block to branch to for early exit when a
1702 // LoadCmpBlock finds a difference.
1703 MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size,
1704                                  unsigned MaxLoadSize, unsigned LoadsPerBlock,
1705                                  const DataLayout &TheDataLayout)
1706     : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock),
1707       DL(TheDataLayout), Builder(CI) {
1708
1709   // A memcmp with zero-comparison with only one block of load and compare does
1710   // not need to set up any extra blocks. This case could be handled in the DAG,
1711   // but since we have all of the machinery to flexibly expand any memcpy here,
1712   // we choose to handle this case too to avoid fragmented lowering.
1713   IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
1714   NumBlocks = calculateNumBlocks(Size);
1715   if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || NumBlocks != 1) {
1716     BasicBlock *StartBlock = CI->getParent();
1717     EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
1718     setupEndBlockPHINodes();
1719     createResultBlock();
1720
1721     // If return value of memcmp is not used in a zero equality, we need to
1722     // calculate which source was larger. The calculation requires the
1723     // two loaded source values of each load compare block.
1724     // These will be saved in the phi nodes created by setupResultBlockPHINodes.
1725     if (!IsUsedForZeroCmp)
1726       setupResultBlockPHINodes();
1727
1728     // Create the number of required load compare basic blocks.
1729     createLoadCmpBlocks();
1730
1731     // Update the terminator added by splitBasicBlock to branch to the first
1732     // LoadCmpBlock.
1733     StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
1734   }
1735
1736   Builder.SetCurrentDebugLocation(CI->getDebugLoc());
1737 }
1738
1739 void MemCmpExpansion::createLoadCmpBlocks() {
1740   for (unsigned i = 0; i < NumBlocks; i++) {
1741     BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
1742                                         EndBlock->getParent(), EndBlock);
1743     LoadCmpBlocks.push_back(BB);
1744   }
1745 }
1746
1747 void MemCmpExpansion::createResultBlock() {
1748   ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
1749                                    EndBlock->getParent(), EndBlock);
1750 }
1751
1752 // This function creates the IR instructions for loading and comparing 1 byte.
1753 // It loads 1 byte from each source of the memcmp parameters with the given
1754 // GEPIndex. It then subtracts the two loaded values and adds this result to the
1755 // final phi node for selecting the memcmp result.
1756 void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index,
1757                                                unsigned GEPIndex) {
1758   Value *Source1 = CI->getArgOperand(0);
1759   Value *Source2 = CI->getArgOperand(1);
1760
1761   Builder.SetInsertPoint(LoadCmpBlocks[Index]);
1762   Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
1763   // Cast source to LoadSizeType*.
1764   if (Source1->getType() != LoadSizeType)
1765     Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
1766   if (Source2->getType() != LoadSizeType)
1767     Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
1768
1769   // Get the base address using the GEPIndex.
1770   if (GEPIndex != 0) {
1771     Source1 = Builder.CreateGEP(LoadSizeType, Source1,
1772                                 ConstantInt::get(LoadSizeType, GEPIndex));
1773     Source2 = Builder.CreateGEP(LoadSizeType, Source2,
1774                                 ConstantInt::get(LoadSizeType, GEPIndex));
1775   }
1776
1777   Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
1778   Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
1779
1780   LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
1781   LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
1782   Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
1783
1784   PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]);
1785
1786   if (Index < (LoadCmpBlocks.size() - 1)) {
1787     // Early exit branch if difference found to EndBlock. Otherwise, continue to
1788     // next LoadCmpBlock,
1789     Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
1790                                     ConstantInt::get(Diff->getType(), 0));
1791     BranchInst *CmpBr =
1792         BranchInst::Create(EndBlock, LoadCmpBlocks[Index + 1], Cmp);
1793     Builder.Insert(CmpBr);
1794   } else {
1795     // The last block has an unconditional branch to EndBlock.
1796     BranchInst *CmpBr = BranchInst::Create(EndBlock);
1797     Builder.Insert(CmpBr);
1798   }
1799 }
1800
1801 unsigned MemCmpExpansion::getNumLoads(unsigned Size) {
1802   return (Size / MaxLoadSize) + countPopulation(Size % MaxLoadSize);
1803 }
1804
1805 unsigned MemCmpExpansion::getLoadSize(unsigned Size) {
1806   return MinAlign(PowerOf2Floor(Size), MaxLoadSize);
1807 }
1808
1809 /// Generate an equality comparison for one or more pairs of loaded values.
1810 /// This is used in the case where the memcmp() call is compared equal or not
1811 /// equal to zero.
1812 Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size,
1813                                             unsigned &NumBytesProcessed) {
1814   std::vector<Value *> XorList, OrList;
1815   Value *Diff;
1816
1817   unsigned RemainingBytes = Size - NumBytesProcessed;
1818   unsigned NumLoadsRemaining = getNumLoads(RemainingBytes);
1819   unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock);
1820
1821   // For a single-block expansion, start inserting before the memcmp call.
1822   if (LoadCmpBlocks.empty())
1823     Builder.SetInsertPoint(CI);
1824   else
1825     Builder.SetInsertPoint(LoadCmpBlocks[Index]);
1826
1827   Value *Cmp = nullptr;
1828   for (unsigned i = 0; i < NumLoads; ++i) {
1829     unsigned LoadSize = getLoadSize(RemainingBytes);
1830     unsigned GEPIndex = NumBytesProcessed / LoadSize;
1831     NumBytesProcessed += LoadSize;
1832     RemainingBytes -= LoadSize;
1833
1834     Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8);
1835     Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
1836     assert(LoadSize <= MaxLoadSize && "Unexpected load type");
1837
1838     Value *Source1 = CI->getArgOperand(0);
1839     Value *Source2 = CI->getArgOperand(1);
1840
1841     // Cast source to LoadSizeType*.
1842     if (Source1->getType() != LoadSizeType)
1843       Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
1844     if (Source2->getType() != LoadSizeType)
1845       Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
1846
1847     // Get the base address using the GEPIndex.
1848     if (GEPIndex != 0) {
1849       Source1 = Builder.CreateGEP(LoadSizeType, Source1,
1850                                   ConstantInt::get(LoadSizeType, GEPIndex));
1851       Source2 = Builder.CreateGEP(LoadSizeType, Source2,
1852                                   ConstantInt::get(LoadSizeType, GEPIndex));
1853     }
1854
1855     // Get a constant or load a value for each source address.
1856     Value *LoadSrc1 = nullptr;
1857     if (auto *Source1C = dyn_cast<Constant>(Source1))
1858       LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
1859     if (!LoadSrc1)
1860       LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
1861
1862     Value *LoadSrc2 = nullptr;
1863     if (auto *Source2C = dyn_cast<Constant>(Source2))
1864       LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
1865     if (!LoadSrc2)
1866       LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
1867
1868     if (NumLoads != 1) {
1869       if (LoadSizeType != MaxLoadType) {
1870         LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
1871         LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
1872       }
1873       // If we have multiple loads per block, we need to generate a composite
1874       // comparison using xor+or.
1875       Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
1876       Diff = Builder.CreateZExt(Diff, MaxLoadType);
1877       XorList.push_back(Diff);
1878     } else {
1879       // If there's only one load per block, we just compare the loaded values.
1880       Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
1881     }
1882   }
1883
1884   auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
1885     std::vector<Value *> OutList;
1886     for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
1887       Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
1888       OutList.push_back(Or);
1889     }
1890     if (InList.size() % 2 != 0)
1891       OutList.push_back(InList.back());
1892     return OutList;
1893   };
1894
1895   if (!Cmp) {
1896     // Pairwise OR the XOR results.
1897     OrList = pairWiseOr(XorList);
1898
1899     // Pairwise OR the OR results until one result left.
1900     while (OrList.size() != 1) {
1901       OrList = pairWiseOr(OrList);
1902     }
1903     Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
1904   }
1905
1906   return Cmp;
1907 }
1908
1909 void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(
1910     unsigned Index, unsigned Size, unsigned &NumBytesProcessed) {
1911   Value *Cmp = getCompareLoadPairs(Index, Size, NumBytesProcessed);
1912
1913   BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1))
1914                            ? EndBlock
1915                            : LoadCmpBlocks[Index + 1];
1916   // Early exit branch if difference found to ResultBlock. Otherwise,
1917   // continue to next LoadCmpBlock or EndBlock.
1918   BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
1919   Builder.Insert(CmpBr);
1920
1921   // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
1922   // since early exit to ResultBlock was not taken (no difference was found in
1923   // any of the bytes).
1924   if (Index == LoadCmpBlocks.size() - 1) {
1925     Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
1926     PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]);
1927   }
1928 }
1929
1930 // This function creates the IR intructions for loading and comparing using the
1931 // given LoadSize. It loads the number of bytes specified by LoadSize from each
1932 // source of the memcmp parameters. It then does a subtract to see if there was
1933 // a difference in the loaded values. If a difference is found, it branches
1934 // with an early exit to the ResultBlock for calculating which source was
1935 // larger. Otherwise, it falls through to the either the next LoadCmpBlock or
1936 // the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with
1937 // a special case through emitLoadCompareByteBlock. The special handling can
1938 // simply subtract the loaded values and add it to the result phi node.
1939 void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize,
1940                                            unsigned GEPIndex) {
1941   if (LoadSize == 1) {
1942     MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex);
1943     return;
1944   }
1945
1946   Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8);
1947   Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
1948   assert(LoadSize <= MaxLoadSize && "Unexpected load type");
1949
1950   Value *Source1 = CI->getArgOperand(0);
1951   Value *Source2 = CI->getArgOperand(1);
1952
1953   Builder.SetInsertPoint(LoadCmpBlocks[Index]);
1954   // Cast source to LoadSizeType*.
1955   if (Source1->getType() != LoadSizeType)
1956     Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
1957   if (Source2->getType() != LoadSizeType)
1958     Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
1959
1960   // Get the base address using the GEPIndex.
1961   if (GEPIndex != 0) {
1962     Source1 = Builder.CreateGEP(LoadSizeType, Source1,
1963                                 ConstantInt::get(LoadSizeType, GEPIndex));
1964     Source2 = Builder.CreateGEP(LoadSizeType, Source2,
1965                                 ConstantInt::get(LoadSizeType, GEPIndex));
1966   }
1967
1968   // Load LoadSizeType from the base address.
1969   Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
1970   Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
1971
1972   if (DL.isLittleEndian()) {
1973     Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
1974                                                 Intrinsic::bswap, LoadSizeType);
1975     LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
1976     LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
1977   }
1978
1979   if (LoadSizeType != MaxLoadType) {
1980     LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
1981     LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
1982   }
1983
1984   // Add the loaded values to the phi nodes for calculating memcmp result only
1985   // if result is not used in a zero equality.
1986   if (!IsUsedForZeroCmp) {
1987     ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[Index]);
1988     ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]);
1989   }
1990
1991   Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
1992   BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1))
1993                            ? EndBlock
1994                            : LoadCmpBlocks[Index + 1];
1995   // Early exit branch if difference found to ResultBlock. Otherwise, continue
1996   // to next LoadCmpBlock or EndBlock.
1997   BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
1998   Builder.Insert(CmpBr);
1999
2000   // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
2001   // since early exit to ResultBlock was not taken (no difference was found in
2002   // any of the bytes).
2003   if (Index == LoadCmpBlocks.size() - 1) {
2004     Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
2005     PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]);
2006   }
2007 }
2008
2009 // This function populates the ResultBlock with a sequence to calculate the
2010 // memcmp result. It compares the two loaded source values and returns -1 if
2011 // src1 < src2 and 1 if src1 > src2.
2012 void MemCmpExpansion::emitMemCmpResultBlock() {
2013   // Special case: if memcmp result is used in a zero equality, result does not
2014   // need to be calculated and can simply return 1.
2015   if (IsUsedForZeroCmp) {
2016     BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
2017     Builder.SetInsertPoint(ResBlock.BB, InsertPt);
2018     Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
2019     PhiRes->addIncoming(Res, ResBlock.BB);
2020     BranchInst *NewBr = BranchInst::Create(EndBlock);
2021     Builder.Insert(NewBr);
2022     return;
2023   }
2024   BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
2025   Builder.SetInsertPoint(ResBlock.BB, InsertPt);
2026
2027   Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
2028                                   ResBlock.PhiSrc2);
2029
2030   Value *Res =
2031       Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
2032                            ConstantInt::get(Builder.getInt32Ty(), 1));
2033
2034   BranchInst *NewBr = BranchInst::Create(EndBlock);
2035   Builder.Insert(NewBr);
2036   PhiRes->addIncoming(Res, ResBlock.BB);
2037 }
2038
2039 unsigned MemCmpExpansion::calculateNumBlocks(unsigned Size) {
2040   unsigned NumBlocks = 0;
2041   bool HaveOneByteLoad = false;
2042   unsigned RemainingSize = Size;
2043   unsigned LoadSize = MaxLoadSize;
2044   while (RemainingSize) {
2045     if (LoadSize == 1)
2046       HaveOneByteLoad = true;
2047     NumBlocks += RemainingSize / LoadSize;
2048     RemainingSize = RemainingSize % LoadSize;
2049     LoadSize = LoadSize / 2;
2050   }
2051   NumBlocksNonOneByte = HaveOneByteLoad ? (NumBlocks - 1) : NumBlocks;
2052
2053   if (IsUsedForZeroCmp)
2054     NumBlocks = NumBlocks / NumLoadsPerBlock +
2055                 (NumBlocks % NumLoadsPerBlock != 0 ? 1 : 0);
2056
2057   return NumBlocks;
2058 }
2059
2060 void MemCmpExpansion::setupResultBlockPHINodes() {
2061   Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
2062   Builder.SetInsertPoint(ResBlock.BB);
2063   ResBlock.PhiSrc1 =
2064       Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1");
2065   ResBlock.PhiSrc2 =
2066       Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2");
2067 }
2068
2069 void MemCmpExpansion::setupEndBlockPHINodes() {
2070   Builder.SetInsertPoint(&EndBlock->front());
2071   PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
2072 }
2073
2074 Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size) {
2075   unsigned NumBytesProcessed = 0;
2076   // This loop populates each of the LoadCmpBlocks with the IR sequence to
2077   // handle multiple loads per block.
2078   for (unsigned i = 0; i < NumBlocks; ++i)
2079     emitLoadCompareBlockMultipleLoads(i, Size, NumBytesProcessed);
2080
2081   emitMemCmpResultBlock();
2082   return PhiRes;
2083 }
2084
2085 /// A memcmp expansion that compares equality with 0 and only has one block of
2086 /// load and compare can bypass the compare, branch, and phi IR that is required
2087 /// in the general case.
2088 Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) {
2089   unsigned NumBytesProcessed = 0;
2090   Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed);
2091   return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
2092 }
2093
2094 /// A memcmp expansion that only has one block of load and compare can bypass
2095 /// the compare, branch, and phi IR that is required in the general case.
2096 Value *MemCmpExpansion::getMemCmpOneBlock(unsigned Size) {
2097   assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
2098
2099   Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
2100   Value *Source1 = CI->getArgOperand(0);
2101   Value *Source2 = CI->getArgOperand(1);
2102
2103   // Cast source to LoadSizeType*.
2104   if (Source1->getType() != LoadSizeType)
2105     Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
2106   if (Source2->getType() != LoadSizeType)
2107     Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
2108
2109   // Load LoadSizeType from the base address.
2110   Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
2111   Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
2112
2113   if (DL.isLittleEndian() && Size != 1) {
2114     Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
2115                                                 Intrinsic::bswap, LoadSizeType);
2116     LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
2117     LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
2118   }
2119
2120   // TODO: Instead of comparing ULT, just subtract and return the difference?
2121   Value *CmpNE = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
2122   Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
2123   Type *I32 = Builder.getInt32Ty();
2124   Value *Sel1 = Builder.CreateSelect(CmpULT, ConstantInt::get(I32, -1),
2125                                              ConstantInt::get(I32, 1));
2126   return Builder.CreateSelect(CmpNE, Sel1, ConstantInt::get(I32, 0));
2127 }
2128
2129 // This function expands the memcmp call into an inline expansion and returns
2130 // the memcmp result.
2131 Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) {
2132   if (IsUsedForZeroCmp)
2133     return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) :
2134                             getMemCmpExpansionZeroCase(Size);
2135
2136   // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
2137   if (NumBlocks == 1 && NumLoadsPerBlock == 1)
2138     return getMemCmpOneBlock(Size);
2139
2140   // This loop calls emitLoadCompareBlock for comparing Size bytes of the two
2141   // memcmp sources. It starts with loading using the maximum load size set by
2142   // the target. It processes any remaining bytes using a load size which is the
2143   // next smallest power of 2.
2144   unsigned LoadSize = MaxLoadSize;
2145   unsigned NumBytesToBeProcessed = Size;
2146   unsigned Index = 0;
2147   while (NumBytesToBeProcessed) {
2148     // Calculate how many blocks we can create with the current load size.
2149     unsigned NumBlocks = NumBytesToBeProcessed / LoadSize;
2150     unsigned GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize;
2151     NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize;
2152
2153     // For each NumBlocks, populate the instruction sequence for loading and
2154     // comparing LoadSize bytes.
2155     while (NumBlocks--) {
2156       emitLoadCompareBlock(Index, LoadSize, GEPIndex);
2157       Index++;
2158       GEPIndex++;
2159     }
2160     // Get the next LoadSize to use.
2161     LoadSize = LoadSize / 2;
2162   }
2163
2164   emitMemCmpResultBlock();
2165   return PhiRes;
2166 }
2167
2168 // This function checks to see if an expansion of memcmp can be generated.
2169 // It checks for constant compare size that is less than the max inline size.
2170 // If an expansion cannot occur, returns false to leave as a library call.
2171 // Otherwise, the library call is replaced with a new IR instruction sequence.
2172 /// We want to transform:
2173 /// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
2174 /// To:
2175 /// loadbb:
2176 ///  %0 = bitcast i32* %buffer2 to i8*
2177 ///  %1 = bitcast i32* %buffer1 to i8*
2178 ///  %2 = bitcast i8* %1 to i64*
2179 ///  %3 = bitcast i8* %0 to i64*
2180 ///  %4 = load i64, i64* %2
2181 ///  %5 = load i64, i64* %3
2182 ///  %6 = call i64 @llvm.bswap.i64(i64 %4)
2183 ///  %7 = call i64 @llvm.bswap.i64(i64 %5)
2184 ///  %8 = sub i64 %6, %7
2185 ///  %9 = icmp ne i64 %8, 0
2186 ///  br i1 %9, label %res_block, label %loadbb1
2187 /// res_block:                                        ; preds = %loadbb2,
2188 /// %loadbb1, %loadbb
2189 ///  %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
2190 ///  %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
2191 ///  %10 = icmp ult i64 %phi.src1, %phi.src2
2192 ///  %11 = select i1 %10, i32 -1, i32 1
2193 ///  br label %endblock
2194 /// loadbb1:                                          ; preds = %loadbb
2195 ///  %12 = bitcast i32* %buffer2 to i8*
2196 ///  %13 = bitcast i32* %buffer1 to i8*
2197 ///  %14 = bitcast i8* %13 to i32*
2198 ///  %15 = bitcast i8* %12 to i32*
2199 ///  %16 = getelementptr i32, i32* %14, i32 2
2200 ///  %17 = getelementptr i32, i32* %15, i32 2
2201 ///  %18 = load i32, i32* %16
2202 ///  %19 = load i32, i32* %17
2203 ///  %20 = call i32 @llvm.bswap.i32(i32 %18)
2204 ///  %21 = call i32 @llvm.bswap.i32(i32 %19)
2205 ///  %22 = zext i32 %20 to i64
2206 ///  %23 = zext i32 %21 to i64
2207 ///  %24 = sub i64 %22, %23
2208 ///  %25 = icmp ne i64 %24, 0
2209 ///  br i1 %25, label %res_block, label %loadbb2
2210 /// loadbb2:                                          ; preds = %loadbb1
2211 ///  %26 = bitcast i32* %buffer2 to i8*
2212 ///  %27 = bitcast i32* %buffer1 to i8*
2213 ///  %28 = bitcast i8* %27 to i16*
2214 ///  %29 = bitcast i8* %26 to i16*
2215 ///  %30 = getelementptr i16, i16* %28, i16 6
2216 ///  %31 = getelementptr i16, i16* %29, i16 6
2217 ///  %32 = load i16, i16* %30
2218 ///  %33 = load i16, i16* %31
2219 ///  %34 = call i16 @llvm.bswap.i16(i16 %32)
2220 ///  %35 = call i16 @llvm.bswap.i16(i16 %33)
2221 ///  %36 = zext i16 %34 to i64
2222 ///  %37 = zext i16 %35 to i64
2223 ///  %38 = sub i64 %36, %37
2224 ///  %39 = icmp ne i64 %38, 0
2225 ///  br i1 %39, label %res_block, label %loadbb3
2226 /// loadbb3:                                          ; preds = %loadbb2
2227 ///  %40 = bitcast i32* %buffer2 to i8*
2228 ///  %41 = bitcast i32* %buffer1 to i8*
2229 ///  %42 = getelementptr i8, i8* %41, i8 14
2230 ///  %43 = getelementptr i8, i8* %40, i8 14
2231 ///  %44 = load i8, i8* %42
2232 ///  %45 = load i8, i8* %43
2233 ///  %46 = zext i8 %44 to i32
2234 ///  %47 = zext i8 %45 to i32
2235 ///  %48 = sub i32 %46, %47
2236 ///  br label %endblock
2237 /// endblock:                                         ; preds = %res_block,
2238 /// %loadbb3
2239 ///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
2240 ///  ret i32 %phi.res
2241 static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
2242                          const TargetLowering *TLI, const DataLayout *DL) {
2243   NumMemCmpCalls++;
2244
2245   // TTI call to check if target would like to expand memcmp. Also, get the
2246   // MaxLoadSize.
2247   unsigned MaxLoadSize;
2248   if (!TTI->expandMemCmp(CI, MaxLoadSize))
2249     return false;
2250
2251   // Early exit from expansion if -Oz.
2252   if (CI->getFunction()->optForMinSize())
2253     return false;
2254
2255   // Early exit from expansion if size is not a constant.
2256   ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
2257   if (!SizeCast) {
2258     NumMemCmpNotConstant++;
2259     return false;
2260   }
2261
2262   // Early exit from expansion if size greater than max bytes to load.
2263   uint64_t SizeVal = SizeCast->getZExtValue();
2264   unsigned NumLoads = 0;
2265   unsigned RemainingSize = SizeVal;
2266   unsigned LoadSize = MaxLoadSize;
2267   while (RemainingSize) {
2268     NumLoads += RemainingSize / LoadSize;
2269     RemainingSize = RemainingSize % LoadSize;
2270     LoadSize = LoadSize / 2;
2271   }
2272
2273   if (NumLoads > TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize())) {
2274     NumMemCmpGreaterThanMax++;
2275     return false;
2276   }
2277
2278   NumMemCmpInlined++;
2279
2280   // MemCmpHelper object creates and sets up basic blocks required for
2281   // expanding memcmp with size SizeVal.
2282   unsigned NumLoadsPerBlock = MemCmpNumLoadsPerBlock;
2283   MemCmpExpansion MemCmpHelper(CI, SizeVal, MaxLoadSize, NumLoadsPerBlock, *DL);
2284
2285   Value *Res = MemCmpHelper.getMemCmpExpansion(SizeVal);
2286
2287   // Replace call with result of expansion and erase call.
2288   CI->replaceAllUsesWith(Res);
2289   CI->eraseFromParent();
2290
2291   return true;
2292 }
2293
2294 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
2295   BasicBlock *BB = CI->getParent();
2296
2297   // Lower inline assembly if we can.
2298   // If we found an inline asm expession, and if the target knows how to
2299   // lower it to normal LLVM code, do so now.
2300   if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
2301     if (TLI->ExpandInlineAsm(CI)) {
2302       // Avoid invalidating the iterator.
2303       CurInstIterator = BB->begin();
2304       // Avoid processing instructions out of order, which could cause
2305       // reuse before a value is defined.
2306       SunkAddrs.clear();
2307       return true;
2308     }
2309     // Sink address computing for memory operands into the block.
2310     if (optimizeInlineAsmInst(CI))
2311       return true;
2312   }
2313
2314   // Align the pointer arguments to this call if the target thinks it's a good
2315   // idea
2316   unsigned MinSize, PrefAlign;
2317   if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
2318     for (auto &Arg : CI->arg_operands()) {
2319       // We want to align both objects whose address is used directly and
2320       // objects whose address is used in casts and GEPs, though it only makes
2321       // sense for GEPs if the offset is a multiple of the desired alignment and
2322       // if size - offset meets the size threshold.
2323       if (!Arg->getType()->isPointerTy())
2324         continue;
2325       APInt Offset(DL->getPointerSizeInBits(
2326                        cast<PointerType>(Arg->getType())->getAddressSpace()),
2327                    0);
2328       Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
2329       uint64_t Offset2 = Offset.getLimitedValue();
2330       if ((Offset2 & (PrefAlign-1)) != 0)
2331         continue;
2332       AllocaInst *AI;
2333       if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
2334           DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
2335         AI->setAlignment(PrefAlign);
2336       // Global variables can only be aligned if they are defined in this
2337       // object (i.e. they are uniquely initialized in this object), and
2338       // over-aligning global variables that have an explicit section is
2339       // forbidden.
2340       GlobalVariable *GV;
2341       if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
2342           GV->getPointerAlignment(*DL) < PrefAlign &&
2343           DL->getTypeAllocSize(GV->getValueType()) >=
2344               MinSize + Offset2)
2345         GV->setAlignment(PrefAlign);
2346     }
2347     // If this is a memcpy (or similar) then we may be able to improve the
2348     // alignment
2349     if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
2350       unsigned Align = getKnownAlignment(MI->getDest(), *DL);
2351       if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
2352         Align = std::min(Align, getKnownAlignment(MTI->getSource(), *DL));
2353       if (Align > MI->getAlignment())
2354         MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align));
2355     }
2356   }
2357
2358   // If we have a cold call site, try to sink addressing computation into the
2359   // cold block.  This interacts with our handling for loads and stores to
2360   // ensure that we can fold all uses of a potential addressing computation
2361   // into their uses.  TODO: generalize this to work over profiling data
2362   if (!OptSize && CI->hasFnAttr(Attribute::Cold))
2363     for (auto &Arg : CI->arg_operands()) {
2364       if (!Arg->getType()->isPointerTy())
2365         continue;
2366       unsigned AS = Arg->getType()->getPointerAddressSpace();
2367       return optimizeMemoryInst(CI, Arg, Arg->getType(), AS);
2368     }
2369
2370   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
2371   if (II) {
2372     switch (II->getIntrinsicID()) {
2373     default: break;
2374     case Intrinsic::objectsize: {
2375       // Lower all uses of llvm.objectsize.*
2376       ConstantInt *RetVal =
2377           lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true);
2378       // Substituting this can cause recursive simplifications, which can
2379       // invalidate our iterator.  Use a WeakTrackingVH to hold onto it in case
2380       // this
2381       // happens.
2382       Value *CurValue = &*CurInstIterator;
2383       WeakTrackingVH IterHandle(CurValue);
2384
2385       replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
2386
2387       // If the iterator instruction was recursively deleted, start over at the
2388       // start of the block.
2389       if (IterHandle != CurValue) {
2390         CurInstIterator = BB->begin();
2391         SunkAddrs.clear();
2392       }
2393       return true;
2394     }
2395     case Intrinsic::aarch64_stlxr:
2396     case Intrinsic::aarch64_stxr: {
2397       ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
2398       if (!ExtVal || !ExtVal->hasOneUse() ||
2399           ExtVal->getParent() == CI->getParent())
2400         return false;
2401       // Sink a zext feeding stlxr/stxr before it, so it can be folded into it.
2402       ExtVal->moveBefore(CI);
2403       // Mark this instruction as "inserted by CGP", so that other
2404       // optimizations don't touch it.
2405       InsertedInsts.insert(ExtVal);
2406       return true;
2407     }
2408     case Intrinsic::invariant_group_barrier:
2409       II->replaceAllUsesWith(II->getArgOperand(0));
2410       II->eraseFromParent();
2411       return true;
2412
2413     case Intrinsic::cttz:
2414     case Intrinsic::ctlz:
2415       // If counting zeros is expensive, try to avoid it.
2416       return despeculateCountZeros(II, TLI, DL, ModifiedDT);
2417     }
2418
2419     if (TLI) {
2420       SmallVector<Value*, 2> PtrOps;
2421       Type *AccessTy;
2422       if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
2423         while (!PtrOps.empty()) {
2424           Value *PtrVal = PtrOps.pop_back_val();
2425           unsigned AS = PtrVal->getType()->getPointerAddressSpace();
2426           if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
2427             return true;
2428         }
2429     }
2430   }
2431
2432   // From here on out we're working with named functions.
2433   if (!CI->getCalledFunction()) return false;
2434
2435   // Lower all default uses of _chk calls.  This is very similar
2436   // to what InstCombineCalls does, but here we are only lowering calls
2437   // to fortified library functions (e.g. __memcpy_chk) that have the default
2438   // "don't know" as the objectsize.  Anything else should be left alone.
2439   FortifiedLibCallSimplifier Simplifier(TLInfo, true);
2440   if (Value *V = Simplifier.optimizeCall(CI)) {
2441     CI->replaceAllUsesWith(V);
2442     CI->eraseFromParent();
2443     return true;
2444   }
2445
2446   LibFunc Func;
2447   if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) &&
2448       Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) {
2449     ModifiedDT = true;
2450     return true;
2451   }
2452   return false;
2453 }
2454
2455 /// Look for opportunities to duplicate return instructions to the predecessor
2456 /// to enable tail call optimizations. The case it is currently looking for is:
2457 /// @code
2458 /// bb0:
2459 ///   %tmp0 = tail call i32 @f0()
2460 ///   br label %return
2461 /// bb1:
2462 ///   %tmp1 = tail call i32 @f1()
2463 ///   br label %return
2464 /// bb2:
2465 ///   %tmp2 = tail call i32 @f2()
2466 ///   br label %return
2467 /// return:
2468 ///   %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
2469 ///   ret i32 %retval
2470 /// @endcode
2471 ///
2472 /// =>
2473 ///
2474 /// @code
2475 /// bb0:
2476 ///   %tmp0 = tail call i32 @f0()
2477 ///   ret i32 %tmp0
2478 /// bb1:
2479 ///   %tmp1 = tail call i32 @f1()
2480 ///   ret i32 %tmp1
2481 /// bb2:
2482 ///   %tmp2 = tail call i32 @f2()
2483 ///   ret i32 %tmp2
2484 /// @endcode
2485 bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) {
2486   if (!TLI)
2487     return false;
2488
2489   ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
2490   if (!RetI)
2491     return false;
2492
2493   PHINode *PN = nullptr;
2494   BitCastInst *BCI = nullptr;
2495   Value *V = RetI->getReturnValue();
2496   if (V) {
2497     BCI = dyn_cast<BitCastInst>(V);
2498     if (BCI)
2499       V = BCI->getOperand(0);
2500
2501     PN = dyn_cast<PHINode>(V);
2502     if (!PN)
2503       return false;
2504   }
2505
2506   if (PN && PN->getParent() != BB)
2507     return false;
2508
2509   // Make sure there are no instructions between the PHI and return, or that the
2510   // return is the first instruction in the block.
2511   if (PN) {
2512     BasicBlock::iterator BI = BB->begin();
2513     do { ++BI; } while (isa<DbgInfoIntrinsic>(BI));
2514     if (&*BI == BCI)
2515       // Also skip over the bitcast.
2516       ++BI;
2517     if (&*BI != RetI)
2518       return false;
2519   } else {
2520     BasicBlock::iterator BI = BB->begin();
2521     while (isa<DbgInfoIntrinsic>(BI)) ++BI;
2522     if (&*BI != RetI)
2523       return false;
2524   }
2525
2526   /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
2527   /// call.
2528   const Function *F = BB->getParent();
2529   SmallVector<CallInst*, 4> TailCalls;
2530   if (PN) {
2531     for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
2532       CallInst *CI = dyn_cast<CallInst>(PN->getIncomingValue(I));
2533       // Make sure the phi value is indeed produced by the tail call.
2534       if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) &&
2535           TLI->mayBeEmittedAsTailCall(CI) &&
2536           attributesPermitTailCall(F, CI, RetI, *TLI))
2537         TailCalls.push_back(CI);
2538     }
2539   } else {
2540     SmallPtrSet<BasicBlock*, 4> VisitedBBs;
2541     for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
2542       if (!VisitedBBs.insert(*PI).second)
2543         continue;
2544
2545       BasicBlock::InstListType &InstList = (*PI)->getInstList();
2546       BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin();
2547       BasicBlock::InstListType::reverse_iterator RE = InstList.rend();
2548       do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI));
2549       if (RI == RE)
2550         continue;
2551
2552       CallInst *CI = dyn_cast<CallInst>(&*RI);
2553       if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) &&
2554           attributesPermitTailCall(F, CI, RetI, *TLI))
2555         TailCalls.push_back(CI);
2556     }
2557   }
2558
2559   bool Changed = false;
2560   for (unsigned i = 0, e = TailCalls.size(); i != e; ++i) {
2561     CallInst *CI = TailCalls[i];
2562     CallSite CS(CI);
2563
2564     // Conservatively require the attributes of the call to match those of the
2565     // return. Ignore noalias because it doesn't affect the call sequence.
2566     AttributeList CalleeAttrs = CS.getAttributes();
2567     if (AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex)
2568             .removeAttribute(Attribute::NoAlias) !=
2569         AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex)
2570             .removeAttribute(Attribute::NoAlias))
2571       continue;
2572
2573     // Make sure the call instruction is followed by an unconditional branch to
2574     // the return block.
2575     BasicBlock *CallBB = CI->getParent();
2576     BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator());
2577     if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
2578       continue;
2579
2580     // Duplicate the return into CallBB.
2581     (void)FoldReturnIntoUncondBranch(RetI, BB, CallBB);
2582     ModifiedDT = Changed = true;
2583     ++NumRetsDup;
2584   }
2585
2586   // If we eliminated all predecessors of the block, delete the block now.
2587   if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
2588     BB->eraseFromParent();
2589
2590   return Changed;
2591 }
2592
2593 //===----------------------------------------------------------------------===//
2594 // Memory Optimization
2595 //===----------------------------------------------------------------------===//
2596
2597 namespace {
2598
2599 /// This is an extended version of TargetLowering::AddrMode
2600 /// which holds actual Value*'s for register values.
2601 struct ExtAddrMode : public TargetLowering::AddrMode {
2602   Value *BaseReg;
2603   Value *ScaledReg;
2604   ExtAddrMode() : BaseReg(nullptr), ScaledReg(nullptr) {}
2605   void print(raw_ostream &OS) const;
2606   void dump() const;
2607
2608   bool operator==(const ExtAddrMode& O) const {
2609     return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) &&
2610            (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) &&
2611            (HasBaseReg == O.HasBaseReg) && (Scale == O.Scale);
2612   }
2613 };
2614
2615 #ifndef NDEBUG
2616 static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
2617   AM.print(OS);
2618   return OS;
2619 }
2620 #endif
2621
2622 void ExtAddrMode::print(raw_ostream &OS) const {
2623   bool NeedPlus = false;
2624   OS << "[";
2625   if (BaseGV) {
2626     OS << (NeedPlus ? " + " : "")
2627        << "GV:";
2628     BaseGV->printAsOperand(OS, /*PrintType=*/false);
2629     NeedPlus = true;
2630   }
2631
2632   if (BaseOffs) {
2633     OS << (NeedPlus ? " + " : "")
2634        << BaseOffs;
2635     NeedPlus = true;
2636   }
2637
2638   if (BaseReg) {
2639     OS << (NeedPlus ? " + " : "")
2640        << "Base:";
2641     BaseReg->printAsOperand(OS, /*PrintType=*/false);
2642     NeedPlus = true;
2643   }
2644   if (Scale) {
2645     OS << (NeedPlus ? " + " : "")
2646        << Scale << "*";
2647     ScaledReg->printAsOperand(OS, /*PrintType=*/false);
2648   }
2649
2650   OS << ']';
2651 }
2652
2653 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2654 LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
2655   print(dbgs());
2656   dbgs() << '\n';
2657 }
2658 #endif
2659
2660 /// \brief This class provides transaction based operation on the IR.
2661 /// Every change made through this class is recorded in the internal state and
2662 /// can be undone (rollback) until commit is called.
2663 class TypePromotionTransaction {
2664
2665   /// \brief This represents the common interface of the individual transaction.
2666   /// Each class implements the logic for doing one specific modification on
2667   /// the IR via the TypePromotionTransaction.
2668   class TypePromotionAction {
2669   protected:
2670     /// The Instruction modified.
2671     Instruction *Inst;
2672
2673   public:
2674     /// \brief Constructor of the action.
2675     /// The constructor performs the related action on the IR.
2676     TypePromotionAction(Instruction *Inst) : Inst(Inst) {}
2677
2678     virtual ~TypePromotionAction() {}
2679
2680     /// \brief Undo the modification done by this action.
2681     /// When this method is called, the IR must be in the same state as it was
2682     /// before this action was applied.
2683     /// \pre Undoing the action works if and only if the IR is in the exact same
2684     /// state as it was directly after this action was applied.
2685     virtual void undo() = 0;
2686
2687     /// \brief Advocate every change made by this action.
2688     /// When the results on the IR of the action are to be kept, it is important
2689     /// to call this function, otherwise hidden information may be kept forever.
2690     virtual void commit() {
2691       // Nothing to be done, this action is not doing anything.
2692     }
2693   };
2694
2695   /// \brief Utility to remember the position of an instruction.
2696   class InsertionHandler {
2697     /// Position of an instruction.
2698     /// Either an instruction:
2699     /// - Is the first in a basic block: BB is used.
2700     /// - Has a previous instructon: PrevInst is used.
2701     union {
2702       Instruction *PrevInst;
2703       BasicBlock *BB;
2704     } Point;
2705     /// Remember whether or not the instruction had a previous instruction.
2706     bool HasPrevInstruction;
2707
2708   public:
2709     /// \brief Record the position of \p Inst.
2710     InsertionHandler(Instruction *Inst) {
2711       BasicBlock::iterator It = Inst->getIterator();
2712       HasPrevInstruction = (It != (Inst->getParent()->begin()));
2713       if (HasPrevInstruction)
2714         Point.PrevInst = &*--It;
2715       else
2716         Point.BB = Inst->getParent();
2717     }
2718
2719     /// \brief Insert \p Inst at the recorded position.
2720     void insert(Instruction *Inst) {
2721       if (HasPrevInstruction) {
2722         if (Inst->getParent())
2723           Inst->removeFromParent();
2724         Inst->insertAfter(Point.PrevInst);
2725       } else {
2726         Instruction *Position = &*Point.BB->getFirstInsertionPt();
2727         if (Inst->getParent())
2728           Inst->moveBefore(Position);
2729         else
2730           Inst->insertBefore(Position);
2731       }
2732     }
2733   };
2734
2735   /// \brief Move an instruction before another.
2736   class InstructionMoveBefore : public TypePromotionAction {
2737     /// Original position of the instruction.
2738     InsertionHandler Position;
2739
2740   public:
2741     /// \brief Move \p Inst before \p Before.
2742     InstructionMoveBefore(Instruction *Inst, Instruction *Before)
2743         : TypePromotionAction(Inst), Position(Inst) {
2744       DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before << "\n");
2745       Inst->moveBefore(Before);
2746     }
2747
2748     /// \brief Move the instruction back to its original position.
2749     void undo() override {
2750       DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
2751       Position.insert(Inst);
2752     }
2753   };
2754
2755   /// \brief Set the operand of an instruction with a new value.
2756   class OperandSetter : public TypePromotionAction {
2757     /// Original operand of the instruction.
2758     Value *Origin;
2759     /// Index of the modified instruction.
2760     unsigned Idx;
2761
2762   public:
2763     /// \brief Set \p Idx operand of \p Inst with \p NewVal.
2764     OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal)
2765         : TypePromotionAction(Inst), Idx(Idx) {
2766       DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
2767                    << "for:" << *Inst << "\n"
2768                    << "with:" << *NewVal << "\n");
2769       Origin = Inst->getOperand(Idx);
2770       Inst->setOperand(Idx, NewVal);
2771     }
2772
2773     /// \brief Restore the original value of the instruction.
2774     void undo() override {
2775       DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
2776                    << "for: " << *Inst << "\n"
2777                    << "with: " << *Origin << "\n");
2778       Inst->setOperand(Idx, Origin);
2779     }
2780   };
2781
2782   /// \brief Hide the operands of an instruction.
2783   /// Do as if this instruction was not using any of its operands.
2784   class OperandsHider : public TypePromotionAction {
2785     /// The list of original operands.
2786     SmallVector<Value *, 4> OriginalValues;
2787
2788   public:
2789     /// \brief Remove \p Inst from the uses of the operands of \p Inst.
2790     OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {
2791       DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
2792       unsigned NumOpnds = Inst->getNumOperands();
2793       OriginalValues.reserve(NumOpnds);
2794       for (unsigned It = 0; It < NumOpnds; ++It) {
2795         // Save the current operand.
2796         Value *Val = Inst->getOperand(It);
2797         OriginalValues.push_back(Val);
2798         // Set a dummy one.
2799         // We could use OperandSetter here, but that would imply an overhead
2800         // that we are not willing to pay.
2801         Inst->setOperand(It, UndefValue::get(Val->getType()));
2802       }
2803     }
2804
2805     /// \brief Restore the original list of uses.
2806     void undo() override {
2807       DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
2808       for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It)
2809         Inst->setOperand(It, OriginalValues[It]);
2810     }
2811   };
2812
2813   /// \brief Build a truncate instruction.
2814   class TruncBuilder : public TypePromotionAction {
2815     Value *Val;
2816   public:
2817     /// \brief Build a truncate instruction of \p Opnd producing a \p Ty
2818     /// result.
2819     /// trunc Opnd to Ty.
2820     TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
2821       IRBuilder<> Builder(Opnd);
2822       Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
2823       DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
2824     }
2825
2826     /// \brief Get the built value.
2827     Value *getBuiltValue() { return Val; }
2828
2829     /// \brief Remove the built instruction.
2830     void undo() override {
2831       DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
2832       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2833         IVal->eraseFromParent();
2834     }
2835   };
2836
2837   /// \brief Build a sign extension instruction.
2838   class SExtBuilder : public TypePromotionAction {
2839     Value *Val;
2840   public:
2841     /// \brief Build a sign extension instruction of \p Opnd producing a \p Ty
2842     /// result.
2843     /// sext Opnd to Ty.
2844     SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
2845         : TypePromotionAction(InsertPt) {
2846       IRBuilder<> Builder(InsertPt);
2847       Val = Builder.CreateSExt(Opnd, Ty, "promoted");
2848       DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
2849     }
2850
2851     /// \brief Get the built value.
2852     Value *getBuiltValue() { return Val; }
2853
2854     /// \brief Remove the built instruction.
2855     void undo() override {
2856       DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
2857       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2858         IVal->eraseFromParent();
2859     }
2860   };
2861
2862   /// \brief Build a zero extension instruction.
2863   class ZExtBuilder : public TypePromotionAction {
2864     Value *Val;
2865   public:
2866     /// \brief Build a zero extension instruction of \p Opnd producing a \p Ty
2867     /// result.
2868     /// zext Opnd to Ty.
2869     ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
2870         : TypePromotionAction(InsertPt) {
2871       IRBuilder<> Builder(InsertPt);
2872       Val = Builder.CreateZExt(Opnd, Ty, "promoted");
2873       DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
2874     }
2875
2876     /// \brief Get the built value.
2877     Value *getBuiltValue() { return Val; }
2878
2879     /// \brief Remove the built instruction.
2880     void undo() override {
2881       DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
2882       if (Instruction *IVal = dyn_cast<Instruction>(Val))
2883         IVal->eraseFromParent();
2884     }
2885   };
2886
2887   /// \brief Mutate an instruction to another type.
2888   class TypeMutator : public TypePromotionAction {
2889     /// Record the original type.
2890     Type *OrigTy;
2891
2892   public:
2893     /// \brief Mutate the type of \p Inst into \p NewTy.
2894     TypeMutator(Instruction *Inst, Type *NewTy)
2895         : TypePromotionAction(Inst), OrigTy(Inst->getType()) {
2896       DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy
2897                    << "\n");
2898       Inst->mutateType(NewTy);
2899     }
2900
2901     /// \brief Mutate the instruction back to its original type.
2902     void undo() override {
2903       DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy
2904                    << "\n");
2905       Inst->mutateType(OrigTy);
2906     }
2907   };
2908
2909   /// \brief Replace the uses of an instruction by another instruction.
2910   class UsesReplacer : public TypePromotionAction {
2911     /// Helper structure to keep track of the replaced uses.
2912     struct InstructionAndIdx {
2913       /// The instruction using the instruction.
2914       Instruction *Inst;
2915       /// The index where this instruction is used for Inst.
2916       unsigned Idx;
2917       InstructionAndIdx(Instruction *Inst, unsigned Idx)
2918           : Inst(Inst), Idx(Idx) {}
2919     };
2920
2921     /// Keep track of the original uses (pair Instruction, Index).
2922     SmallVector<InstructionAndIdx, 4> OriginalUses;
2923     typedef SmallVectorImpl<InstructionAndIdx>::iterator use_iterator;
2924
2925   public:
2926     /// \brief Replace all the use of \p Inst by \p New.
2927     UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) {
2928       DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
2929                    << "\n");
2930       // Record the original uses.
2931       for (Use &U : Inst->uses()) {
2932         Instruction *UserI = cast<Instruction>(U.getUser());
2933         OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));
2934       }
2935       // Now, we can replace the uses.
2936       Inst->replaceAllUsesWith(New);
2937     }
2938
2939     /// \brief Reassign the original uses of Inst to Inst.
2940     void undo() override {
2941       DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
2942       for (use_iterator UseIt = OriginalUses.begin(),
2943                         EndIt = OriginalUses.end();
2944            UseIt != EndIt; ++UseIt) {
2945         UseIt->Inst->setOperand(UseIt->Idx, Inst);
2946       }
2947     }
2948   };
2949
2950   /// \brief Remove an instruction from the IR.
2951   class InstructionRemover : public TypePromotionAction {
2952     /// Original position of the instruction.
2953     InsertionHandler Inserter;
2954     /// Helper structure to hide all the link to the instruction. In other
2955     /// words, this helps to do as if the instruction was removed.
2956     OperandsHider Hider;
2957     /// Keep track of the uses replaced, if any.
2958     UsesReplacer *Replacer;
2959     /// Keep track of instructions removed.
2960     SetOfInstrs &RemovedInsts;
2961
2962   public:
2963     /// \brief Remove all reference of \p Inst and optinally replace all its
2964     /// uses with New.
2965     /// \p RemovedInsts Keep track of the instructions removed by this Action.
2966     /// \pre If !Inst->use_empty(), then New != nullptr
2967     InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
2968                        Value *New = nullptr)
2969         : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
2970           Replacer(nullptr), RemovedInsts(RemovedInsts) {
2971       if (New)
2972         Replacer = new UsesReplacer(Inst, New);
2973       DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
2974       RemovedInsts.insert(Inst);
2975       /// The instructions removed here will be freed after completing
2976       /// optimizeBlock() for all blocks as we need to keep track of the
2977       /// removed instructions during promotion.
2978       Inst->removeFromParent();
2979     }
2980
2981     ~InstructionRemover() override { delete Replacer; }
2982
2983     /// \brief Resurrect the instruction and reassign it to the proper uses if
2984     /// new value was provided when build this action.
2985     void undo() override {
2986       DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
2987       Inserter.insert(Inst);
2988       if (Replacer)
2989         Replacer->undo();
2990       Hider.undo();
2991       RemovedInsts.erase(Inst);
2992     }
2993   };
2994
2995 public:
2996   /// Restoration point.
2997   /// The restoration point is a pointer to an action instead of an iterator
2998   /// because the iterator may be invalidated but not the pointer.
2999   typedef const TypePromotionAction *ConstRestorationPt;
3000
3001   TypePromotionTransaction(SetOfInstrs &RemovedInsts)
3002       : RemovedInsts(RemovedInsts) {}
3003
3004   /// Advocate every changes made in that transaction.
3005   void commit();
3006   /// Undo all the changes made after the given point.
3007   void rollback(ConstRestorationPt Point);
3008   /// Get the current restoration point.
3009   ConstRestorationPt getRestorationPoint() const;
3010
3011   /// \name API for IR modification with state keeping to support rollback.
3012   /// @{
3013   /// Same as Instruction::setOperand.
3014   void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal);
3015   /// Same as Instruction::eraseFromParent.
3016   void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr);
3017   /// Same as Value::replaceAllUsesWith.
3018   void replaceAllUsesWith(Instruction *Inst, Value *New);
3019   /// Same as Value::mutateType.
3020   void mutateType(Instruction *Inst, Type *NewTy);
3021   /// Same as IRBuilder::createTrunc.
3022   Value *createTrunc(Instruction *Opnd, Type *Ty);
3023   /// Same as IRBuilder::createSExt.
3024   Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty);
3025   /// Same as IRBuilder::createZExt.
3026   Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty);
3027   /// Same as Instruction::moveBefore.
3028   void moveBefore(Instruction *Inst, Instruction *Before);
3029   /// @}
3030
3031 private:
3032   /// The ordered list of actions made so far.
3033   SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;
3034   typedef SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator CommitPt;
3035   SetOfInstrs &RemovedInsts;
3036 };
3037
3038 void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
3039                                           Value *NewVal) {
3040   Actions.push_back(
3041       make_unique<TypePromotionTransaction::OperandSetter>(Inst, Idx, NewVal));
3042 }
3043
3044 void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
3045                                                 Value *NewVal) {
3046   Actions.push_back(
3047       make_unique<TypePromotionTransaction::InstructionRemover>(Inst,
3048                                                          RemovedInsts, NewVal));
3049 }
3050
3051 void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
3052                                                   Value *New) {
3053   Actions.push_back(make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New));
3054 }
3055
3056 void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) {
3057   Actions.push_back(make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
3058 }
3059
3060 Value *TypePromotionTransaction::createTrunc(Instruction *Opnd,
3061                                              Type *Ty) {
3062   std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));
3063   Value *Val = Ptr->getBuiltValue();
3064   Actions.push_back(std::move(Ptr));
3065   return Val;
3066 }
3067
3068 Value *TypePromotionTransaction::createSExt(Instruction *Inst,
3069                                             Value *Opnd, Type *Ty) {
3070   std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));
3071   Value *Val = Ptr->getBuiltValue();
3072   Actions.push_back(std::move(Ptr));
3073   return Val;
3074 }
3075
3076 Value *TypePromotionTransaction::createZExt(Instruction *Inst,
3077                                             Value *Opnd, Type *Ty) {
3078   std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));
3079   Value *Val = Ptr->getBuiltValue();
3080   Actions.push_back(std::move(Ptr));
3081   return Val;
3082 }
3083
3084 void TypePromotionTransaction::moveBefore(Instruction *Inst,
3085                                           Instruction *Before) {
3086   Actions.push_back(
3087       make_unique<TypePromotionTransaction::InstructionMoveBefore>(Inst, Before));
3088 }
3089
3090 TypePromotionTransaction::ConstRestorationPt
3091 TypePromotionTransaction::getRestorationPoint() const {
3092   return !Actions.empty() ? Actions.back().get() : nullptr;
3093 }
3094
3095 void TypePromotionTransaction::commit() {
3096   for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt;
3097        ++It)
3098     (*It)->commit();
3099   Actions.clear();
3100 }
3101
3102 void TypePromotionTransaction::rollback(
3103     TypePromotionTransaction::ConstRestorationPt Point) {
3104   while (!Actions.empty() && Point != Actions.back().get()) {
3105     std::unique_ptr<TypePromotionAction> Curr = Actions.pop_back_val();
3106     Curr->undo();
3107   }
3108 }
3109
3110 /// \brief A helper class for matching addressing modes.
3111 ///
3112 /// This encapsulates the logic for matching the target-legal addressing modes.
3113 class AddressingModeMatcher {
3114   SmallVectorImpl<Instruction*> &AddrModeInsts;
3115   const TargetLowering &TLI;
3116   const TargetRegisterInfo &TRI;
3117   const DataLayout &DL;
3118
3119   /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
3120   /// the memory instruction that we're computing this address for.
3121   Type *AccessTy;
3122   unsigned AddrSpace;
3123   Instruction *MemoryInst;
3124
3125   /// This is the addressing mode that we're building up. This is
3126   /// part of the return value of this addressing mode matching stuff.
3127   ExtAddrMode &AddrMode;
3128
3129   /// The instructions inserted by other CodeGenPrepare optimizations.
3130   const SetOfInstrs &InsertedInsts;
3131   /// A map from the instructions to their type before promotion.
3132   InstrToOrigTy &PromotedInsts;
3133   /// The ongoing transaction where every action should be registered.
3134   TypePromotionTransaction &TPT;
3135
3136   /// This is set to true when we should not do profitability checks.
3137   /// When true, IsProfitableToFoldIntoAddressingMode always returns true.
3138   bool IgnoreProfitability;
3139
3140   AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI,
3141                         const TargetLowering &TLI,
3142                         const TargetRegisterInfo &TRI,
3143                         Type *AT, unsigned AS,
3144                         Instruction *MI, ExtAddrMode &AM,
3145                         const SetOfInstrs &InsertedInsts,
3146                         InstrToOrigTy &PromotedInsts,
3147                         TypePromotionTransaction &TPT)
3148       : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
3149         DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
3150         MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
3151         PromotedInsts(PromotedInsts), TPT(TPT) {
3152     IgnoreProfitability = false;
3153   }
3154 public:
3155
3156   /// Find the maximal addressing mode that a load/store of V can fold,
3157   /// give an access type of AccessTy.  This returns a list of involved
3158   /// instructions in AddrModeInsts.
3159   /// \p InsertedInsts The instructions inserted by other CodeGenPrepare
3160   /// optimizations.
3161   /// \p PromotedInsts maps the instructions to their type before promotion.
3162   /// \p The ongoing transaction where every action should be registered.
3163   static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS,
3164                            Instruction *MemoryInst,
3165                            SmallVectorImpl<Instruction*> &AddrModeInsts,
3166                            const TargetLowering &TLI,
3167                            const TargetRegisterInfo &TRI,
3168                            const SetOfInstrs &InsertedInsts,
3169                            InstrToOrigTy &PromotedInsts,
3170                            TypePromotionTransaction &TPT) {
3171     ExtAddrMode Result;
3172
3173     bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI,
3174                                          AccessTy, AS,
3175                                          MemoryInst, Result, InsertedInsts,
3176                                          PromotedInsts, TPT).matchAddr(V, 0);
3177     (void)Success; assert(Success && "Couldn't select *anything*?");
3178     return Result;
3179   }
3180 private:
3181   bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
3182   bool matchAddr(Value *V, unsigned Depth);
3183   bool matchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth,
3184                           bool *MovedAway = nullptr);
3185   bool isProfitableToFoldIntoAddressingMode(Instruction *I,
3186                                             ExtAddrMode &AMBefore,
3187                                             ExtAddrMode &AMAfter);
3188   bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
3189   bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,
3190                              Value *PromotedOperand) const;
3191 };
3192
3193 /// Try adding ScaleReg*Scale to the current addressing mode.
3194 /// Return true and update AddrMode if this addr mode is legal for the target,
3195 /// false if not.
3196 bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
3197                                              unsigned Depth) {
3198   // If Scale is 1, then this is the same as adding ScaleReg to the addressing
3199   // mode.  Just process that directly.
3200   if (Scale == 1)
3201     return matchAddr(ScaleReg, Depth);
3202
3203   // If the scale is 0, it takes nothing to add this.
3204   if (Scale == 0)
3205     return true;
3206
3207   // If we already have a scale of this value, we can add to it, otherwise, we
3208   // need an available scale field.
3209   if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
3210     return false;
3211
3212   ExtAddrMode TestAddrMode = AddrMode;
3213
3214   // Add scale to turn X*4+X*3 -> X*7.  This could also do things like
3215   // [A+B + A*7] -> [B+A*8].
3216   TestAddrMode.Scale += Scale;
3217   TestAddrMode.ScaledReg = ScaleReg;
3218
3219   // If the new address isn't legal, bail out.
3220   if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))
3221     return false;
3222
3223   // It was legal, so commit it.
3224   AddrMode = TestAddrMode;
3225
3226   // Okay, we decided that we can add ScaleReg+Scale to AddrMode.  Check now
3227   // to see if ScaleReg is actually X+C.  If so, we can turn this into adding
3228   // X*Scale + C*Scale to addr mode.
3229   ConstantInt *CI = nullptr; Value *AddLHS = nullptr;
3230   if (isa<Instruction>(ScaleReg) &&  // not a constant expr.
3231       match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
3232     TestAddrMode.ScaledReg = AddLHS;
3233     TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
3234
3235     // If this addressing mode is legal, commit it and remember that we folded
3236     // this instruction.
3237     if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
3238       AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
3239       AddrMode = TestAddrMode;
3240       return true;
3241     }
3242   }
3243
3244   // Otherwise, not (x+c)*scale, just return what we have.
3245   return true;
3246 }
3247
3248 /// This is a little filter, which returns true if an addressing computation
3249 /// involving I might be folded into a load/store accessing it.
3250 /// This doesn't need to be perfect, but needs to accept at least
3251 /// the set of instructions that MatchOperationAddr can.
3252 static bool MightBeFoldableInst(Instruction *I) {
3253   switch (I->getOpcode()) {
3254   case Instruction::BitCast:
3255   case Instruction::AddrSpaceCast:
3256     // Don't touch identity bitcasts.
3257     if (I->getType() == I->getOperand(0)->getType())
3258       return false;
3259     return I->getType()->isPointerTy() || I->getType()->isIntegerTy();
3260   case Instruction::PtrToInt:
3261     // PtrToInt is always a noop, as we know that the int type is pointer sized.
3262     return true;
3263   case Instruction::IntToPtr:
3264     // We know the input is intptr_t, so this is foldable.
3265     return true;
3266   case Instruction::Add:
3267     return true;
3268   case Instruction::Mul:
3269   case Instruction::Shl:
3270     // Can only handle X*C and X << C.
3271     return isa<ConstantInt>(I->getOperand(1));
3272   case Instruction::GetElementPtr:
3273     return true;
3274   default:
3275     return false;
3276   }
3277 }
3278
3279 /// \brief Check whether or not \p Val is a legal instruction for \p TLI.
3280 /// \note \p Val is assumed to be the product of some type promotion.
3281 /// Therefore if \p Val has an undefined state in \p TLI, this is assumed
3282 /// to be legal, as the non-promoted value would have had the same state.
3283 static bool isPromotedInstructionLegal(const TargetLowering &TLI,
3284                                        const DataLayout &DL, Value *Val) {
3285   Instruction *PromotedInst = dyn_cast<Instruction>(Val);
3286   if (!PromotedInst)
3287     return false;
3288   int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
3289   // If the ISDOpcode is undefined, it was undefined before the promotion.
3290   if (!ISDOpcode)
3291     return true;
3292   // Otherwise, check if the promoted instruction is legal or not.
3293   return TLI.isOperationLegalOrCustom(
3294       ISDOpcode, TLI.getValueType(DL, PromotedInst->getType()));
3295 }
3296
3297 /// \brief Hepler class to perform type promotion.
3298 class TypePromotionHelper {
3299   /// \brief Utility function to check whether or not a sign or zero extension
3300   /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
3301   /// either using the operands of \p Inst or promoting \p Inst.
3302   /// The type of the extension is defined by \p IsSExt.
3303   /// In other words, check if:
3304   /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
3305   /// #1 Promotion applies:
3306   /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
3307   /// #2 Operand reuses:
3308   /// ext opnd1 to ConsideredExtType.
3309   /// \p PromotedInsts maps the instructions to their type before promotion.
3310   static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
3311                             const InstrToOrigTy &PromotedInsts, bool IsSExt);
3312
3313   /// \brief Utility function to determine if \p OpIdx should be promoted when
3314   /// promoting \p Inst.
3315   static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
3316     return !(isa<SelectInst>(Inst) && OpIdx == 0);
3317   }
3318
3319   /// \brief Utility function to promote the operand of \p Ext when this
3320   /// operand is a promotable trunc or sext or zext.
3321   /// \p PromotedInsts maps the instructions to their type before promotion.
3322   /// \p CreatedInstsCost[out] contains the cost of all instructions
3323   /// created to promote the operand of Ext.
3324   /// Newly added extensions are inserted in \p Exts.
3325   /// Newly added truncates are inserted in \p Truncs.
3326   /// Should never be called directly.
3327   /// \return The promoted value which is used instead of Ext.
3328   static Value *promoteOperandForTruncAndAnyExt(
3329       Instruction *Ext, TypePromotionTransaction &TPT,
3330       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3331       SmallVectorImpl<Instruction *> *Exts,
3332       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);
3333
3334   /// \brief Utility function to promote the operand of \p Ext when this
3335   /// operand is promotable and is not a supported trunc or sext.
3336   /// \p PromotedInsts maps the instructions to their type before promotion.
3337   /// \p CreatedInstsCost[out] contains the cost of all the instructions
3338   /// created to promote the operand of Ext.
3339   /// Newly added extensions are inserted in \p Exts.
3340   /// Newly added truncates are inserted in \p Truncs.
3341   /// Should never be called directly.
3342   /// \return The promoted value which is used instead of Ext.
3343   static Value *promoteOperandForOther(Instruction *Ext,
3344                                        TypePromotionTransaction &TPT,
3345                                        InstrToOrigTy &PromotedInsts,
3346                                        unsigned &CreatedInstsCost,
3347                                        SmallVectorImpl<Instruction *> *Exts,
3348                                        SmallVectorImpl<Instruction *> *Truncs,
3349                                        const TargetLowering &TLI, bool IsSExt);
3350
3351   /// \see promoteOperandForOther.
3352   static Value *signExtendOperandForOther(
3353       Instruction *Ext, TypePromotionTransaction &TPT,
3354       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3355       SmallVectorImpl<Instruction *> *Exts,
3356       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
3357     return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
3358                                   Exts, Truncs, TLI, true);
3359   }
3360
3361   /// \see promoteOperandForOther.
3362   static Value *zeroExtendOperandForOther(
3363       Instruction *Ext, TypePromotionTransaction &TPT,
3364       InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3365       SmallVectorImpl<Instruction *> *Exts,
3366       SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
3367     return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
3368                                   Exts, Truncs, TLI, false);
3369   }
3370
3371 public:
3372   /// Type for the utility function that promotes the operand of Ext.
3373   typedef Value *(*Action)(Instruction *Ext, TypePromotionTransaction &TPT,
3374                            InstrToOrigTy &PromotedInsts,
3375                            unsigned &CreatedInstsCost,
3376                            SmallVectorImpl<Instruction *> *Exts,
3377                            SmallVectorImpl<Instruction *> *Truncs,
3378                            const TargetLowering &TLI);
3379   /// \brief Given a sign/zero extend instruction \p Ext, return the approriate
3380   /// action to promote the operand of \p Ext instead of using Ext.
3381   /// \return NULL if no promotable action is possible with the current
3382   /// sign extension.
3383   /// \p InsertedInsts keeps track of all the instructions inserted by the
3384   /// other CodeGenPrepare optimizations. This information is important
3385   /// because we do not want to promote these instructions as CodeGenPrepare
3386   /// will reinsert them later. Thus creating an infinite loop: create/remove.
3387   /// \p PromotedInsts maps the instructions to their type before promotion.
3388   static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,
3389                           const TargetLowering &TLI,
3390                           const InstrToOrigTy &PromotedInsts);
3391 };
3392
3393 bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
3394                                         Type *ConsideredExtType,
3395                                         const InstrToOrigTy &PromotedInsts,
3396                                         bool IsSExt) {
3397   // The promotion helper does not know how to deal with vector types yet.
3398   // To be able to fix that, we would need to fix the places where we
3399   // statically extend, e.g., constants and such.
3400   if (Inst->getType()->isVectorTy())
3401     return false;
3402
3403   // We can always get through zext.
3404   if (isa<ZExtInst>(Inst))
3405     return true;
3406
3407   // sext(sext) is ok too.
3408   if (IsSExt && isa<SExtInst>(Inst))
3409     return true;
3410
3411   // We can get through binary operator, if it is legal. In other words, the
3412   // binary operator must have a nuw or nsw flag.
3413   const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
3414   if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
3415       ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
3416        (IsSExt && BinOp->hasNoSignedWrap())))
3417     return true;
3418
3419   // Check if we can do the following simplification.
3420   // ext(trunc(opnd)) --> ext(opnd)
3421   if (!isa<TruncInst>(Inst))
3422     return false;
3423
3424   Value *OpndVal = Inst->getOperand(0);
3425   // Check if we can use this operand in the extension.
3426   // If the type is larger than the result type of the extension, we cannot.
3427   if (!OpndVal->getType()->isIntegerTy() ||
3428       OpndVal->getType()->getIntegerBitWidth() >
3429           ConsideredExtType->getIntegerBitWidth())
3430     return false;
3431
3432   // If the operand of the truncate is not an instruction, we will not have
3433   // any information on the dropped bits.
3434   // (Actually we could for constant but it is not worth the extra logic).
3435   Instruction *Opnd = dyn_cast<Instruction>(OpndVal);
3436   if (!Opnd)
3437     return false;
3438
3439   // Check if the source of the type is narrow enough.
3440   // I.e., check that trunc just drops extended bits of the same kind of
3441   // the extension.
3442   // #1 get the type of the operand and check the kind of the extended bits.
3443   const Type *OpndType;
3444   InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
3445   if (It != PromotedInsts.end() && It->second.getInt() == IsSExt)
3446     OpndType = It->second.getPointer();
3447   else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))
3448     OpndType = Opnd->getOperand(0)->getType();
3449   else
3450     return false;
3451
3452   // #2 check that the truncate just drops extended bits.
3453   return Inst->getType()->getIntegerBitWidth() >=
3454          OpndType->getIntegerBitWidth();
3455 }
3456
3457 TypePromotionHelper::Action TypePromotionHelper::getAction(
3458     Instruction *Ext, const SetOfInstrs &InsertedInsts,
3459     const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
3460   assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
3461          "Unexpected instruction type");
3462   Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0));
3463   Type *ExtTy = Ext->getType();
3464   bool IsSExt = isa<SExtInst>(Ext);
3465   // If the operand of the extension is not an instruction, we cannot
3466   // get through.
3467   // If it, check we can get through.
3468   if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt))
3469     return nullptr;
3470
3471   // Do not promote if the operand has been added by codegenprepare.
3472   // Otherwise, it means we are undoing an optimization that is likely to be
3473   // redone, thus causing potential infinite loop.
3474   if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd))
3475     return nullptr;
3476
3477   // SExt or Trunc instructions.
3478   // Return the related handler.
3479   if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) ||
3480       isa<ZExtInst>(ExtOpnd))
3481     return promoteOperandForTruncAndAnyExt;
3482
3483   // Regular instruction.
3484   // Abort early if we will have to insert non-free instructions.
3485   if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType()))
3486     return nullptr;
3487   return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther;
3488 }
3489
3490 Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
3491     llvm::Instruction *SExt, TypePromotionTransaction &TPT,
3492     InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3493     SmallVectorImpl<Instruction *> *Exts,
3494     SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
3495   // By construction, the operand of SExt is an instruction. Otherwise we cannot
3496   // get through it and this method should not be called.
3497   Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
3498   Value *ExtVal = SExt;
3499   bool HasMergedNonFreeExt = false;
3500   if (isa<ZExtInst>(SExtOpnd)) {
3501     // Replace s|zext(zext(opnd))
3502     // => zext(opnd).
3503     HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd);
3504     Value *ZExt =
3505         TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType());
3506     TPT.replaceAllUsesWith(SExt, ZExt);
3507     TPT.eraseInstruction(SExt);
3508     ExtVal = ZExt;
3509   } else {
3510     // Replace z|sext(trunc(opnd)) or sext(sext(opnd))
3511     // => z|sext(opnd).
3512     TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));
3513   }
3514   CreatedInstsCost = 0;
3515
3516   // Remove dead code.
3517   if (SExtOpnd->use_empty())
3518     TPT.eraseInstruction(SExtOpnd);
3519
3520   // Check if the extension is still needed.
3521   Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
3522   if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {
3523     if (ExtInst) {
3524       if (Exts)
3525         Exts->push_back(ExtInst);
3526       CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt;
3527     }
3528     return ExtVal;
3529   }
3530
3531   // At this point we have: ext ty opnd to ty.
3532   // Reassign the uses of ExtInst to the opnd and remove ExtInst.
3533   Value *NextVal = ExtInst->getOperand(0);
3534   TPT.eraseInstruction(ExtInst, NextVal);
3535   return NextVal;
3536 }
3537
3538 Value *TypePromotionHelper::promoteOperandForOther(
3539     Instruction *Ext, TypePromotionTransaction &TPT,
3540     InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
3541     SmallVectorImpl<Instruction *> *Exts,
3542     SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,
3543     bool IsSExt) {
3544   // By construction, the operand of Ext is an instruction. Otherwise we cannot
3545   // get through it and this method should not be called.
3546   Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
3547   CreatedInstsCost = 0;
3548   if (!ExtOpnd->hasOneUse()) {
3549     // ExtOpnd will be promoted.
3550     // All its uses, but Ext, will need to use a truncated value of the
3551     // promoted version.
3552     // Create the truncate now.
3553     Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType());
3554     if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) {
3555       ITrunc->removeFromParent();
3556       // Insert it just after the definition.
3557       ITrunc->insertAfter(ExtOpnd);
3558       if (Truncs)
3559         Truncs->push_back(ITrunc);
3560     }
3561
3562     TPT.replaceAllUsesWith(ExtOpnd, Trunc);
3563     // Restore the operand of Ext (which has been replaced by the previous call
3564     // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
3565     TPT.setOperand(Ext, 0, ExtOpnd);
3566   }
3567
3568   // Get through the Instruction:
3569   // 1. Update its type.
3570   // 2. Replace the uses of Ext by Inst.
3571   // 3. Extend each operand that needs to be extended.
3572
3573   // Remember the original type of the instruction before promotion.
3574   // This is useful to know that the high bits are sign extended bits.
3575   PromotedInsts.insert(std::pair<Instruction *, TypeIsSExt>(
3576       ExtOpnd, TypeIsSExt(ExtOpnd->getType(), IsSExt)));
3577   // Step #1.
3578   TPT.mutateType(ExtOpnd, Ext->getType());
3579   // Step #2.
3580   TPT.replaceAllUsesWith(Ext, ExtOpnd);
3581   // Step #3.
3582   Instruction *ExtForOpnd = Ext;
3583
3584   DEBUG(dbgs() << "Propagate Ext to operands\n");
3585   for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
3586        ++OpIdx) {
3587     DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
3588     if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
3589         !shouldExtOperand(ExtOpnd, OpIdx)) {
3590       DEBUG(dbgs() << "No need to propagate\n");
3591       continue;
3592     }
3593     // Check if we can statically extend the operand.
3594     Value *Opnd = ExtOpnd->getOperand(OpIdx);
3595     if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
3596       DEBUG(dbgs() << "Statically extend\n");
3597       unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
3598       APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
3599                             : Cst->getValue().zext(BitWidth);
3600       TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));
3601       continue;
3602     }
3603     // UndefValue are typed, so we have to statically sign extend them.
3604     if (isa<UndefValue>(Opnd)) {
3605       DEBUG(dbgs() << "Statically extend\n");
3606       TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
3607       continue;
3608     }
3609
3610     // Otherwise we have to explicity sign extend the operand.
3611     // Check if Ext was reused to extend an operand.
3612     if (!ExtForOpnd) {
3613       // If yes, create a new one.
3614       DEBUG(dbgs() << "More operands to ext\n");
3615       Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
3616         : TPT.createZExt(Ext, Opnd, Ext->getType());
3617       if (!isa<Instruction>(ValForExtOpnd)) {
3618         TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);
3619         continue;
3620       }
3621       ExtForOpnd = cast<Instruction>(ValForExtOpnd);
3622     }
3623     if (Exts)
3624       Exts->push_back(ExtForOpnd);
3625     TPT.setOperand(ExtForOpnd, 0, Opnd);
3626
3627     // Move the sign extension before the insertion point.
3628     TPT.moveBefore(ExtForOpnd, ExtOpnd);
3629     TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd);
3630     CreatedInstsCost += !TLI.isExtFree(ExtForOpnd);
3631     // If more sext are required, new instructions will have to be created.
3632     ExtForOpnd = nullptr;
3633   }
3634   if (ExtForOpnd == Ext) {
3635     DEBUG(dbgs() << "Extension is useless now\n");
3636     TPT.eraseInstruction(Ext);
3637   }
3638   return ExtOpnd;
3639 }
3640
3641 /// Check whether or not promoting an instruction to a wider type is profitable.
3642 /// \p NewCost gives the cost of extension instructions created by the
3643 /// promotion.
3644 /// \p OldCost gives the cost of extension instructions before the promotion
3645 /// plus the number of instructions that have been
3646 /// matched in the addressing mode the promotion.
3647 /// \p PromotedOperand is the value that has been promoted.
3648 /// \return True if the promotion is profitable, false otherwise.
3649 bool AddressingModeMatcher::isPromotionProfitable(
3650     unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
3651   DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost << '\n');
3652   // The cost of the new extensions is greater than the cost of the
3653   // old extension plus what we folded.
3654   // This is not profitable.
3655   if (NewCost > OldCost)
3656     return false;
3657   if (NewCost < OldCost)
3658     return true;
3659   // The promotion is neutral but it may help folding the sign extension in
3660   // loads for instance.
3661   // Check that we did not create an illegal instruction.
3662   return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
3663 }
3664
3665 /// Given an instruction or constant expr, see if we can fold the operation
3666 /// into the addressing mode. If so, update the addressing mode and return
3667 /// true, otherwise return false without modifying AddrMode.
3668 /// If \p MovedAway is not NULL, it contains the information of whether or
3669 /// not AddrInst has to be folded into the addressing mode on success.
3670 /// If \p MovedAway == true, \p AddrInst will not be part of the addressing
3671 /// because it has been moved away.
3672 /// Thus AddrInst must not be added in the matched instructions.
3673 /// This state can happen when AddrInst is a sext, since it may be moved away.
3674 /// Therefore, AddrInst may not be valid when MovedAway is true and it must
3675 /// not be referenced anymore.
3676 bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
3677                                                unsigned Depth,
3678                                                bool *MovedAway) {
3679   // Avoid exponential behavior on extremely deep expression trees.
3680   if (Depth >= 5) return false;
3681
3682   // By default, all matched instructions stay in place.
3683   if (MovedAway)
3684     *MovedAway = false;
3685
3686   switch (Opcode) {
3687   case Instruction::PtrToInt:
3688     // PtrToInt is always a noop, as we know that the int type is pointer sized.
3689     return matchAddr(AddrInst->getOperand(0), Depth);
3690   case Instruction::IntToPtr: {
3691     auto AS = AddrInst->getType()->getPointerAddressSpace();
3692     auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
3693     // This inttoptr is a no-op if the integer type is pointer sized.
3694     if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
3695       return matchAddr(AddrInst->getOperand(0), Depth);
3696     return false;
3697   }
3698   case Instruction::BitCast:
3699     // BitCast is always a noop, and we can handle it as long as it is
3700     // int->int or pointer->pointer (we don't want int<->fp or something).
3701     if ((AddrInst->getOperand(0)->getType()->isPointerTy() ||
3702          AddrInst->getOperand(0)->getType()->isIntegerTy()) &&
3703         // Don't touch identity bitcasts.  These were probably put here by LSR,
3704         // and we don't want to mess around with them.  Assume it knows what it
3705         // is doing.
3706         AddrInst->getOperand(0)->getType() != AddrInst->getType())
3707       return matchAddr(AddrInst->getOperand(0), Depth);
3708     return false;
3709   case Instruction::AddrSpaceCast: {
3710     unsigned SrcAS
3711       = AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
3712     unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
3713     if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
3714       return matchAddr(AddrInst->getOperand(0), Depth);
3715     return false;
3716   }
3717   case Instruction::Add: {
3718     // Check to see if we can merge in the RHS then the LHS.  If so, we win.
3719     ExtAddrMode BackupAddrMode = AddrMode;
3720     unsigned OldSize = AddrModeInsts.size();
3721     // Start a transaction at this point.
3722     // The LHS may match but not the RHS.
3723     // Therefore, we need a higher level restoration point to undo partially
3724     // matched operation.
3725     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
3726         TPT.getRestorationPoint();
3727
3728     if (matchAddr(AddrInst->getOperand(1), Depth+1) &&
3729         matchAddr(AddrInst->getOperand(0), Depth+1))
3730       return true;
3731
3732     // Restore the old addr mode info.
3733     AddrMode = BackupAddrMode;
3734     AddrModeInsts.resize(OldSize);
3735     TPT.rollback(LastKnownGood);
3736
3737     // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
3738     if (matchAddr(AddrInst->getOperand(0), Depth+1) &&
3739         matchAddr(AddrInst->getOperand(1), Depth+1))
3740       return true;
3741
3742     // Otherwise we definitely can't merge the ADD in.
3743     AddrMode = BackupAddrMode;
3744     AddrModeInsts.resize(OldSize);
3745     TPT.rollback(LastKnownGood);
3746     break;
3747   }
3748   //case Instruction::Or:
3749   // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
3750   //break;
3751   case Instruction::Mul:
3752   case Instruction::Shl: {
3753     // Can only handle X*C and X << C.
3754     ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
3755     if (!RHS)
3756       return false;
3757     int64_t Scale = RHS->getSExtValue();
3758     if (Opcode == Instruction::Shl)
3759       Scale = 1LL << Scale;
3760
3761     return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
3762   }
3763   case Instruction::GetElementPtr: {
3764     // Scan the GEP.  We check it if it contains constant offsets and at most
3765     // one variable offset.
3766     int VariableOperand = -1;
3767     unsigned VariableScale = 0;
3768
3769     int64_t ConstantOffset = 0;
3770     gep_type_iterator GTI = gep_type_begin(AddrInst);
3771     for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
3772       if (StructType *STy = GTI.getStructTypeOrNull()) {
3773         const StructLayout *SL = DL.getStructLayout(STy);
3774         unsigned Idx =
3775           cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
3776         ConstantOffset += SL->getElementOffset(Idx);
3777       } else {
3778         uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());
3779         if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
3780           ConstantOffset += CI->getSExtValue()*TypeSize;
3781         } else if (TypeSize) {  // Scales of zero don't do anything.
3782           // We only allow one variable index at the moment.
3783           if (VariableOperand != -1)
3784             return false;
3785
3786           // Remember the variable index.
3787           VariableOperand = i;
3788           VariableScale = TypeSize;
3789         }
3790       }
3791     }
3792
3793     // A common case is for the GEP to only do a constant offset.  In this case,
3794     // just add it to the disp field and check validity.
3795     if (VariableOperand == -1) {
3796       AddrMode.BaseOffs += ConstantOffset;
3797       if (ConstantOffset == 0 ||
3798           TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
3799         // Check to see if we can fold the base pointer in too.
3800         if (matchAddr(AddrInst->getOperand(0), Depth+1))
3801           return true;
3802       }
3803       AddrMode.BaseOffs -= ConstantOffset;
3804       return false;
3805     }
3806
3807     // Save the valid addressing mode in case we can't match.
3808     ExtAddrMode BackupAddrMode = AddrMode;
3809     unsigned OldSize = AddrModeInsts.size();
3810
3811     // See if the scale and offset amount is valid for this target.
3812     AddrMode.BaseOffs += ConstantOffset;
3813
3814     // Match the base operand of the GEP.
3815     if (!matchAddr(AddrInst->getOperand(0), Depth+1)) {
3816       // If it couldn't be matched, just stuff the value in a register.
3817       if (AddrMode.HasBaseReg) {
3818         AddrMode = BackupAddrMode;
3819         AddrModeInsts.resize(OldSize);
3820         return false;
3821       }
3822       AddrMode.HasBaseReg = true;
3823       AddrMode.BaseReg = AddrInst->getOperand(0);
3824     }
3825
3826     // Match the remaining variable portion of the GEP.
3827     if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
3828                           Depth)) {
3829       // If it couldn't be matched, try stuffing the base into a register
3830       // instead of matching it, and retrying the match of the scale.
3831       AddrMode = BackupAddrMode;
3832       AddrModeInsts.resize(OldSize);
3833       if (AddrMode.HasBaseReg)
3834         return false;
3835       AddrMode.HasBaseReg = true;
3836       AddrMode.BaseReg = AddrInst->getOperand(0);
3837       AddrMode.BaseOffs += ConstantOffset;
3838       if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
3839                             VariableScale, Depth)) {
3840         // If even that didn't work, bail.
3841         AddrMode = BackupAddrMode;
3842         AddrModeInsts.resize(OldSize);
3843         return false;
3844       }
3845     }
3846
3847     return true;
3848   }
3849   case Instruction::SExt:
3850   case Instruction::ZExt: {
3851     Instruction *Ext = dyn_cast<Instruction>(AddrInst);
3852     if (!Ext)
3853       return false;
3854
3855     // Try to move this ext out of the way of the addressing mode.
3856     // Ask for a method for doing so.
3857     TypePromotionHelper::Action TPH =
3858         TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
3859     if (!TPH)
3860       return false;
3861
3862     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
3863         TPT.getRestorationPoint();
3864     unsigned CreatedInstsCost = 0;
3865     unsigned ExtCost = !TLI.isExtFree(Ext);
3866     Value *PromotedOperand =
3867         TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);
3868     // SExt has been moved away.
3869     // Thus either it will be rematched later in the recursive calls or it is
3870     // gone. Anyway, we must not fold it into the addressing mode at this point.
3871     // E.g.,
3872     // op = add opnd, 1
3873     // idx = ext op
3874     // addr = gep base, idx
3875     // is now:
3876     // promotedOpnd = ext opnd            <- no match here
3877     // op = promoted_add promotedOpnd, 1  <- match (later in recursive calls)
3878     // addr = gep base, op                <- match
3879     if (MovedAway)
3880       *MovedAway = true;
3881
3882     assert(PromotedOperand &&
3883            "TypePromotionHelper should have filtered out those cases");
3884
3885     ExtAddrMode BackupAddrMode = AddrMode;
3886     unsigned OldSize = AddrModeInsts.size();
3887
3888     if (!matchAddr(PromotedOperand, Depth) ||
3889         // The total of the new cost is equal to the cost of the created
3890         // instructions.
3891         // The total of the old cost is equal to the cost of the extension plus
3892         // what we have saved in the addressing mode.
3893         !isPromotionProfitable(CreatedInstsCost,
3894                                ExtCost + (AddrModeInsts.size() - OldSize),
3895                                PromotedOperand)) {
3896       AddrMode = BackupAddrMode;
3897       AddrModeInsts.resize(OldSize);
3898       DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
3899       TPT.rollback(LastKnownGood);
3900       return false;
3901     }
3902     return true;
3903   }
3904   }
3905   return false;
3906 }
3907
3908 /// If we can, try to add the value of 'Addr' into the current addressing mode.
3909 /// If Addr can't be added to AddrMode this returns false and leaves AddrMode
3910 /// unmodified. This assumes that Addr is either a pointer type or intptr_t
3911 /// for the target.
3912 ///
3913 bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
3914   // Start a transaction at this point that we will rollback if the matching
3915   // fails.
3916   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
3917       TPT.getRestorationPoint();
3918   if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
3919     // Fold in immediates if legal for the target.
3920     AddrMode.BaseOffs += CI->getSExtValue();
3921     if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
3922       return true;
3923     AddrMode.BaseOffs -= CI->getSExtValue();
3924   } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
3925     // If this is a global variable, try to fold it into the addressing mode.
3926     if (!AddrMode.BaseGV) {
3927       AddrMode.BaseGV = GV;
3928       if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
3929         return true;
3930       AddrMode.BaseGV = nullptr;
3931     }
3932   } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
3933     ExtAddrMode BackupAddrMode = AddrMode;
3934     unsigned OldSize = AddrModeInsts.size();
3935
3936     // Check to see if it is possible to fold this operation.
3937     bool MovedAway = false;
3938     if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {
3939       // This instruction may have been moved away. If so, there is nothing
3940       // to check here.
3941       if (MovedAway)
3942         return true;
3943       // Okay, it's possible to fold this.  Check to see if it is actually
3944       // *profitable* to do so.  We use a simple cost model to avoid increasing
3945       // register pressure too much.
3946       if (I->hasOneUse() ||
3947           isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
3948         AddrModeInsts.push_back(I);
3949         return true;
3950       }
3951
3952       // It isn't profitable to do this, roll back.
3953       //cerr << "NOT FOLDING: " << *I;
3954       AddrMode = BackupAddrMode;
3955       AddrModeInsts.resize(OldSize);
3956       TPT.rollback(LastKnownGood);
3957     }
3958   } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
3959     if (matchOperationAddr(CE, CE->getOpcode(), Depth))
3960       return true;
3961     TPT.rollback(LastKnownGood);
3962   } else if (isa<ConstantPointerNull>(Addr)) {
3963     // Null pointer gets folded without affecting the addressing mode.
3964     return true;
3965   }
3966
3967   // Worse case, the target should support [reg] addressing modes. :)
3968   if (!AddrMode.HasBaseReg) {
3969     AddrMode.HasBaseReg = true;
3970     AddrMode.BaseReg = Addr;
3971     // Still check for legality in case the target supports [imm] but not [i+r].
3972     if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
3973       return true;
3974     AddrMode.HasBaseReg = false;
3975     AddrMode.BaseReg = nullptr;
3976   }
3977
3978   // If the base register is already taken, see if we can do [r+r].
3979   if (AddrMode.Scale == 0) {
3980     AddrMode.Scale = 1;
3981     AddrMode.ScaledReg = Addr;
3982     if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
3983       return true;
3984     AddrMode.Scale = 0;
3985     AddrMode.ScaledReg = nullptr;
3986   }
3987   // Couldn't match.
3988   TPT.rollback(LastKnownGood);
3989   return false;
3990 }
3991
3992 /// Check to see if all uses of OpVal by the specified inline asm call are due
3993 /// to memory operands. If so, return true, otherwise return false.
3994 static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
3995                                     const TargetLowering &TLI,
3996                                     const TargetRegisterInfo &TRI) {
3997   const Function *F = CI->getFunction();
3998   TargetLowering::AsmOperandInfoVector TargetConstraints =
3999       TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI,
4000                             ImmutableCallSite(CI));
4001
4002   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
4003     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
4004
4005     // Compute the constraint code and ConstraintType to use.
4006     TLI.ComputeConstraintToUse(OpInfo, SDValue());
4007
4008     // If this asm operand is our Value*, and if it isn't an indirect memory
4009     // operand, we can't fold it!
4010     if (OpInfo.CallOperandVal == OpVal &&
4011         (OpInfo.ConstraintType != TargetLowering::C_Memory ||
4012          !OpInfo.isIndirect))
4013       return false;
4014   }
4015
4016   return true;
4017 }
4018
4019 /// Recursively walk all the uses of I until we find a memory use.
4020 /// If we find an obviously non-foldable instruction, return true.
4021 /// Add the ultimately found memory instructions to MemoryUses.
4022 static bool FindAllMemoryUses(
4023     Instruction *I,
4024     SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
4025     SmallPtrSetImpl<Instruction *> &ConsideredInsts,
4026     const TargetLowering &TLI, const TargetRegisterInfo &TRI) {
4027   // If we already considered this instruction, we're done.
4028   if (!ConsideredInsts.insert(I).second)
4029     return false;
4030
4031   // If this is an obviously unfoldable instruction, bail out.
4032   if (!MightBeFoldableInst(I))
4033     return true;
4034
4035   const bool OptSize = I->getFunction()->optForSize();
4036
4037   // Loop over all the uses, recursively processing them.
4038   for (Use &U : I->uses()) {
4039     Instruction *UserI = cast<Instruction>(U.getUser());
4040
4041     if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) {
4042       MemoryUses.push_back(std::make_pair(LI, U.getOperandNo()));
4043       continue;
4044     }
4045
4046     if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
4047       unsigned opNo = U.getOperandNo();
4048       if (opNo != StoreInst::getPointerOperandIndex())
4049         return true; // Storing addr, not into addr.
4050       MemoryUses.push_back(std::make_pair(SI, opNo));
4051       continue;
4052     }
4053
4054     if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) {
4055       unsigned opNo = U.getOperandNo();
4056       if (opNo != AtomicRMWInst::getPointerOperandIndex())
4057         return true; // Storing addr, not into addr.
4058       MemoryUses.push_back(std::make_pair(RMW, opNo));
4059       continue;
4060     }
4061
4062     if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {
4063       unsigned opNo = U.getOperandNo();
4064       if (opNo != AtomicCmpXchgInst::getPointerOperandIndex())
4065         return true; // Storing addr, not into addr.
4066       MemoryUses.push_back(std::make_pair(CmpX, opNo));
4067       continue;
4068     }
4069
4070     if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
4071       // If this is a cold call, we can sink the addressing calculation into
4072       // the cold path.  See optimizeCallInst
4073       if (!OptSize && CI->hasFnAttr(Attribute::Cold))
4074         continue;
4075
4076       InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
4077       if (!IA) return true;
4078
4079       // If this is a memory operand, we're cool, otherwise bail out.
4080       if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))
4081         return true;
4082       continue;
4083     }
4084
4085     if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI))
4086       return true;
4087   }
4088
4089   return false;
4090 }
4091
4092 /// Return true if Val is already known to be live at the use site that we're
4093 /// folding it into. If so, there is no cost to include it in the addressing
4094 /// mode. KnownLive1 and KnownLive2 are two values that we know are live at the
4095 /// instruction already.
4096 bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
4097                                                    Value *KnownLive2) {
4098   // If Val is either of the known-live values, we know it is live!
4099   if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2)
4100     return true;
4101
4102   // All values other than instructions and arguments (e.g. constants) are live.
4103   if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
4104
4105   // If Val is a constant sized alloca in the entry block, it is live, this is
4106   // true because it is just a reference to the stack/frame pointer, which is
4107   // live for the whole function.
4108   if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
4109     if (AI->isStaticAlloca())
4110       return true;
4111
4112   // Check to see if this value is already used in the memory instruction's
4113   // block.  If so, it's already live into the block at the very least, so we
4114   // can reasonably fold it.
4115   return Val->isUsedInBasicBlock(MemoryInst->getParent());
4116 }
4117
4118 /// It is possible for the addressing mode of the machine to fold the specified
4119 /// instruction into a load or store that ultimately uses it.
4120 /// However, the specified instruction has multiple uses.
4121 /// Given this, it may actually increase register pressure to fold it
4122 /// into the load. For example, consider this code:
4123 ///
4124 ///     X = ...
4125 ///     Y = X+1
4126 ///     use(Y)   -> nonload/store
4127 ///     Z = Y+1
4128 ///     load Z
4129 ///
4130 /// In this case, Y has multiple uses, and can be folded into the load of Z
4131 /// (yielding load [X+2]).  However, doing this will cause both "X" and "X+1" to
4132 /// be live at the use(Y) line.  If we don't fold Y into load Z, we use one
4133 /// fewer register.  Since Y can't be folded into "use(Y)" we don't increase the
4134 /// number of computations either.
4135 ///
4136 /// Note that this (like most of CodeGenPrepare) is just a rough heuristic.  If
4137 /// X was live across 'load Z' for other reasons, we actually *would* want to
4138 /// fold the addressing mode in the Z case.  This would make Y die earlier.
4139 bool AddressingModeMatcher::
4140 isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
4141                                      ExtAddrMode &AMAfter) {
4142   if (IgnoreProfitability) return true;
4143
4144   // AMBefore is the addressing mode before this instruction was folded into it,
4145   // and AMAfter is the addressing mode after the instruction was folded.  Get
4146   // the set of registers referenced by AMAfter and subtract out those
4147   // referenced by AMBefore: this is the set of values which folding in this
4148   // address extends the lifetime of.
4149   //
4150   // Note that there are only two potential values being referenced here,
4151   // BaseReg and ScaleReg (global addresses are always available, as are any
4152   // folded immediates).
4153   Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
4154
4155   // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
4156   // lifetime wasn't extended by adding this instruction.
4157   if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
4158     BaseReg = nullptr;
4159   if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
4160     ScaledReg = nullptr;
4161
4162   // If folding this instruction (and it's subexprs) didn't extend any live
4163   // ranges, we're ok with it.
4164   if (!BaseReg && !ScaledReg)
4165     return true;
4166
4167   // If all uses of this instruction can have the address mode sunk into them,
4168   // we can remove the addressing mode and effectively trade one live register
4169   // for another (at worst.)  In this context, folding an addressing mode into
4170   // the use is just a particularly nice way of sinking it.
4171   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
4172   SmallPtrSet<Instruction*, 16> ConsideredInsts;
4173   if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI))
4174     return false;  // Has a non-memory, non-foldable use!
4175
4176   // Now that we know that all uses of this instruction are part of a chain of
4177   // computation involving only operations that could theoretically be folded
4178   // into a memory use, loop over each of these memory operation uses and see
4179   // if they could  *actually* fold the instruction.  The assumption is that
4180   // addressing modes are cheap and that duplicating the computation involved
4181   // many times is worthwhile, even on a fastpath. For sinking candidates
4182   // (i.e. cold call sites), this serves as a way to prevent excessive code
4183   // growth since most architectures have some reasonable small and fast way to
4184   // compute an effective address.  (i.e LEA on x86)
4185   SmallVector<Instruction*, 32> MatchedAddrModeInsts;
4186   for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
4187     Instruction *User = MemoryUses[i].first;
4188     unsigned OpNo = MemoryUses[i].second;
4189
4190     // Get the access type of this use.  If the use isn't a pointer, we don't
4191     // know what it accesses.
4192     Value *Address = User->getOperand(OpNo);
4193     PointerType *AddrTy = dyn_cast<PointerType>(Address->getType());
4194     if (!AddrTy)
4195       return false;
4196     Type *AddressAccessTy = AddrTy->getElementType();
4197     unsigned AS = AddrTy->getAddressSpace();
4198
4199     // Do a match against the root of this address, ignoring profitability. This
4200     // will tell us if the addressing mode for the memory operation will
4201     // *actually* cover the shared instruction.
4202     ExtAddrMode Result;
4203     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4204         TPT.getRestorationPoint();
4205     AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI,
4206                                   AddressAccessTy, AS,
4207                                   MemoryInst, Result, InsertedInsts,
4208                                   PromotedInsts, TPT);
4209     Matcher.IgnoreProfitability = true;
4210     bool Success = Matcher.matchAddr(Address, 0);
4211     (void)Success; assert(Success && "Couldn't select *anything*?");
4212
4213     // The match was to check the profitability, the changes made are not
4214     // part of the original matcher. Therefore, they should be dropped
4215     // otherwise the original matcher will not present the right state.
4216     TPT.rollback(LastKnownGood);
4217
4218     // If the match didn't cover I, then it won't be shared by it.
4219     if (!is_contained(MatchedAddrModeInsts, I))
4220       return false;
4221
4222     MatchedAddrModeInsts.clear();
4223   }
4224
4225   return true;
4226 }
4227
4228 } // end anonymous namespace
4229
4230 /// Return true if the specified values are defined in a
4231 /// different basic block than BB.
4232 static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
4233   if (Instruction *I = dyn_cast<Instruction>(V))
4234     return I->getParent() != BB;
4235   return false;
4236 }
4237
4238 /// Sink addressing mode computation immediate before MemoryInst if doing so
4239 /// can be done without increasing register pressure.  The need for the
4240 /// register pressure constraint means this can end up being an all or nothing
4241 /// decision for all uses of the same addressing computation.
4242 ///
4243 /// Load and Store Instructions often have addressing modes that can do
4244 /// significant amounts of computation. As such, instruction selection will try
4245 /// to get the load or store to do as much computation as possible for the
4246 /// program. The problem is that isel can only see within a single block. As
4247 /// such, we sink as much legal addressing mode work into the block as possible.
4248 ///
4249 /// This method is used to optimize both load/store and inline asms with memory
4250 /// operands.  It's also used to sink addressing computations feeding into cold
4251 /// call sites into their (cold) basic block.
4252 ///
4253 /// The motivation for handling sinking into cold blocks is that doing so can
4254 /// both enable other address mode sinking (by satisfying the register pressure
4255 /// constraint above), and reduce register pressure globally (by removing the
4256 /// addressing mode computation from the fast path entirely.).
4257 bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
4258                                         Type *AccessTy, unsigned AddrSpace) {
4259   Value *Repl = Addr;
4260
4261   // Try to collapse single-value PHI nodes.  This is necessary to undo
4262   // unprofitable PRE transformations.
4263   SmallVector<Value*, 8> worklist;
4264   SmallPtrSet<Value*, 16> Visited;
4265   worklist.push_back(Addr);
4266
4267   // Use a worklist to iteratively look through PHI nodes, and ensure that
4268   // the addressing mode obtained from the non-PHI roots of the graph
4269   // are equivalent.
4270   bool AddrModeFound = false;
4271   bool PhiSeen = false;
4272   SmallVector<Instruction*, 16> AddrModeInsts;
4273   ExtAddrMode AddrMode;
4274   TypePromotionTransaction TPT(RemovedInsts);
4275   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4276       TPT.getRestorationPoint();
4277   while (!worklist.empty()) {
4278     Value *V = worklist.back();
4279     worklist.pop_back();
4280
4281     // We allow traversing cyclic Phi nodes.
4282     // In case of success after this loop we ensure that traversing through
4283     // Phi nodes ends up with all cases to compute address of the form
4284     //    BaseGV + Base + Scale * Index + Offset
4285     // where Scale and Offset are constans and BaseGV, Base and Index
4286     // are exactly the same Values in all cases.
4287     // It means that BaseGV, Scale and Offset dominate our memory instruction
4288     // and have the same value as they had in address computation represented
4289     // as Phi. So we can safely sink address computation to memory instruction.
4290     if (!Visited.insert(V).second)
4291       continue;
4292
4293     // For a PHI node, push all of its incoming values.
4294     if (PHINode *P = dyn_cast<PHINode>(V)) {
4295       for (Value *IncValue : P->incoming_values())
4296         worklist.push_back(IncValue);
4297       PhiSeen = true;
4298       continue;
4299     }
4300
4301     // For non-PHIs, determine the addressing mode being computed.  Note that
4302     // the result may differ depending on what other uses our candidate
4303     // addressing instructions might have.
4304     AddrModeInsts.clear();
4305     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
4306         V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
4307         InsertedInsts, PromotedInsts, TPT);
4308
4309     if (!AddrModeFound) {
4310       AddrModeFound = true;
4311       AddrMode = NewAddrMode;
4312       continue;
4313     }
4314     if (NewAddrMode == AddrMode)
4315       continue;
4316
4317     AddrModeFound = false;
4318     break;
4319   }
4320
4321   // If the addressing mode couldn't be determined, or if multiple different
4322   // ones were determined, bail out now.
4323   if (!AddrModeFound) {
4324     TPT.rollback(LastKnownGood);
4325     return false;
4326   }
4327   TPT.commit();
4328
4329   // If all the instructions matched are already in this BB, don't do anything.
4330   // If we saw Phi node then it is not local definitely.
4331   if (!PhiSeen && none_of(AddrModeInsts, [&](Value *V) {
4332         return IsNonLocalValue(V, MemoryInst->getParent());
4333                   })) {
4334     DEBUG(dbgs() << "CGP: Found      local addrmode: " << AddrMode << "\n");
4335     return false;
4336   }
4337
4338   // Insert this computation right after this user.  Since our caller is
4339   // scanning from the top of the BB to the bottom, reuse of the expr are
4340   // guaranteed to happen later.
4341   IRBuilder<> Builder(MemoryInst);
4342
4343   // Now that we determined the addressing expression we want to use and know
4344   // that we have to sink it into this block.  Check to see if we have already
4345   // done this for some other load/store instr in this block.  If so, reuse the
4346   // computation.
4347   Value *&SunkAddr = SunkAddrs[Addr];
4348   if (SunkAddr) {
4349     DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
4350                  << *MemoryInst << "\n");
4351     if (SunkAddr->getType() != Addr->getType())
4352       SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
4353   } else if (AddrSinkUsingGEPs ||
4354              (!AddrSinkUsingGEPs.getNumOccurrences() && TM &&
4355               SubtargetInfo->useAA())) {
4356     // By default, we use the GEP-based method when AA is used later. This
4357     // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
4358     DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
4359                  << *MemoryInst << "\n");
4360     Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
4361     Value *ResultPtr = nullptr, *ResultIndex = nullptr;
4362
4363     // First, find the pointer.
4364     if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {
4365       ResultPtr = AddrMode.BaseReg;
4366       AddrMode.BaseReg = nullptr;
4367     }
4368
4369     if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {
4370       // We can't add more than one pointer together, nor can we scale a
4371       // pointer (both of which seem meaningless).
4372       if (ResultPtr || AddrMode.Scale != 1)
4373         return false;
4374
4375       ResultPtr = AddrMode.ScaledReg;
4376       AddrMode.Scale = 0;
4377     }
4378
4379     // It is only safe to sign extend the BaseReg if we know that the math
4380     // required to create it did not overflow before we extend it. Since
4381     // the original IR value was tossed in favor of a constant back when
4382     // the AddrMode was created we need to bail out gracefully if widths
4383     // do not match instead of extending it.
4384     //
4385     // (See below for code to add the scale.)
4386     if (AddrMode.Scale) {
4387       Type *ScaledRegTy = AddrMode.ScaledReg->getType();
4388       if (cast<IntegerType>(IntPtrTy)->getBitWidth() >
4389           cast<IntegerType>(ScaledRegTy)->getBitWidth())
4390         return false;
4391     }
4392
4393     if (AddrMode.BaseGV) {
4394       if (ResultPtr)
4395         return false;
4396
4397       ResultPtr = AddrMode.BaseGV;
4398     }
4399
4400     // If the real base value actually came from an inttoptr, then the matcher
4401     // will look through it and provide only the integer value. In that case,
4402     // use it here.
4403     if (!DL->isNonIntegralPointerType(Addr->getType())) {
4404       if (!ResultPtr && AddrMode.BaseReg) {
4405         ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(),
4406                                            "sunkaddr");
4407         AddrMode.BaseReg = nullptr;
4408       } else if (!ResultPtr && AddrMode.Scale == 1) {
4409         ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(),
4410                                            "sunkaddr");
4411         AddrMode.Scale = 0;
4412       }
4413     }
4414
4415     if (!ResultPtr &&
4416         !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) {
4417       SunkAddr = Constant::getNullValue(Addr->getType());
4418     } else if (!ResultPtr) {
4419       return false;
4420     } else {
4421       Type *I8PtrTy =
4422           Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
4423       Type *I8Ty = Builder.getInt8Ty();
4424
4425       // Start with the base register. Do this first so that subsequent address
4426       // matching finds it last, which will prevent it from trying to match it
4427       // as the scaled value in case it happens to be a mul. That would be
4428       // problematic if we've sunk a different mul for the scale, because then
4429       // we'd end up sinking both muls.
4430       if (AddrMode.BaseReg) {
4431         Value *V = AddrMode.BaseReg;
4432         if (V->getType() != IntPtrTy)
4433           V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
4434
4435         ResultIndex = V;
4436       }
4437
4438       // Add the scale value.
4439       if (AddrMode.Scale) {
4440         Value *V = AddrMode.ScaledReg;
4441         if (V->getType() == IntPtrTy) {
4442           // done.
4443         } else {
4444           assert(cast<IntegerType>(IntPtrTy)->getBitWidth() <
4445                  cast<IntegerType>(V->getType())->getBitWidth() &&
4446                  "We can't transform if ScaledReg is too narrow");
4447           V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
4448         }
4449
4450         if (AddrMode.Scale != 1)
4451           V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
4452                                 "sunkaddr");
4453         if (ResultIndex)
4454           ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr");
4455         else
4456           ResultIndex = V;
4457       }
4458
4459       // Add in the Base Offset if present.
4460       if (AddrMode.BaseOffs) {
4461         Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
4462         if (ResultIndex) {
4463           // We need to add this separately from the scale above to help with
4464           // SDAG consecutive load/store merging.
4465           if (ResultPtr->getType() != I8PtrTy)
4466             ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
4467           ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
4468         }
4469
4470         ResultIndex = V;
4471       }
4472
4473       if (!ResultIndex) {
4474         SunkAddr = ResultPtr;
4475       } else {
4476         if (ResultPtr->getType() != I8PtrTy)
4477           ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
4478         SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
4479       }
4480
4481       if (SunkAddr->getType() != Addr->getType())
4482         SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
4483     }
4484   } else {
4485     // We'd require a ptrtoint/inttoptr down the line, which we can't do for
4486     // non-integral pointers, so in that case bail out now.
4487     Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr;
4488     Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr;
4489     PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy);
4490     PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy);
4491     if (DL->isNonIntegralPointerType(Addr->getType()) ||
4492         (BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) ||
4493         (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) ||
4494         (AddrMode.BaseGV &&
4495          DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))
4496       return false;
4497
4498     DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
4499                  << *MemoryInst << "\n");
4500     Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
4501     Value *Result = nullptr;
4502
4503     // Start with the base register. Do this first so that subsequent address
4504     // matching finds it last, which will prevent it from trying to match it
4505     // as the scaled value in case it happens to be a mul. That would be
4506     // problematic if we've sunk a different mul for the scale, because then
4507     // we'd end up sinking both muls.
4508     if (AddrMode.BaseReg) {
4509       Value *V = AddrMode.BaseReg;
4510       if (V->getType()->isPointerTy())
4511         V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
4512       if (V->getType() != IntPtrTy)
4513         V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
4514       Result = V;
4515     }
4516
4517     // Add the scale value.
4518     if (AddrMode.Scale) {
4519       Value *V = AddrMode.ScaledReg;
4520       if (V->getType() == IntPtrTy) {
4521         // done.
4522       } else if (V->getType()->isPointerTy()) {
4523         V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
4524       } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
4525                  cast<IntegerType>(V->getType())->getBitWidth()) {
4526         V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
4527       } else {
4528         // It is only safe to sign extend the BaseReg if we know that the math
4529         // required to create it did not overflow before we extend it. Since
4530         // the original IR value was tossed in favor of a constant back when
4531         // the AddrMode was created we need to bail out gracefully if widths
4532         // do not match instead of extending it.
4533         Instruction *I = dyn_cast_or_null<Instruction>(Result);
4534         if (I && (Result != AddrMode.BaseReg))
4535           I->eraseFromParent();
4536         return false;
4537       }
4538       if (AddrMode.Scale != 1)
4539         V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
4540                               "sunkaddr");
4541       if (Result)
4542         Result = Builder.CreateAdd(Result, V, "sunkaddr");
4543       else
4544         Result = V;
4545     }
4546
4547     // Add in the BaseGV if present.
4548     if (AddrMode.BaseGV) {
4549       Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
4550       if (Result)
4551         Result = Builder.CreateAdd(Result, V, "sunkaddr");
4552       else
4553         Result = V;
4554     }
4555
4556     // Add in the Base Offset if present.
4557     if (AddrMode.BaseOffs) {
4558       Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
4559       if (Result)
4560         Result = Builder.CreateAdd(Result, V, "sunkaddr");
4561       else
4562         Result = V;
4563     }
4564
4565     if (!Result)
4566       SunkAddr = Constant::getNullValue(Addr->getType());
4567     else
4568       SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
4569   }
4570
4571   MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
4572
4573   // If we have no uses, recursively delete the value and all dead instructions
4574   // using it.
4575   if (Repl->use_empty()) {
4576     // This can cause recursive deletion, which can invalidate our iterator.
4577     // Use a WeakTrackingVH to hold onto it in case this happens.
4578     Value *CurValue = &*CurInstIterator;
4579     WeakTrackingVH IterHandle(CurValue);
4580     BasicBlock *BB = CurInstIterator->getParent();
4581
4582     RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
4583
4584     if (IterHandle != CurValue) {
4585       // If the iterator instruction was recursively deleted, start over at the
4586       // start of the block.
4587       CurInstIterator = BB->begin();
4588       SunkAddrs.clear();
4589     }
4590   }
4591   ++NumMemoryInsts;
4592   return true;
4593 }
4594
4595 /// If there are any memory operands, use OptimizeMemoryInst to sink their
4596 /// address computing into the block when possible / profitable.
4597 bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
4598   bool MadeChange = false;
4599
4600   const TargetRegisterInfo *TRI =
4601       TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();
4602   TargetLowering::AsmOperandInfoVector TargetConstraints =
4603       TLI->ParseConstraints(*DL, TRI, CS);
4604   unsigned ArgNo = 0;
4605   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
4606     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
4607
4608     // Compute the constraint code and ConstraintType to use.
4609     TLI->ComputeConstraintToUse(OpInfo, SDValue());
4610
4611     if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
4612         OpInfo.isIndirect) {
4613       Value *OpVal = CS->getArgOperand(ArgNo++);
4614       MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
4615     } else if (OpInfo.Type == InlineAsm::isInput)
4616       ArgNo++;
4617   }
4618
4619   return MadeChange;
4620 }
4621
4622 /// \brief Check if all the uses of \p Val are equivalent (or free) zero or
4623 /// sign extensions.
4624 static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
4625   assert(!Val->use_empty() && "Input must have at least one use");
4626   const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());
4627   bool IsSExt = isa<SExtInst>(FirstUser);
4628   Type *ExtTy = FirstUser->getType();
4629   for (const User *U : Val->users()) {
4630     const Instruction *UI = cast<Instruction>(U);
4631     if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
4632       return false;
4633     Type *CurTy = UI->getType();
4634     // Same input and output types: Same instruction after CSE.
4635     if (CurTy == ExtTy)
4636       continue;
4637
4638     // If IsSExt is true, we are in this situation:
4639     // a = Val
4640     // b = sext ty1 a to ty2
4641     // c = sext ty1 a to ty3
4642     // Assuming ty2 is shorter than ty3, this could be turned into:
4643     // a = Val
4644     // b = sext ty1 a to ty2
4645     // c = sext ty2 b to ty3
4646     // However, the last sext is not free.
4647     if (IsSExt)
4648       return false;
4649
4650     // This is a ZExt, maybe this is free to extend from one type to another.
4651     // In that case, we would not account for a different use.
4652     Type *NarrowTy;
4653     Type *LargeTy;
4654     if (ExtTy->getScalarType()->getIntegerBitWidth() >
4655         CurTy->getScalarType()->getIntegerBitWidth()) {
4656       NarrowTy = CurTy;
4657       LargeTy = ExtTy;
4658     } else {
4659       NarrowTy = ExtTy;
4660       LargeTy = CurTy;
4661     }
4662
4663     if (!TLI.isZExtFree(NarrowTy, LargeTy))
4664       return false;
4665   }
4666   // All uses are the same or can be derived from one another for free.
4667   return true;
4668 }
4669
4670 /// \brief Try to speculatively promote extensions in \p Exts and continue
4671 /// promoting through newly promoted operands recursively as far as doing so is
4672 /// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts.
4673 /// When some promotion happened, \p TPT contains the proper state to revert
4674 /// them.
4675 ///
4676 /// \return true if some promotion happened, false otherwise.
4677 bool CodeGenPrepare::tryToPromoteExts(
4678     TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,
4679     SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
4680     unsigned CreatedInstsCost) {
4681   bool Promoted = false;
4682
4683   // Iterate over all the extensions to try to promote them.
4684   for (auto I : Exts) {
4685     // Early check if we directly have ext(load).
4686     if (isa<LoadInst>(I->getOperand(0))) {
4687       ProfitablyMovedExts.push_back(I);
4688       continue;
4689     }
4690
4691     // Check whether or not we want to do any promotion.  The reason we have
4692     // this check inside the for loop is to catch the case where an extension
4693     // is directly fed by a load because in such case the extension can be moved
4694     // up without any promotion on its operands.
4695     if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
4696       return false;
4697
4698     // Get the action to perform the promotion.
4699     TypePromotionHelper::Action TPH =
4700         TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);
4701     // Check if we can promote.
4702     if (!TPH) {
4703       // Save the current extension as we cannot move up through its operand.
4704       ProfitablyMovedExts.push_back(I);
4705       continue;
4706     }
4707
4708     // Save the current state.
4709     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4710         TPT.getRestorationPoint();
4711     SmallVector<Instruction *, 4> NewExts;
4712     unsigned NewCreatedInstsCost = 0;
4713     unsigned ExtCost = !TLI->isExtFree(I);
4714     // Promote.
4715     Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
4716                              &NewExts, nullptr, *TLI);
4717     assert(PromotedVal &&
4718            "TypePromotionHelper should have filtered out those cases");
4719
4720     // We would be able to merge only one extension in a load.
4721     // Therefore, if we have more than 1 new extension we heuristically
4722     // cut this search path, because it means we degrade the code quality.
4723     // With exactly 2, the transformation is neutral, because we will merge
4724     // one extension but leave one. However, we optimistically keep going,
4725     // because the new extension may be removed too.
4726     long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
4727     // FIXME: It would be possible to propagate a negative value instead of
4728     // conservatively ceiling it to 0.
4729     TotalCreatedInstsCost =
4730         std::max((long long)0, (TotalCreatedInstsCost - ExtCost));
4731     if (!StressExtLdPromotion &&
4732         (TotalCreatedInstsCost > 1 ||
4733          !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
4734       // This promotion is not profitable, rollback to the previous state, and
4735       // save the current extension in ProfitablyMovedExts as the latest
4736       // speculative promotion turned out to be unprofitable.
4737       TPT.rollback(LastKnownGood);
4738       ProfitablyMovedExts.push_back(I);
4739       continue;
4740     }
4741     // Continue promoting NewExts as far as doing so is profitable.
4742     SmallVector<Instruction *, 2> NewlyMovedExts;
4743     (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
4744     bool NewPromoted = false;
4745     for (auto ExtInst : NewlyMovedExts) {
4746       Instruction *MovedExt = cast<Instruction>(ExtInst);
4747       Value *ExtOperand = MovedExt->getOperand(0);
4748       // If we have reached to a load, we need this extra profitability check
4749       // as it could potentially be merged into an ext(load).
4750       if (isa<LoadInst>(ExtOperand) &&
4751           !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
4752             (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
4753         continue;
4754
4755       ProfitablyMovedExts.push_back(MovedExt);
4756       NewPromoted = true;
4757     }
4758
4759     // If none of speculative promotions for NewExts is profitable, rollback
4760     // and save the current extension (I) as the last profitable extension.
4761     if (!NewPromoted) {
4762       TPT.rollback(LastKnownGood);
4763       ProfitablyMovedExts.push_back(I);
4764       continue;
4765     }
4766     // The promotion is profitable.
4767     Promoted = true;
4768   }
4769   return Promoted;
4770 }
4771
4772 /// Merging redundant sexts when one is dominating the other.
4773 bool CodeGenPrepare::mergeSExts(Function &F) {
4774   DominatorTree DT(F);
4775   bool Changed = false;
4776   for (auto &Entry : ValToSExtendedUses) {
4777     SExts &Insts = Entry.second;
4778     SExts CurPts;
4779     for (Instruction *Inst : Insts) {
4780       if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||
4781           Inst->getOperand(0) != Entry.first)
4782         continue;
4783       bool inserted = false;
4784       for (auto &Pt : CurPts) {
4785         if (DT.dominates(Inst, Pt)) {
4786           Pt->replaceAllUsesWith(Inst);
4787           RemovedInsts.insert(Pt);
4788           Pt->removeFromParent();
4789           Pt = Inst;
4790           inserted = true;
4791           Changed = true;
4792           break;
4793         }
4794         if (!DT.dominates(Pt, Inst))
4795           // Give up if we need to merge in a common dominator as the
4796           // expermients show it is not profitable.
4797           continue;
4798         Inst->replaceAllUsesWith(Pt);
4799         RemovedInsts.insert(Inst);
4800         Inst->removeFromParent();
4801         inserted = true;
4802         Changed = true;
4803         break;
4804       }
4805       if (!inserted)
4806         CurPts.push_back(Inst);
4807     }
4808   }
4809   return Changed;
4810 }
4811
4812 /// Return true, if an ext(load) can be formed from an extension in
4813 /// \p MovedExts.
4814 bool CodeGenPrepare::canFormExtLd(
4815     const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
4816     Instruction *&Inst, bool HasPromoted) {
4817   for (auto *MovedExtInst : MovedExts) {
4818     if (isa<LoadInst>(MovedExtInst->getOperand(0))) {
4819       LI = cast<LoadInst>(MovedExtInst->getOperand(0));
4820       Inst = MovedExtInst;
4821       break;
4822     }
4823   }
4824   if (!LI)
4825     return false;
4826
4827   // If they're already in the same block, there's nothing to do.
4828   // Make the cheap checks first if we did not promote.
4829   // If we promoted, we need to check if it is indeed profitable.
4830   if (!HasPromoted && LI->getParent() == Inst->getParent())
4831     return false;
4832
4833   return TLI->isExtLoad(LI, Inst, *DL);
4834 }
4835
4836 /// Move a zext or sext fed by a load into the same basic block as the load,
4837 /// unless conditions are unfavorable. This allows SelectionDAG to fold the
4838 /// extend into the load.
4839 ///
4840 /// E.g.,
4841 /// \code
4842 /// %ld = load i32* %addr
4843 /// %add = add nuw i32 %ld, 4
4844 /// %zext = zext i32 %add to i64
4845 // \endcode
4846 /// =>
4847 /// \code
4848 /// %ld = load i32* %addr
4849 /// %zext = zext i32 %ld to i64
4850 /// %add = add nuw i64 %zext, 4
4851 /// \encode
4852 /// Note that the promotion in %add to i64 is done in tryToPromoteExts(), which
4853 /// allow us to match zext(load i32*) to i64.
4854 ///
4855 /// Also, try to promote the computations used to obtain a sign extended
4856 /// value used into memory accesses.
4857 /// E.g.,
4858 /// \code
4859 /// a = add nsw i32 b, 3
4860 /// d = sext i32 a to i64
4861 /// e = getelementptr ..., i64 d
4862 /// \endcode
4863 /// =>
4864 /// \code
4865 /// f = sext i32 b to i64
4866 /// a = add nsw i64 f, 3
4867 /// e = getelementptr ..., i64 a
4868 /// \endcode
4869 ///
4870 /// \p Inst[in/out] the extension may be modified during the process if some
4871 /// promotions apply.
4872 bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
4873   // ExtLoad formation and address type promotion infrastructure requires TLI to
4874   // be effective.
4875   if (!TLI)
4876     return false;
4877
4878   bool AllowPromotionWithoutCommonHeader = false;
4879   /// See if it is an interesting sext operations for the address type
4880   /// promotion before trying to promote it, e.g., the ones with the right
4881   /// type and used in memory accesses.
4882   bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(
4883       *Inst, AllowPromotionWithoutCommonHeader);
4884   TypePromotionTransaction TPT(RemovedInsts);
4885   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
4886       TPT.getRestorationPoint();
4887   SmallVector<Instruction *, 1> Exts;
4888   SmallVector<Instruction *, 2> SpeculativelyMovedExts;
4889   Exts.push_back(Inst);
4890
4891   bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts);
4892
4893   // Look for a load being extended.
4894   LoadInst *LI = nullptr;
4895   Instruction *ExtFedByLoad;
4896
4897   // Try to promote a chain of computation if it allows to form an extended
4898   // load.
4899   if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
4900     assert(LI && ExtFedByLoad && "Expect a valid load and extension");
4901     TPT.commit();
4902     // Move the extend into the same block as the load
4903     ExtFedByLoad->removeFromParent();
4904     ExtFedByLoad->insertAfter(LI);
4905     // CGP does not check if the zext would be speculatively executed when moved
4906     // to the same basic block as the load. Preserving its original location
4907     // would pessimize the debugging experience, as well as negatively impact
4908     // the quality of sample pgo. We don't want to use "line 0" as that has a
4909     // size cost in the line-table section and logically the zext can be seen as
4910     // part of the load. Therefore we conservatively reuse the same debug
4911     // location for the load and the zext.
4912     ExtFedByLoad->setDebugLoc(LI->getDebugLoc());
4913     ++NumExtsMoved;
4914     Inst = ExtFedByLoad;
4915     return true;
4916   }
4917
4918   // Continue promoting SExts if known as considerable depending on targets.
4919   if (ATPConsiderable &&
4920       performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,
4921                                   HasPromoted, TPT, SpeculativelyMovedExts))
4922     return true;
4923
4924   TPT.rollback(LastKnownGood);
4925   return false;
4926 }
4927
4928 // Perform address type promotion if doing so is profitable.
4929 // If AllowPromotionWithoutCommonHeader == false, we should find other sext
4930 // instructions that sign extended the same initial value. However, if
4931 // AllowPromotionWithoutCommonHeader == true, we expect promoting the
4932 // extension is just profitable.
4933 bool CodeGenPrepare::performAddressTypePromotion(
4934     Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
4935     bool HasPromoted, TypePromotionTransaction &TPT,
4936     SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
4937   bool Promoted = false;
4938   SmallPtrSet<Instruction *, 1> UnhandledExts;
4939   bool AllSeenFirst = true;
4940   for (auto I : SpeculativelyMovedExts) {
4941     Value *HeadOfChain = I->getOperand(0);
4942     DenseMap<Value *, Instruction *>::iterator AlreadySeen =
4943         SeenChainsForSExt.find(HeadOfChain);
4944     // If there is an unhandled SExt which has the same header, try to promote
4945     // it as well.
4946     if (AlreadySeen != SeenChainsForSExt.end()) {
4947       if (AlreadySeen->second != nullptr)
4948         UnhandledExts.insert(AlreadySeen->second);
4949       AllSeenFirst = false;
4950     }
4951   }
4952
4953   if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
4954                         SpeculativelyMovedExts.size() == 1)) {
4955     TPT.commit();
4956     if (HasPromoted)
4957       Promoted = true;
4958     for (auto I : SpeculativelyMovedExts) {
4959       Value *HeadOfChain = I->getOperand(0);
4960       SeenChainsForSExt[HeadOfChain] = nullptr;
4961       ValToSExtendedUses[HeadOfChain].push_back(I);
4962     }
4963     // Update Inst as promotion happen.
4964     Inst = SpeculativelyMovedExts.pop_back_val();
4965   } else {
4966     // This is the first chain visited from the header, keep the current chain
4967     // as unhandled. Defer to promote this until we encounter another SExt
4968     // chain derived from the same header.
4969     for (auto I : SpeculativelyMovedExts) {
4970       Value *HeadOfChain = I->getOperand(0);
4971       SeenChainsForSExt[HeadOfChain] = Inst;
4972     }
4973     return false;
4974   }
4975
4976   if (!AllSeenFirst && !UnhandledExts.empty())
4977     for (auto VisitedSExt : UnhandledExts) {
4978       if (RemovedInsts.count(VisitedSExt))
4979         continue;
4980       TypePromotionTransaction TPT(RemovedInsts);
4981       SmallVector<Instruction *, 1> Exts;
4982       SmallVector<Instruction *, 2> Chains;
4983       Exts.push_back(VisitedSExt);
4984       bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains);
4985       TPT.commit();
4986       if (HasPromoted)
4987         Promoted = true;
4988       for (auto I : Chains) {
4989         Value *HeadOfChain = I->getOperand(0);
4990         // Mark this as handled.
4991         SeenChainsForSExt[HeadOfChain] = nullptr;
4992         ValToSExtendedUses[HeadOfChain].push_back(I);
4993       }
4994     }
4995   return Promoted;
4996 }
4997
4998 bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
4999   BasicBlock *DefBB = I->getParent();
5000
5001   // If the result of a {s|z}ext and its source are both live out, rewrite all
5002   // other uses of the source with result of extension.
5003   Value *Src = I->getOperand(0);
5004   if (Src->hasOneUse())
5005     return false;
5006
5007   // Only do this xform if truncating is free.
5008   if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
5009     return false;
5010
5011   // Only safe to perform the optimization if the source is also defined in
5012   // this block.
5013   if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
5014     return false;
5015
5016   bool DefIsLiveOut = false;
5017   for (User *U : I->users()) {
5018     Instruction *UI = cast<Instruction>(U);
5019
5020     // Figure out which BB this ext is used in.
5021     BasicBlock *UserBB = UI->getParent();
5022     if (UserBB == DefBB) continue;
5023     DefIsLiveOut = true;
5024     break;
5025   }
5026   if (!DefIsLiveOut)
5027     return false;
5028
5029   // Make sure none of the uses are PHI nodes.
5030   for (User *U : Src->users()) {
5031     Instruction *UI = cast<Instruction>(U);
5032     BasicBlock *UserBB = UI->getParent();
5033     if (UserBB == DefBB) continue;
5034     // Be conservative. We don't want this xform to end up introducing
5035     // reloads just before load / store instructions.
5036     if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI))
5037       return false;
5038   }
5039
5040   // InsertedTruncs - Only insert one trunc in each block once.
5041   DenseMap<BasicBlock*, Instruction*> InsertedTruncs;
5042
5043   bool MadeChange = false;
5044   for (Use &U : Src->uses()) {
5045     Instruction *User = cast<Instruction>(U.getUser());
5046
5047     // Figure out which BB this ext is used in.
5048     BasicBlock *UserBB = User->getParent();
5049     if (UserBB == DefBB) continue;
5050
5051     // Both src and def are live in this block. Rewrite the use.
5052     Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
5053
5054     if (!InsertedTrunc) {
5055       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
5056       assert(InsertPt != UserBB->end());
5057       InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt);
5058       InsertedInsts.insert(InsertedTrunc);
5059     }
5060
5061     // Replace a use of the {s|z}ext source with a use of the result.
5062     U = InsertedTrunc;
5063     ++NumExtUses;
5064     MadeChange = true;
5065   }
5066
5067   return MadeChange;
5068 }
5069
5070 // Find loads whose uses only use some of the loaded value's bits.  Add an "and"
5071 // just after the load if the target can fold this into one extload instruction,
5072 // with the hope of eliminating some of the other later "and" instructions using
5073 // the loaded value.  "and"s that are made trivially redundant by the insertion
5074 // of the new "and" are removed by this function, while others (e.g. those whose
5075 // path from the load goes through a phi) are left for isel to potentially
5076 // remove.
5077 //
5078 // For example:
5079 //
5080 // b0:
5081 //   x = load i32
5082 //   ...
5083 // b1:
5084 //   y = and x, 0xff
5085 //   z = use y
5086 //
5087 // becomes:
5088 //
5089 // b0:
5090 //   x = load i32
5091 //   x' = and x, 0xff
5092 //   ...
5093 // b1:
5094 //   z = use x'
5095 //
5096 // whereas:
5097 //
5098 // b0:
5099 //   x1 = load i32
5100 //   ...
5101 // b1:
5102 //   x2 = load i32
5103 //   ...
5104 // b2:
5105 //   x = phi x1, x2
5106 //   y = and x, 0xff
5107 //
5108 // becomes (after a call to optimizeLoadExt for each load):
5109 //
5110 // b0:
5111 //   x1 = load i32
5112 //   x1' = and x1, 0xff
5113 //   ...
5114 // b1:
5115 //   x2 = load i32
5116 //   x2' = and x2, 0xff
5117 //   ...
5118 // b2:
5119 //   x = phi x1', x2'
5120 //   y = and x, 0xff
5121 //
5122
5123 bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
5124
5125   if (!Load->isSimple() ||
5126       !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
5127     return false;
5128
5129   // Skip loads we've already transformed.
5130   if (Load->hasOneUse() &&
5131       InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
5132     return false;
5133
5134   // Look at all uses of Load, looking through phis, to determine how many bits
5135   // of the loaded value are needed.
5136   SmallVector<Instruction *, 8> WorkList;
5137   SmallPtrSet<Instruction *, 16> Visited;
5138   SmallVector<Instruction *, 8> AndsToMaybeRemove;
5139   for (auto *U : Load->users())
5140     WorkList.push_back(cast<Instruction>(U));
5141
5142   EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
5143   unsigned BitWidth = LoadResultVT.getSizeInBits();
5144   APInt DemandBits(BitWidth, 0);
5145   APInt WidestAndBits(BitWidth, 0);
5146
5147   while (!WorkList.empty()) {
5148     Instruction *I = WorkList.back();
5149     WorkList.pop_back();
5150
5151     // Break use-def graph loops.
5152     if (!Visited.insert(I).second)
5153       continue;
5154
5155     // For a PHI node, push all of its users.
5156     if (auto *Phi = dyn_cast<PHINode>(I)) {
5157       for (auto *U : Phi->users())
5158         WorkList.push_back(cast<Instruction>(U));
5159       continue;
5160     }
5161
5162     switch (I->getOpcode()) {
5163     case llvm::Instruction::And: {
5164       auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
5165       if (!AndC)
5166         return false;
5167       APInt AndBits = AndC->getValue();
5168       DemandBits |= AndBits;
5169       // Keep track of the widest and mask we see.
5170       if (AndBits.ugt(WidestAndBits))
5171         WidestAndBits = AndBits;
5172       if (AndBits == WidestAndBits && I->getOperand(0) == Load)
5173         AndsToMaybeRemove.push_back(I);
5174       break;
5175     }
5176
5177     case llvm::Instruction::Shl: {
5178       auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
5179       if (!ShlC)
5180         return false;
5181       uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
5182       DemandBits.setLowBits(BitWidth - ShiftAmt);
5183       break;
5184     }
5185
5186     case llvm::Instruction::Trunc: {
5187       EVT TruncVT = TLI->getValueType(*DL, I->getType());
5188       unsigned TruncBitWidth = TruncVT.getSizeInBits();
5189       DemandBits.setLowBits(TruncBitWidth);
5190       break;
5191     }
5192
5193     default:
5194       return false;
5195     }
5196   }
5197
5198   uint32_t ActiveBits = DemandBits.getActiveBits();
5199   // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
5200   // target even if isLoadExtLegal says an i1 EXTLOAD is valid.  For example,
5201   // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
5202   // (and (load x) 1) is not matched as a single instruction, rather as a LDR
5203   // followed by an AND.
5204   // TODO: Look into removing this restriction by fixing backends to either
5205   // return false for isLoadExtLegal for i1 or have them select this pattern to
5206   // a single instruction.
5207   //
5208   // Also avoid hoisting if we didn't see any ands with the exact DemandBits
5209   // mask, since these are the only ands that will be removed by isel.
5210   if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||
5211       WidestAndBits != DemandBits)
5212     return false;
5213
5214   LLVMContext &Ctx = Load->getType()->getContext();
5215   Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
5216   EVT TruncVT = TLI->getValueType(*DL, TruncTy);
5217
5218   // Reject cases that won't be matched as extloads.
5219   if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
5220       !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
5221     return false;
5222
5223   IRBuilder<> Builder(Load->getNextNode());
5224   auto *NewAnd = dyn_cast<Instruction>(
5225       Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
5226   // Mark this instruction as "inserted by CGP", so that other
5227   // optimizations don't touch it.
5228   InsertedInsts.insert(NewAnd);
5229
5230   // Replace all uses of load with new and (except for the use of load in the
5231   // new and itself).
5232   Load->replaceAllUsesWith(NewAnd);
5233   NewAnd->setOperand(0, Load);
5234
5235   // Remove any and instructions that are now redundant.
5236   for (auto *And : AndsToMaybeRemove)
5237     // Check that the and mask is the same as the one we decided to put on the
5238     // new and.
5239     if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
5240       And->replaceAllUsesWith(NewAnd);
5241       if (&*CurInstIterator == And)
5242         CurInstIterator = std::next(And->getIterator());
5243       And->eraseFromParent();
5244       ++NumAndUses;
5245     }
5246
5247   ++NumAndsAdded;
5248   return true;
5249 }
5250
5251 /// Check if V (an operand of a select instruction) is an expensive instruction
5252 /// that is only used once.
5253 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
5254   auto *I = dyn_cast<Instruction>(V);
5255   // If it's safe to speculatively execute, then it should not have side
5256   // effects; therefore, it's safe to sink and possibly *not* execute.
5257   return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
5258          TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive;
5259 }
5260
5261 /// Returns true if a SelectInst should be turned into an explicit branch.
5262 static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
5263                                                 const TargetLowering *TLI,
5264                                                 SelectInst *SI) {
5265   // If even a predictable select is cheap, then a branch can't be cheaper.
5266   if (!TLI->isPredictableSelectExpensive())
5267     return false;
5268
5269   // FIXME: This should use the same heuristics as IfConversion to determine
5270   // whether a select is better represented as a branch.
5271
5272   // If metadata tells us that the select condition is obviously predictable,
5273   // then we want to replace the select with a branch.
5274   uint64_t TrueWeight, FalseWeight;
5275   if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
5276     uint64_t Max = std::max(TrueWeight, FalseWeight);
5277     uint64_t Sum = TrueWeight + FalseWeight;
5278     if (Sum != 0) {
5279       auto Probability = BranchProbability::getBranchProbability(Max, Sum);
5280       if (Probability > TLI->getPredictableBranchThreshold())
5281         return true;
5282     }
5283   }
5284
5285   CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
5286
5287   // If a branch is predictable, an out-of-order CPU can avoid blocking on its
5288   // comparison condition. If the compare has more than one use, there's
5289   // probably another cmov or setcc around, so it's not worth emitting a branch.
5290   if (!Cmp || !Cmp->hasOneUse())
5291     return false;
5292
5293   // If either operand of the select is expensive and only needed on one side
5294   // of the select, we should form a branch.
5295   if (sinkSelectOperand(TTI, SI->getTrueValue()) ||
5296       sinkSelectOperand(TTI, SI->getFalseValue()))
5297     return true;
5298
5299   return false;
5300 }
5301
5302 /// If \p isTrue is true, return the true value of \p SI, otherwise return
5303 /// false value of \p SI. If the true/false value of \p SI is defined by any
5304 /// select instructions in \p Selects, look through the defining select
5305 /// instruction until the true/false value is not defined in \p Selects.
5306 static Value *getTrueOrFalseValue(
5307     SelectInst *SI, bool isTrue,
5308     const SmallPtrSet<const Instruction *, 2> &Selects) {
5309   Value *V;
5310
5311   for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
5312        DefSI = dyn_cast<SelectInst>(V)) {
5313     assert(DefSI->getCondition() == SI->getCondition() &&
5314            "The condition of DefSI does not match with SI");
5315     V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
5316   }
5317   return V;
5318 }
5319
5320 /// If we have a SelectInst that will likely profit from branch prediction,
5321 /// turn it into a branch.
5322 bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
5323   // Find all consecutive select instructions that share the same condition.
5324   SmallVector<SelectInst *, 2> ASI;
5325   ASI.push_back(SI);
5326   for (BasicBlock::iterator It = ++BasicBlock::iterator(SI);
5327        It != SI->getParent()->end(); ++It) {
5328     SelectInst *I = dyn_cast<SelectInst>(&*It);
5329     if (I && SI->getCondition() == I->getCondition()) {
5330       ASI.push_back(I);
5331     } else {
5332       break;
5333     }
5334   }
5335
5336   SelectInst *LastSI = ASI.back();
5337   // Increment the current iterator to skip all the rest of select instructions
5338   // because they will be either "not lowered" or "all lowered" to branch.
5339   CurInstIterator = std::next(LastSI->getIterator());
5340
5341   bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
5342
5343   // Can we convert the 'select' to CF ?
5344   if (DisableSelectToBranch || OptSize || !TLI || VectorCond ||
5345       SI->getMetadata(LLVMContext::MD_unpredictable))
5346     return false;
5347
5348   TargetLowering::SelectSupportKind SelectKind;
5349   if (VectorCond)
5350     SelectKind = TargetLowering::VectorMaskSelect;
5351   else if (SI->getType()->isVectorTy())
5352     SelectKind = TargetLowering::ScalarCondVectorVal;
5353   else
5354     SelectKind = TargetLowering::ScalarValSelect;
5355
5356   if (TLI->isSelectSupported(SelectKind) &&
5357       !isFormingBranchFromSelectProfitable(TTI, TLI, SI))
5358     return false;
5359
5360   ModifiedDT = true;
5361
5362   // Transform a sequence like this:
5363   //    start:
5364   //       %cmp = cmp uge i32 %a, %b
5365   //       %sel = select i1 %cmp, i32 %c, i32 %d
5366   //
5367   // Into:
5368   //    start:
5369   //       %cmp = cmp uge i32 %a, %b
5370   //       br i1 %cmp, label %select.true, label %select.false
5371   //    select.true:
5372   //       br label %select.end
5373   //    select.false:
5374   //       br label %select.end
5375   //    select.end:
5376   //       %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
5377   //
5378   // In addition, we may sink instructions that produce %c or %d from
5379   // the entry block into the destination(s) of the new branch.
5380   // If the true or false blocks do not contain a sunken instruction, that
5381   // block and its branch may be optimized away. In that case, one side of the
5382   // first branch will point directly to select.end, and the corresponding PHI
5383   // predecessor block will be the start block.
5384
5385   // First, we split the block containing the select into 2 blocks.
5386   BasicBlock *StartBlock = SI->getParent();
5387   BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
5388   BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
5389
5390   // Delete the unconditional branch that was just created by the split.
5391   StartBlock->getTerminator()->eraseFromParent();
5392
5393   // These are the new basic blocks for the conditional branch.
5394   // At least one will become an actual new basic block.
5395   BasicBlock *TrueBlock = nullptr;
5396   BasicBlock *FalseBlock = nullptr;
5397   BranchInst *TrueBranch = nullptr;
5398   BranchInst *FalseBranch = nullptr;
5399
5400   // Sink expensive instructions into the conditional blocks to avoid executing
5401   // them speculatively.
5402   for (SelectInst *SI : ASI) {
5403     if (sinkSelectOperand(TTI, SI->getTrueValue())) {
5404       if (TrueBlock == nullptr) {
5405         TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink",
5406                                        EndBlock->getParent(), EndBlock);
5407         TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
5408       }
5409       auto *TrueInst = cast<Instruction>(SI->getTrueValue());
5410       TrueInst->moveBefore(TrueBranch);
5411     }
5412     if (sinkSelectOperand(TTI, SI->getFalseValue())) {
5413       if (FalseBlock == nullptr) {
5414         FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink",
5415                                         EndBlock->getParent(), EndBlock);
5416         FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
5417       }
5418       auto *FalseInst = cast<Instruction>(SI->getFalseValue());
5419       FalseInst->moveBefore(FalseBranch);
5420     }
5421   }
5422
5423   // If there was nothing to sink, then arbitrarily choose the 'false' side
5424   // for a new input value to the PHI.
5425   if (TrueBlock == FalseBlock) {
5426     assert(TrueBlock == nullptr &&
5427            "Unexpected basic block transform while optimizing select");
5428
5429     FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
5430                                     EndBlock->getParent(), EndBlock);
5431     BranchInst::Create(EndBlock, FalseBlock);
5432   }
5433
5434   // Insert the real conditional branch based on the original condition.
5435   // If we did not create a new block for one of the 'true' or 'false' paths
5436   // of the condition, it means that side of the branch goes to the end block
5437   // directly and the path originates from the start block from the point of
5438   // view of the new PHI.
5439   BasicBlock *TT, *FT;
5440   if (TrueBlock == nullptr) {
5441     TT = EndBlock;
5442     FT = FalseBlock;
5443     TrueBlock = StartBlock;
5444   } else if (FalseBlock == nullptr) {
5445     TT = TrueBlock;
5446     FT = EndBlock;
5447     FalseBlock = StartBlock;
5448   } else {
5449     TT = TrueBlock;
5450     FT = FalseBlock;
5451   }
5452   IRBuilder<>(SI).CreateCondBr(SI->getCondition(), TT, FT, SI);
5453
5454   SmallPtrSet<const Instruction *, 2> INS;
5455   INS.insert(ASI.begin(), ASI.end());
5456   // Use reverse iterator because later select may use the value of the
5457   // earlier select, and we need to propagate value through earlier select
5458   // to get the PHI operand.
5459   for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
5460     SelectInst *SI = *It;
5461     // The select itself is replaced with a PHI Node.
5462     PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
5463     PN->takeName(SI);
5464     PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
5465     PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
5466
5467     SI->replaceAllUsesWith(PN);
5468     SI->eraseFromParent();
5469     INS.erase(SI);
5470     ++NumSelectsExpanded;
5471   }
5472
5473   // Instruct OptimizeBlock to skip to the next block.
5474   CurInstIterator = StartBlock->end();
5475   return true;
5476 }
5477
5478 static bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
5479   SmallVector<int, 16> Mask(SVI->getShuffleMask());
5480   int SplatElem = -1;
5481   for (unsigned i = 0; i < Mask.size(); ++i) {
5482     if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem)
5483       return false;
5484     SplatElem = Mask[i];
5485   }
5486
5487   return true;
5488 }
5489
5490 /// Some targets have expensive vector shifts if the lanes aren't all the same
5491 /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
5492 /// it's often worth sinking a shufflevector splat down to its use so that
5493 /// codegen can spot all lanes are identical.
5494 bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
5495   BasicBlock *DefBB = SVI->getParent();
5496
5497   // Only do this xform if variable vector shifts are particularly expensive.
5498   if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType()))
5499     return false;
5500
5501   // We only expect better codegen by sinking a shuffle if we can recognise a
5502   // constant splat.
5503   if (!isBroadcastShuffle(SVI))
5504     return false;
5505
5506   // InsertedShuffles - Only insert a shuffle in each block once.
5507   DenseMap<BasicBlock*, Instruction*> InsertedShuffles;
5508
5509   bool MadeChange = false;
5510   for (User *U : SVI->users()) {
5511     Instruction *UI = cast<Instruction>(U);
5512
5513     // Figure out which BB this ext is used in.
5514     BasicBlock *UserBB = UI->getParent();
5515     if (UserBB == DefBB) continue;
5516
5517     // For now only apply this when the splat is used by a shift instruction.
5518     if (!UI->isShift()) continue;
5519
5520     // Everything checks out, sink the shuffle if the user's block doesn't
5521     // already have a copy.
5522     Instruction *&InsertedShuffle = InsertedShuffles[UserBB];
5523
5524     if (!InsertedShuffle) {
5525       BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
5526       assert(InsertPt != UserBB->end());
5527       InsertedShuffle =
5528           new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
5529                                 SVI->getOperand(2), "", &*InsertPt);
5530     }
5531
5532     UI->replaceUsesOfWith(SVI, InsertedShuffle);
5533     MadeChange = true;
5534   }
5535
5536   // If we removed all uses, nuke the shuffle.
5537   if (SVI->use_empty()) {
5538     SVI->eraseFromParent();
5539     MadeChange = true;
5540   }
5541
5542   return MadeChange;
5543 }
5544
5545 bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
5546   if (!TLI || !DL)
5547     return false;
5548
5549   Value *Cond = SI->getCondition();
5550   Type *OldType = Cond->getType();
5551   LLVMContext &Context = Cond->getContext();
5552   MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
5553   unsigned RegWidth = RegType.getSizeInBits();
5554
5555   if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
5556     return false;
5557
5558   // If the register width is greater than the type width, expand the condition
5559   // of the switch instruction and each case constant to the width of the
5560   // register. By widening the type of the switch condition, subsequent
5561   // comparisons (for case comparisons) will not need to be extended to the
5562   // preferred register width, so we will potentially eliminate N-1 extends,
5563   // where N is the number of cases in the switch.
5564   auto *NewType = Type::getIntNTy(Context, RegWidth);
5565
5566   // Zero-extend the switch condition and case constants unless the switch
5567   // condition is a function argument that is already being sign-extended.
5568   // In that case, we can avoid an unnecessary mask/extension by sign-extending
5569   // everything instead.
5570   Instruction::CastOps ExtType = Instruction::ZExt;
5571   if (auto *Arg = dyn_cast<Argument>(Cond))
5572     if (Arg->hasSExtAttr())
5573       ExtType = Instruction::SExt;
5574
5575   auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
5576   ExtInst->insertBefore(SI);
5577   SI->setCondition(ExtInst);
5578   for (auto Case : SI->cases()) {
5579     APInt NarrowConst = Case.getCaseValue()->getValue();
5580     APInt WideConst = (ExtType == Instruction::ZExt) ?
5581                       NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
5582     Case.setValue(ConstantInt::get(Context, WideConst));
5583   }
5584
5585   return true;
5586 }
5587
5588
5589 namespace {
5590 /// \brief Helper class to promote a scalar operation to a vector one.
5591 /// This class is used to move downward extractelement transition.
5592 /// E.g.,
5593 /// a = vector_op <2 x i32>
5594 /// b = extractelement <2 x i32> a, i32 0
5595 /// c = scalar_op b
5596 /// store c
5597 ///
5598 /// =>
5599 /// a = vector_op <2 x i32>
5600 /// c = vector_op a (equivalent to scalar_op on the related lane)
5601 /// * d = extractelement <2 x i32> c, i32 0
5602 /// * store d
5603 /// Assuming both extractelement and store can be combine, we get rid of the
5604 /// transition.
5605 class VectorPromoteHelper {
5606   /// DataLayout associated with the current module.
5607   const DataLayout &DL;
5608
5609   /// Used to perform some checks on the legality of vector operations.
5610   const TargetLowering &TLI;
5611
5612   /// Used to estimated the cost of the promoted chain.
5613   const TargetTransformInfo &TTI;
5614
5615   /// The transition being moved downwards.
5616   Instruction *Transition;
5617   /// The sequence of instructions to be promoted.
5618   SmallVector<Instruction *, 4> InstsToBePromoted;
5619   /// Cost of combining a store and an extract.
5620   unsigned StoreExtractCombineCost;
5621   /// Instruction that will be combined with the transition.
5622   Instruction *CombineInst;
5623
5624   /// \brief The instruction that represents the current end of the transition.
5625   /// Since we are faking the promotion until we reach the end of the chain
5626   /// of computation, we need a way to get the current end of the transition.
5627   Instruction *getEndOfTransition() const {
5628     if (InstsToBePromoted.empty())
5629       return Transition;
5630     return InstsToBePromoted.back();
5631   }
5632
5633   /// \brief Return the index of the original value in the transition.
5634   /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
5635   /// c, is at index 0.
5636   unsigned getTransitionOriginalValueIdx() const {
5637     assert(isa<ExtractElementInst>(Transition) &&
5638            "Other kind of transitions are not supported yet");
5639     return 0;
5640   }
5641
5642   /// \brief Return the index of the index in the transition.
5643   /// E.g., for "extractelement <2 x i32> c, i32 0" the index
5644   /// is at index 1.
5645   unsigned getTransitionIdx() const {
5646     assert(isa<ExtractElementInst>(Transition) &&
5647            "Other kind of transitions are not supported yet");
5648     return 1;
5649   }
5650
5651   /// \brief Get the type of the transition.
5652   /// This is the type of the original value.
5653   /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
5654   /// transition is <2 x i32>.
5655   Type *getTransitionType() const {
5656     return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
5657   }
5658
5659   /// \brief Promote \p ToBePromoted by moving \p Def downward through.
5660   /// I.e., we have the following sequence:
5661   /// Def = Transition <ty1> a to <ty2>
5662   /// b = ToBePromoted <ty2> Def, ...
5663   /// =>
5664   /// b = ToBePromoted <ty1> a, ...
5665   /// Def = Transition <ty1> ToBePromoted to <ty2>
5666   void promoteImpl(Instruction *ToBePromoted);
5667
5668   /// \brief Check whether or not it is profitable to promote all the
5669   /// instructions enqueued to be promoted.
5670   bool isProfitableToPromote() {
5671     Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
5672     unsigned Index = isa<ConstantInt>(ValIdx)
5673                          ? cast<ConstantInt>(ValIdx)->getZExtValue()
5674                          : -1;
5675     Type *PromotedType = getTransitionType();
5676
5677     StoreInst *ST = cast<StoreInst>(CombineInst);
5678     unsigned AS = ST->getPointerAddressSpace();
5679     unsigned Align = ST->getAlignment();
5680     // Check if this store is supported.
5681     if (!TLI.allowsMisalignedMemoryAccesses(
5682             TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
5683             Align)) {
5684       // If this is not supported, there is no way we can combine
5685       // the extract with the store.
5686       return false;
5687     }
5688
5689     // The scalar chain of computation has to pay for the transition
5690     // scalar to vector.
5691     // The vector chain has to account for the combining cost.
5692     uint64_t ScalarCost =
5693         TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
5694     uint64_t VectorCost = StoreExtractCombineCost;
5695     for (const auto &Inst : InstsToBePromoted) {
5696       // Compute the cost.
5697       // By construction, all instructions being promoted are arithmetic ones.
5698       // Moreover, one argument is a constant that can be viewed as a splat
5699       // constant.
5700       Value *Arg0 = Inst->getOperand(0);
5701       bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
5702                             isa<ConstantFP>(Arg0);
5703       TargetTransformInfo::OperandValueKind Arg0OVK =
5704           IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
5705                          : TargetTransformInfo::OK_AnyValue;
5706       TargetTransformInfo::OperandValueKind Arg1OVK =
5707           !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
5708                           : TargetTransformInfo::OK_AnyValue;
5709       ScalarCost += TTI.getArithmeticInstrCost(
5710           Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
5711       VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
5712                                                Arg0OVK, Arg1OVK);
5713     }
5714     DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
5715                  << ScalarCost << "\nVector: " << VectorCost << '\n');
5716     return ScalarCost > VectorCost;
5717   }
5718
5719   /// \brief Generate a constant vector with \p Val with the same
5720   /// number of elements as the transition.
5721   /// \p UseSplat defines whether or not \p Val should be replicated
5722   /// across the whole vector.
5723   /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
5724   /// otherwise we generate a vector with as many undef as possible:
5725   /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
5726   /// used at the index of the extract.
5727   Value *getConstantVector(Constant *Val, bool UseSplat) const {
5728     unsigned ExtractIdx = UINT_MAX;
5729     if (!UseSplat) {
5730       // If we cannot determine where the constant must be, we have to
5731       // use a splat constant.
5732       Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
5733       if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
5734         ExtractIdx = CstVal->getSExtValue();
5735       else
5736         UseSplat = true;
5737     }
5738
5739     unsigned End = getTransitionType()->getVectorNumElements();
5740     if (UseSplat)
5741       return ConstantVector::getSplat(End, Val);
5742
5743     SmallVector<Constant *, 4> ConstVec;
5744     UndefValue *UndefVal = UndefValue::get(Val->getType());
5745     for (unsigned Idx = 0; Idx != End; ++Idx) {
5746       if (Idx == ExtractIdx)
5747         ConstVec.push_back(Val);
5748       else
5749         ConstVec.push_back(UndefVal);
5750     }
5751     return ConstantVector::get(ConstVec);
5752   }
5753
5754   /// \brief Check if promoting to a vector type an operand at \p OperandIdx
5755   /// in \p Use can trigger undefined behavior.
5756   static bool canCauseUndefinedBehavior(const Instruction *Use,
5757                                         unsigned OperandIdx) {
5758     // This is not safe to introduce undef when the operand is on
5759     // the right hand side of a division-like instruction.
5760     if (OperandIdx != 1)
5761       return false;
5762     switch (Use->getOpcode()) {
5763     default:
5764       return false;
5765     case Instruction::SDiv:
5766     case Instruction::UDiv:
5767     case Instruction::SRem:
5768     case Instruction::URem:
5769       return true;
5770     case Instruction::FDiv:
5771     case Instruction::FRem:
5772       return !Use->hasNoNaNs();
5773     }
5774     llvm_unreachable(nullptr);
5775   }
5776
5777 public:
5778   VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
5779                       const TargetTransformInfo &TTI, Instruction *Transition,
5780                       unsigned CombineCost)
5781       : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
5782         StoreExtractCombineCost(CombineCost), CombineInst(nullptr) {
5783     assert(Transition && "Do not know how to promote null");
5784   }
5785
5786   /// \brief Check if we can promote \p ToBePromoted to \p Type.
5787   bool canPromote(const Instruction *ToBePromoted) const {
5788     // We could support CastInst too.
5789     return isa<BinaryOperator>(ToBePromoted);
5790   }
5791
5792   /// \brief Check if it is profitable to promote \p ToBePromoted
5793   /// by moving downward the transition through.
5794   bool shouldPromote(const Instruction *ToBePromoted) const {
5795     // Promote only if all the operands can be statically expanded.
5796     // Indeed, we do not want to introduce any new kind of transitions.
5797     for (const Use &U : ToBePromoted->operands()) {
5798       const Value *Val = U.get();
5799       if (Val == getEndOfTransition()) {
5800         // If the use is a division and the transition is on the rhs,
5801         // we cannot promote the operation, otherwise we may create a
5802         // division by zero.
5803         if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
5804           return false;
5805         continue;
5806       }
5807       if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
5808           !isa<ConstantFP>(Val))
5809         return false;
5810     }
5811     // Check that the resulting operation is legal.
5812     int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
5813     if (!ISDOpcode)
5814       return false;
5815     return StressStoreExtract ||
5816            TLI.isOperationLegalOrCustom(
5817                ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
5818   }
5819
5820   /// \brief Check whether or not \p Use can be combined
5821   /// with the transition.
5822   /// I.e., is it possible to do Use(Transition) => AnotherUse?
5823   bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
5824
5825   /// \brief Record \p ToBePromoted as part of the chain to be promoted.
5826   void enqueueForPromotion(Instruction *ToBePromoted) {
5827     InstsToBePromoted.push_back(ToBePromoted);
5828   }
5829
5830   /// \brief Set the instruction that will be combined with the transition.
5831   void recordCombineInstruction(Instruction *ToBeCombined) {
5832     assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
5833     CombineInst = ToBeCombined;
5834   }
5835
5836   /// \brief Promote all the instructions enqueued for promotion if it is
5837   /// is profitable.
5838   /// \return True if the promotion happened, false otherwise.
5839   bool promote() {
5840     // Check if there is something to promote.
5841     // Right now, if we do not have anything to combine with,
5842     // we assume the promotion is not profitable.
5843     if (InstsToBePromoted.empty() || !CombineInst)
5844       return false;
5845
5846     // Check cost.
5847     if (!StressStoreExtract && !isProfitableToPromote())
5848       return false;
5849
5850     // Promote.
5851     for (auto &ToBePromoted : InstsToBePromoted)
5852       promoteImpl(ToBePromoted);
5853     InstsToBePromoted.clear();
5854     return true;
5855   }
5856 };
5857 } // End of anonymous namespace.
5858
5859 void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
5860   // At this point, we know that all the operands of ToBePromoted but Def
5861   // can be statically promoted.
5862   // For Def, we need to use its parameter in ToBePromoted:
5863   // b = ToBePromoted ty1 a
5864   // Def = Transition ty1 b to ty2
5865   // Move the transition down.
5866   // 1. Replace all uses of the promoted operation by the transition.
5867   // = ... b => = ... Def.
5868   assert(ToBePromoted->getType() == Transition->getType() &&
5869          "The type of the result of the transition does not match "
5870          "the final type");
5871   ToBePromoted->replaceAllUsesWith(Transition);
5872   // 2. Update the type of the uses.
5873   // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
5874   Type *TransitionTy = getTransitionType();
5875   ToBePromoted->mutateType(TransitionTy);
5876   // 3. Update all the operands of the promoted operation with promoted
5877   // operands.
5878   // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
5879   for (Use &U : ToBePromoted->operands()) {
5880     Value *Val = U.get();
5881     Value *NewVal = nullptr;
5882     if (Val == Transition)
5883       NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
5884     else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
5885              isa<ConstantFP>(Val)) {
5886       // Use a splat constant if it is not safe to use undef.
5887       NewVal = getConstantVector(
5888           cast<Constant>(Val),
5889           isa<UndefValue>(Val) ||
5890               canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
5891     } else
5892       llvm_unreachable("Did you modified shouldPromote and forgot to update "
5893                        "this?");
5894     ToBePromoted->setOperand(U.getOperandNo(), NewVal);
5895   }
5896   Transition->removeFromParent();
5897   Transition->insertAfter(ToBePromoted);
5898   Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
5899 }
5900
5901 /// Some targets can do store(extractelement) with one instruction.
5902 /// Try to push the extractelement towards the stores when the target
5903 /// has this feature and this is profitable.
5904 bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
5905   unsigned CombineCost = UINT_MAX;
5906   if (DisableStoreExtract || !TLI ||
5907       (!StressStoreExtract &&
5908        !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
5909                                        Inst->getOperand(1), CombineCost)))
5910     return false;
5911
5912   // At this point we know that Inst is a vector to scalar transition.
5913   // Try to move it down the def-use chain, until:
5914   // - We can combine the transition with its single use
5915   //   => we got rid of the transition.
5916   // - We escape the current basic block
5917   //   => we would need to check that we are moving it at a cheaper place and
5918   //      we do not do that for now.
5919   BasicBlock *Parent = Inst->getParent();
5920   DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
5921   VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
5922   // If the transition has more than one use, assume this is not going to be
5923   // beneficial.
5924   while (Inst->hasOneUse()) {
5925     Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
5926     DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
5927
5928     if (ToBePromoted->getParent() != Parent) {
5929       DEBUG(dbgs() << "Instruction to promote is in a different block ("
5930                    << ToBePromoted->getParent()->getName()
5931                    << ") than the transition (" << Parent->getName() << ").\n");
5932       return false;
5933     }
5934
5935     if (VPH.canCombine(ToBePromoted)) {
5936       DEBUG(dbgs() << "Assume " << *Inst << '\n'
5937                    << "will be combined with: " << *ToBePromoted << '\n');
5938       VPH.recordCombineInstruction(ToBePromoted);
5939       bool Changed = VPH.promote();
5940       NumStoreExtractExposed += Changed;
5941       return Changed;
5942     }
5943
5944     DEBUG(dbgs() << "Try promoting.\n");
5945     if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
5946       return false;
5947
5948     DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
5949
5950     VPH.enqueueForPromotion(ToBePromoted);
5951     Inst = ToBePromoted;
5952   }
5953   return false;
5954 }
5955
5956 /// For the instruction sequence of store below, F and I values
5957 /// are bundled together as an i64 value before being stored into memory.
5958 /// Sometimes it is more efficent to generate separate stores for F and I,
5959 /// which can remove the bitwise instructions or sink them to colder places.
5960 ///
5961 ///   (store (or (zext (bitcast F to i32) to i64),
5962 ///              (shl (zext I to i64), 32)), addr)  -->
5963 ///   (store F, addr) and (store I, addr+4)
5964 ///
5965 /// Similarly, splitting for other merged store can also be beneficial, like:
5966 /// For pair of {i32, i32}, i64 store --> two i32 stores.
5967 /// For pair of {i32, i16}, i64 store --> two i32 stores.
5968 /// For pair of {i16, i16}, i32 store --> two i16 stores.
5969 /// For pair of {i16, i8},  i32 store --> two i16 stores.
5970 /// For pair of {i8, i8},   i16 store --> two i8 stores.
5971 ///
5972 /// We allow each target to determine specifically which kind of splitting is
5973 /// supported.
5974 ///
5975 /// The store patterns are commonly seen from the simple code snippet below
5976 /// if only std::make_pair(...) is sroa transformed before inlined into hoo.
5977 ///   void goo(const std::pair<int, float> &);
5978 ///   hoo() {
5979 ///     ...
5980 ///     goo(std::make_pair(tmp, ftmp));
5981 ///     ...
5982 ///   }
5983 ///
5984 /// Although we already have similar splitting in DAG Combine, we duplicate
5985 /// it in CodeGenPrepare to catch the case in which pattern is across
5986 /// multiple BBs. The logic in DAG Combine is kept to catch case generated
5987 /// during code expansion.
5988 static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
5989                                 const TargetLowering &TLI) {
5990   // Handle simple but common cases only.
5991   Type *StoreType = SI.getValueOperand()->getType();
5992   if (DL.getTypeStoreSizeInBits(StoreType) != DL.getTypeSizeInBits(StoreType) ||
5993       DL.getTypeSizeInBits(StoreType) == 0)
5994     return false;
5995
5996   unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
5997   Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
5998   if (DL.getTypeStoreSizeInBits(SplitStoreType) !=
5999       DL.getTypeSizeInBits(SplitStoreType))
6000     return false;
6001
6002   // Match the following patterns:
6003   // (store (or (zext LValue to i64),
6004   //            (shl (zext HValue to i64), 32)), HalfValBitSize)
6005   //  or
6006   // (store (or (shl (zext HValue to i64), 32)), HalfValBitSize)
6007   //            (zext LValue to i64),
6008   // Expect both operands of OR and the first operand of SHL have only
6009   // one use.
6010   Value *LValue, *HValue;
6011   if (!match(SI.getValueOperand(),
6012              m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
6013                     m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
6014                                    m_SpecificInt(HalfValBitSize))))))
6015     return false;
6016
6017   // Check LValue and HValue are int with size less or equal than 32.
6018   if (!LValue->getType()->isIntegerTy() ||
6019       DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
6020       !HValue->getType()->isIntegerTy() ||
6021       DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
6022     return false;
6023
6024   // If LValue/HValue is a bitcast instruction, use the EVT before bitcast
6025   // as the input of target query.
6026   auto *LBC = dyn_cast<BitCastInst>(LValue);
6027   auto *HBC = dyn_cast<BitCastInst>(HValue);
6028   EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
6029                   : EVT::getEVT(LValue->getType());
6030   EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
6031                    : EVT::getEVT(HValue->getType());
6032   if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
6033     return false;
6034
6035   // Start to split store.
6036   IRBuilder<> Builder(SI.getContext());
6037   Builder.SetInsertPoint(&SI);
6038
6039   // If LValue/HValue is a bitcast in another BB, create a new one in current
6040   // BB so it may be merged with the splitted stores by dag combiner.
6041   if (LBC && LBC->getParent() != SI.getParent())
6042     LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
6043   if (HBC && HBC->getParent() != SI.getParent())
6044     HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());
6045
6046   auto CreateSplitStore = [&](Value *V, bool Upper) {
6047     V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
6048     Value *Addr = Builder.CreateBitCast(
6049         SI.getOperand(1),
6050         SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
6051     if (Upper)
6052       Addr = Builder.CreateGEP(
6053           SplitStoreType, Addr,
6054           ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
6055     Builder.CreateAlignedStore(
6056         V, Addr, Upper ? SI.getAlignment() / 2 : SI.getAlignment());
6057   };
6058
6059   CreateSplitStore(LValue, false);
6060   CreateSplitStore(HValue, true);
6061
6062   // Delete the old store.
6063   SI.eraseFromParent();
6064   return true;
6065 }
6066
6067 bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
6068   // Bail out if we inserted the instruction to prevent optimizations from
6069   // stepping on each other's toes.
6070   if (InsertedInsts.count(I))
6071     return false;
6072
6073   if (PHINode *P = dyn_cast<PHINode>(I)) {
6074     // It is possible for very late stage optimizations (such as SimplifyCFG)
6075     // to introduce PHI nodes too late to be cleaned up.  If we detect such a
6076     // trivial PHI, go ahead and zap it here.
6077     if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) {
6078       P->replaceAllUsesWith(V);
6079       P->eraseFromParent();
6080       ++NumPHIsElim;
6081       return true;
6082     }
6083     return false;
6084   }
6085
6086   if (CastInst *CI = dyn_cast<CastInst>(I)) {
6087     // If the source of the cast is a constant, then this should have
6088     // already been constant folded.  The only reason NOT to constant fold
6089     // it is if something (e.g. LSR) was careful to place the constant
6090     // evaluation in a block other than then one that uses it (e.g. to hoist
6091     // the address of globals out of a loop).  If this is the case, we don't
6092     // want to forward-subst the cast.
6093     if (isa<Constant>(CI->getOperand(0)))
6094       return false;
6095
6096     if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL))
6097       return true;
6098
6099     if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
6100       /// Sink a zext or sext into its user blocks if the target type doesn't
6101       /// fit in one register
6102       if (TLI &&
6103           TLI->getTypeAction(CI->getContext(),
6104                              TLI->getValueType(*DL, CI->getType())) ==
6105               TargetLowering::TypeExpandInteger) {
6106         return SinkCast(CI);
6107       } else {
6108         bool MadeChange = optimizeExt(I);
6109         return MadeChange | optimizeExtUses(I);
6110       }
6111     }
6112     return false;
6113   }
6114
6115   if (CmpInst *CI = dyn_cast<CmpInst>(I))
6116     if (!TLI || !TLI->hasMultipleConditionRegisters())
6117       return OptimizeCmpExpression(CI, TLI);
6118
6119   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
6120     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
6121     if (TLI) {
6122       bool Modified = optimizeLoadExt(LI);
6123       unsigned AS = LI->getPointerAddressSpace();
6124       Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
6125       return Modified;
6126     }
6127     return false;
6128   }
6129
6130   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
6131     if (TLI && splitMergedValStore(*SI, *DL, *TLI))
6132       return true;
6133     SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
6134     if (TLI) {
6135       unsigned AS = SI->getPointerAddressSpace();
6136       return optimizeMemoryInst(I, SI->getOperand(1),
6137                                 SI->getOperand(0)->getType(), AS);
6138     }
6139     return false;
6140   }
6141
6142   if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
6143       unsigned AS = RMW->getPointerAddressSpace();
6144       return optimizeMemoryInst(I, RMW->getPointerOperand(),
6145                                 RMW->getType(), AS);
6146   }
6147
6148   if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
6149       unsigned AS = CmpX->getPointerAddressSpace();
6150       return optimizeMemoryInst(I, CmpX->getPointerOperand(),
6151                                 CmpX->getCompareOperand()->getType(), AS);
6152   }
6153
6154   BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
6155
6156   if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
6157       EnableAndCmpSinking && TLI)
6158     return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
6159
6160   if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
6161                 BinOp->getOpcode() == Instruction::LShr)) {
6162     ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
6163     if (TLI && CI && TLI->hasExtractBitsInsn())
6164       return OptimizeExtractBits(BinOp, CI, *TLI, *DL);
6165
6166     return false;
6167   }
6168
6169   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
6170     if (GEPI->hasAllZeroIndices()) {
6171       /// The GEP operand must be a pointer, so must its result -> BitCast
6172       Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
6173                                         GEPI->getName(), GEPI);
6174       GEPI->replaceAllUsesWith(NC);
6175       GEPI->eraseFromParent();
6176       ++NumGEPsElim;
6177       optimizeInst(NC, ModifiedDT);
6178       return true;
6179     }
6180     return false;
6181   }
6182
6183   if (CallInst *CI = dyn_cast<CallInst>(I))
6184     return optimizeCallInst(CI, ModifiedDT);
6185
6186   if (SelectInst *SI = dyn_cast<SelectInst>(I))
6187     return optimizeSelectInst(SI);
6188
6189   if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
6190     return optimizeShuffleVectorInst(SVI);
6191
6192   if (auto *Switch = dyn_cast<SwitchInst>(I))
6193     return optimizeSwitchInst(Switch);
6194
6195   if (isa<ExtractElementInst>(I))
6196     return optimizeExtractElementInst(I);
6197
6198   return false;
6199 }
6200
6201 /// Given an OR instruction, check to see if this is a bitreverse
6202 /// idiom. If so, insert the new intrinsic and return true.
6203 static bool makeBitReverse(Instruction &I, const DataLayout &DL,
6204                            const TargetLowering &TLI) {
6205   if (!I.getType()->isIntegerTy() ||
6206       !TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
6207                                     TLI.getValueType(DL, I.getType(), true)))
6208     return false;
6209
6210   SmallVector<Instruction*, 4> Insts;
6211   if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
6212     return false;
6213   Instruction *LastInst = Insts.back();
6214   I.replaceAllUsesWith(LastInst);
6215   RecursivelyDeleteTriviallyDeadInstructions(&I);
6216   return true;
6217 }
6218
6219 // In this pass we look for GEP and cast instructions that are used
6220 // across basic blocks and rewrite them to improve basic-block-at-a-time
6221 // selection.
6222 bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
6223   SunkAddrs.clear();
6224   bool MadeChange = false;
6225
6226   CurInstIterator = BB.begin();
6227   while (CurInstIterator != BB.end()) {
6228     MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
6229     if (ModifiedDT)
6230       return true;
6231   }
6232
6233   bool MadeBitReverse = true;
6234   while (TLI && MadeBitReverse) {
6235     MadeBitReverse = false;
6236     for (auto &I : reverse(BB)) {
6237       if (makeBitReverse(I, *DL, *TLI)) {
6238         MadeBitReverse = MadeChange = true;
6239         ModifiedDT = true;
6240         break;
6241       }
6242     }
6243   }
6244   MadeChange |= dupRetToEnableTailCallOpts(&BB);
6245
6246   return MadeChange;
6247 }
6248
6249 // llvm.dbg.value is far away from the value then iSel may not be able
6250 // handle it properly. iSel will drop llvm.dbg.value if it can not
6251 // find a node corresponding to the value.
6252 bool CodeGenPrepare::placeDbgValues(Function &F) {
6253   bool MadeChange = false;
6254   for (BasicBlock &BB : F) {
6255     Instruction *PrevNonDbgInst = nullptr;
6256     for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
6257       Instruction *Insn = &*BI++;
6258       DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
6259       // Leave dbg.values that refer to an alloca alone. These
6260       // instrinsics describe the address of a variable (= the alloca)
6261       // being taken.  They should not be moved next to the alloca
6262       // (and to the beginning of the scope), but rather stay close to
6263       // where said address is used.
6264       if (!DVI || (DVI->getValue() && isa<AllocaInst>(DVI->getValue()))) {
6265         PrevNonDbgInst = Insn;
6266         continue;
6267       }
6268
6269       Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
6270       if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
6271         // If VI is a phi in a block with an EHPad terminator, we can't insert
6272         // after it.
6273         if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
6274           continue;
6275         DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
6276         DVI->removeFromParent();
6277         if (isa<PHINode>(VI))
6278           DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
6279         else
6280           DVI->insertAfter(VI);
6281         MadeChange = true;
6282         ++NumDbgValueMoved;
6283       }
6284     }
6285   }
6286   return MadeChange;
6287 }
6288
/// \brief Scale down both weights to fit into uint32_t.
///
/// Both weights are divided by the same factor, chosen from the larger of the
/// two so that the larger quotient does not exceed UINT32_MAX. Weights that
/// already fit are divided by 1 and therefore left unchanged.
///
/// \param NewTrue  In/out: first weight, replaced by its scaled value.
/// \param NewFalse In/out: second weight, replaced by its scaled value.
static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
  // The divisor must stay 64 bits wide: NewMax / UINT32_MAX + 1 exceeds
  // UINT32_MAX for very large weights, and truncating it to 32 bits would
  // leave quotients that do not fit into uint32_t.
  uint64_t Scale = (NewMax / UINT32_MAX) + 1;
  NewTrue = NewTrue / Scale;
  NewFalse = NewFalse / Scale;
}
6296
6297 /// \brief Some targets prefer to split a conditional branch like:
6298 /// \code
6299 ///   %0 = icmp ne i32 %a, 0
6300 ///   %1 = icmp ne i32 %b, 0
6301 ///   %or.cond = or i1 %0, %1
6302 ///   br i1 %or.cond, label %TrueBB, label %FalseBB
6303 /// \endcode
6304 /// into multiple branch instructions like:
6305 /// \code
6306 ///   bb1:
6307 ///     %0 = icmp ne i32 %a, 0
6308 ///     br i1 %0, label %TrueBB, label %bb2
6309 ///   bb2:
6310 ///     %1 = icmp ne i32 %b, 0
6311 ///     br i1 %1, label %TrueBB, label %FalseBB
6312 /// \endcode
6313 /// This usually allows instruction selection to do even further optimizations
6314 /// and combine the compare with the branch instruction. Currently this is
6315 /// applied for targets which have "cheap" jump instructions.
6316 ///
6317 /// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
6318 ///
6319 bool CodeGenPrepare::splitBranchCondition(Function &F) {
6320   if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive())
6321     return false;
6322
6323   bool MadeChange = false;
6324   for (auto &BB : F) {
6325     // Does this BB end with the following?
6326     //   %cond1 = icmp|fcmp|binary instruction ...
6327     //   %cond2 = icmp|fcmp|binary instruction ...
6328     //   %cond.or = or|and i1 %cond1, cond2
6329     //   br i1 %cond.or label %dest1, label %dest2"
6330     BinaryOperator *LogicOp;
6331     BasicBlock *TBB, *FBB;
6332     if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
6333       continue;
6334
6335     auto *Br1 = cast<BranchInst>(BB.getTerminator());
6336     if (Br1->getMetadata(LLVMContext::MD_unpredictable))
6337       continue;
6338
6339     unsigned Opc;
6340     Value *Cond1, *Cond2;
6341     if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
6342                              m_OneUse(m_Value(Cond2)))))
6343       Opc = Instruction::And;
6344     else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
6345                                  m_OneUse(m_Value(Cond2)))))
6346       Opc = Instruction::Or;
6347     else
6348       continue;
6349
6350     if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
6351         !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp()))   )
6352       continue;
6353
6354     DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
6355
6356     // Create a new BB.
6357     auto TmpBB =
6358         BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
6359                            BB.getParent(), BB.getNextNode());
6360
6361     // Update original basic block by using the first condition directly by the
6362     // branch instruction and removing the no longer needed and/or instruction.
6363     Br1->setCondition(Cond1);
6364     LogicOp->eraseFromParent();
6365
6366     // Depending on the conditon we have to either replace the true or the false
6367     // successor of the original branch instruction.
6368     if (Opc == Instruction::And)
6369       Br1->setSuccessor(0, TmpBB);
6370     else
6371       Br1->setSuccessor(1, TmpBB);
6372
6373     // Fill in the new basic block.
6374     auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
6375     if (auto *I = dyn_cast<Instruction>(Cond2)) {
6376       I->removeFromParent();
6377       I->insertBefore(Br2);
6378     }
6379
6380     // Update PHI nodes in both successors. The original BB needs to be
6381     // replaced in one successor's PHI nodes, because the branch comes now from
6382     // the newly generated BB (NewBB). In the other successor we need to add one
6383     // incoming edge to the PHI nodes, because both branch instructions target
6384     // now the same successor. Depending on the original branch condition
6385     // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
6386     // we perform the correct update for the PHI nodes.
6387     // This doesn't change the successor order of the just created branch
6388     // instruction (or any other instruction).
6389     if (Opc == Instruction::Or)
6390       std::swap(TBB, FBB);
6391
6392     // Replace the old BB with the new BB.
6393     for (auto &I : *TBB) {
6394       PHINode *PN = dyn_cast<PHINode>(&I);
6395       if (!PN)
6396         break;
6397       int i;
6398       while ((i = PN->getBasicBlockIndex(&BB)) >= 0)
6399         PN->setIncomingBlock(i, TmpBB);
6400     }
6401
6402     // Add another incoming edge form the new BB.
6403     for (auto &I : *FBB) {
6404       PHINode *PN = dyn_cast<PHINode>(&I);
6405       if (!PN)
6406         break;
6407       auto *Val = PN->getIncomingValueForBlock(&BB);
6408       PN->addIncoming(Val, TmpBB);
6409     }
6410
6411     // Update the branch weights (from SelectionDAGBuilder::
6412     // FindMergedConditions).
6413     if (Opc == Instruction::Or) {
6414       // Codegen X | Y as:
6415       // BB1:
6416       //   jmp_if_X TBB
6417       //   jmp TmpBB
6418       // TmpBB:
6419       //   jmp_if_Y TBB
6420       //   jmp FBB
6421       //
6422
6423       // We have flexibility in setting Prob for BB1 and Prob for NewBB.
6424       // The requirement is that
6425       //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
6426       //     = TrueProb for orignal BB.
6427       // Assuming the orignal weights are A and B, one choice is to set BB1's
6428       // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
6429       // assumes that
6430       //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
6431       // Another choice is to assume TrueProb for BB1 equals to TrueProb for
6432       // TmpBB, but the math is more complicated.
6433       uint64_t TrueWeight, FalseWeight;
6434       if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
6435         uint64_t NewTrueWeight = TrueWeight;
6436         uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
6437         scaleWeights(NewTrueWeight, NewFalseWeight);
6438         Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
6439                          .createBranchWeights(TrueWeight, FalseWeight));
6440
6441         NewTrueWeight = TrueWeight;
6442         NewFalseWeight = 2 * FalseWeight;
6443         scaleWeights(NewTrueWeight, NewFalseWeight);
6444         Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
6445                          .createBranchWeights(TrueWeight, FalseWeight));
6446       }
6447     } else {
6448       // Codegen X & Y as:
6449       // BB1:
6450       //   jmp_if_X TmpBB
6451       //   jmp FBB
6452       // TmpBB:
6453       //   jmp_if_Y TBB
6454       //   jmp FBB
6455       //
6456       //  This requires creation of TmpBB after CurBB.
6457
6458       // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
6459       // The requirement is that
6460       //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
6461       //     = FalseProb for orignal BB.
6462       // Assuming the orignal weights are A and B, one choice is to set BB1's
6463       // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
6464       // assumes that
6465       //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
6466       uint64_t TrueWeight, FalseWeight;
6467       if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
6468         uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
6469         uint64_t NewFalseWeight = FalseWeight;
6470         scaleWeights(NewTrueWeight, NewFalseWeight);
6471         Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
6472                          .createBranchWeights(TrueWeight, FalseWeight));
6473
6474         NewTrueWeight = 2 * TrueWeight;
6475         NewFalseWeight = FalseWeight;
6476         scaleWeights(NewTrueWeight, NewFalseWeight);
6477         Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
6478                          .createBranchWeights(TrueWeight, FalseWeight));
6479       }
6480     }
6481
6482     // Note: No point in getting fancy here, since the DT info is never
6483     // available to CodeGenPrepare.
6484     ModifiedDT = true;
6485
6486     MadeChange = true;
6487
6488     DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
6489           TmpBB->dump());
6490   }
6491   return MadeChange;
6492 }