contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

   1 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This transformation analyzes and transforms the induction variables (and
  11 // computations derived from them) into forms suitable for efficient execution
  12 // on the target.
  13 //
  14 // This pass performs a strength reduction on array references inside loops that
  15 // have as one or more of their components the loop induction variable, it
  16 // rewrites expressions to take advantage of scaled-index addressing modes
  17 // available on the target, and it performs a variety of other optimizations
  18 // related to loop induction variables.
  19 //
  20 // Terminology note: this code has a lot of handling for "post-increment" or
  21 // "post-inc" users. This is not talking about post-increment addressing modes;
  22 // it is instead talking about code like this:
  23 //
  24 //   %i = phi [ 0, %entry ], [ %i.next, %latch ]
  25 //   ...
  26 //   %i.next = add %i, 1
  27 //   %c = icmp eq %i.next, %n
  28 //
  29 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
  30 // it's useful to think about these as the same register, with some uses using
  31 // the value of the register before the add and some using // it after. In this
  32 // example, the icmp is a post-increment user, since it uses %i.next, which is
  33 // the value of the induction variable after the increment. The other common
  34 // case of post-increment users is users outside the loop.
  35 //
  36 // TODO: More sophistication in the way Formulae are generated and filtered.
  37 //
  38 // TODO: Handle multiple loops at a time.
  39 //
  40 // TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
  41 //       instead of a GlobalValue?
  42 //
  43 // TODO: When truncation is free, truncate ICmp users' operands to make it a
  44 //       smaller encoding (on x86 at least).
  45 //
  46 // TODO: When a negated register is used by an add (such as in a list of
  47 //       multiple base registers, or as the increment expression in an addrec),
  48 //       we may not actually need both reg and (-1 * reg) in registers; the
  49 //       negation can be implemented by using a sub instead of an add. The
  50 //       lack of support for taking this into consideration when making
  51 //       register pressure decisions is partly worked around by the "Special"
  52 //       use kind.
  53 //
  54 //===----------------------------------------------------------------------===//
  55
  56 #define DEBUG_TYPE "loop-reduce"
  57 #include "llvm/Transforms/Scalar.h"
  58 #include "llvm/Constants.h"
  59 #include "llvm/Instructions.h"
  60 #include "llvm/IntrinsicInst.h"
  61 #include "llvm/DerivedTypes.h"
  62 #include "llvm/Analysis/IVUsers.h"
  63 #include "llvm/Analysis/Dominators.h"
  64 #include "llvm/Analysis/LoopPass.h"
  65 #include "llvm/Analysis/ScalarEvolutionExpander.h"
  66 #include "llvm/Assembly/Writer.h"
  67 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  68 #include "llvm/Transforms/Utils/Local.h"
  69 #include "llvm/ADT/SmallBitVector.h"
  70 #include "llvm/ADT/SetVector.h"
  71 #include "llvm/ADT/DenseSet.h"
  72 #include "llvm/Support/Debug.h"
  73 #include "llvm/Support/CommandLine.h"
  74 #include "llvm/Support/ValueHandle.h"
  75 #include "llvm/Support/raw_ostream.h"
  76 #include "llvm/Target/TargetLowering.h"
  77 #include <algorithm>
  78 using namespace llvm;
  79
  80 namespace llvm {
  81 cl::opt<bool> EnableNested(
  82   "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops"));
  83
  84 cl::opt<bool> EnableRetry(
  85     "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry"));
  86
  87 // Temporary flag to cleanup congruent phis after LSR phi expansion.
  88 // It's currently disabled until we can determine whether it's truly useful or
  89 // not. The flag should be removed after the v3.0 release.
  90 cl::opt<bool> EnablePhiElim(
  91     "enable-lsr-phielim", cl::Hidden, cl::desc("Enable LSR phi elimination"));
  92 }
  93
  94 namespace {
  95
  96 /// RegSortData - This class holds data which is used to order reuse candidates.
  97 class RegSortData {
  98 public:
  99   /// UsedByIndices - This represents the set of LSRUse indices which reference
 100   /// a particular register.
 101   SmallBitVector UsedByIndices;
 102
 103   RegSortData() {}
 104
 105   void print(raw_ostream &OS) const;
 106   void dump() const;
 107 };
 108
 109 }
 110
 111 void RegSortData::print(raw_ostream &OS) const {
 112   OS << "[NumUses=" << UsedByIndices.count() << ']';
 113 }
 114
 115 void RegSortData::dump() const {
 116   print(errs()); errs() << '\n';
 117 }
 118
 119 namespace {
 120
 121 /// RegUseTracker - Map register candidates to information about how they are
 122 /// used.
 123 class RegUseTracker {
 124   typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
 125
 126   RegUsesTy RegUsesMap;
 127   SmallVector<const SCEV *, 16> RegSequence;
 128
 129 public:
 130   void CountRegister(const SCEV *Reg, size_t LUIdx);
 131   void DropRegister(const SCEV *Reg, size_t LUIdx);
 132   void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx);
 133
 134   bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
 135
 136   const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
 137
 138   void clear();
 139
 140   typedef SmallVectorImpl<const SCEV *>::iterator iterator;
 141   typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
 142   iterator begin() { return RegSequence.begin(); }
 143   iterator end()   { return RegSequence.end(); }
 144   const_iterator begin() const { return RegSequence.begin(); }
 145   const_iterator end() const   { return RegSequence.end(); }
 146 };
 147
 148 }
 149
 150 void
 151 RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
 152   std::pair<RegUsesTy::iterator, bool> Pair =
 153     RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
 154   RegSortData &RSD = Pair.first->second;
 155   if (Pair.second)
 156     RegSequence.push_back(Reg);
 157   RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
 158   RSD.UsedByIndices.set(LUIdx);
 159 }
 160
 161 void
 162 RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
 163   RegUsesTy::iterator It = RegUsesMap.find(Reg);
 164   assert(It != RegUsesMap.end());
 165   RegSortData &RSD = It->second;
 166   assert(RSD.UsedByIndices.size() > LUIdx);
 167   RSD.UsedByIndices.reset(LUIdx);
 168 }
 169
 170 void
 171 RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
 172   assert(LUIdx <= LastLUIdx);
 173
 174   // Update RegUses. The data structure is not optimized for this purpose;
 175   // we must iterate through it and update each of the bit vectors.
 176   for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
 177        I != E; ++I) {
 178     SmallBitVector &UsedByIndices = I->second.UsedByIndices;
 179     if (LUIdx < UsedByIndices.size())
 180       UsedByIndices[LUIdx] =
 181         LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
 182     UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
 183   }
 184 }
 185
 186 bool
 187 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
 188   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
 189   if (I == RegUsesMap.end())
 190     return false;
 191   const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
 192   int i = UsedByIndices.find_first();
 193   if (i == -1) return false;
 194   if ((size_t)i != LUIdx) return true;
 195   return UsedByIndices.find_next(i) != -1;
 196 }
 197
 198 const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
 199   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
 200   assert(I != RegUsesMap.end() && "Unknown register!");
 201   return I->second.UsedByIndices;
 202 }
 203
 204 void RegUseTracker::clear() {
 205   RegUsesMap.clear();
 206   RegSequence.clear();
 207 }
 208
 209 namespace {
 210
 211 /// Formula - This class holds information that describes a formula for
 212 /// computing satisfying a use. It may include broken-out immediates and scaled
 213 /// registers.
 214 struct Formula {
 215   /// AM - This is used to represent complex addressing, as well as other kinds
 216   /// of interesting uses.
 217   TargetLowering::AddrMode AM;
 218
 219   /// BaseRegs - The list of "base" registers for this use. When this is
 220   /// non-empty, AM.HasBaseReg should be set to true.
 221   SmallVector<const SCEV *, 2> BaseRegs;
 222
 223   /// ScaledReg - The 'scaled' register for this use. This should be non-null
 224   /// when AM.Scale is not zero.
 225   const SCEV *ScaledReg;
 226
 227   /// UnfoldedOffset - An additional constant offset which added near the
 228   /// use. This requires a temporary register, but the offset itself can
 229   /// live in an add immediate field rather than a register.
 230   int64_t UnfoldedOffset;
 231
 232   Formula() : ScaledReg(0), UnfoldedOffset(0) {}
 233
 234   void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
 235
 236   unsigned getNumRegs() const;
 237   Type *getType() const;
 238
 239   void DeleteBaseReg(const SCEV *&S);
 240
 241   bool referencesReg(const SCEV *S) const;
 242   bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
 243                                   const RegUseTracker &RegUses) const;
 244
 245   void print(raw_ostream &OS) const;
 246   void dump() const;
 247 };
 248
 249 }
 250
 251 /// DoInitialMatch - Recursion helper for InitialMatch.
 252 static void DoInitialMatch(const SCEV *S, Loop *L,
 253                            SmallVectorImpl<const SCEV *> &Good,
 254                            SmallVectorImpl<const SCEV *> &Bad,
 255                            ScalarEvolution &SE) {
 256   // Collect expressions which properly dominate the loop header.
 257   if (SE.properlyDominates(S, L->getHeader())) {
 258     Good.push_back(S);
 259     return;
 260   }
 261
 262   // Look at add operands.
 263   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 264     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 265          I != E; ++I)
 266       DoInitialMatch(*I, L, Good, Bad, SE);
 267     return;
 268   }
 269
 270   // Look at addrec operands.
 271   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
 272     if (!AR->getStart()->isZero()) {
 273       DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
 274       DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
 275                                       AR->getStepRecurrence(SE),
 276                                       // FIXME: AR->getNoWrapFlags()
 277                                       AR->getLoop(), SCEV::FlagAnyWrap),
 278                      L, Good, Bad, SE);
 279       return;
 280     }
 281
 282   // Handle a multiplication by -1 (negation) if it didn't fold.
 283   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
 284     if (Mul->getOperand(0)->isAllOnesValue()) {
 285       SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
 286       const SCEV *NewMul = SE.getMulExpr(Ops);
 287
 288       SmallVector<const SCEV *, 4> MyGood;
 289       SmallVector<const SCEV *, 4> MyBad;
 290       DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
 291       const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
 292         SE.getEffectiveSCEVType(NewMul->getType())));
 293       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
 294            E = MyGood.end(); I != E; ++I)
 295         Good.push_back(SE.getMulExpr(NegOne, *I));
 296       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
 297            E = MyBad.end(); I != E; ++I)
 298         Bad.push_back(SE.getMulExpr(NegOne, *I));
 299       return;
 300     }
 301
 302   // Ok, we can't do anything interesting. Just stuff the whole thing into a
 303   // register and hope for the best.
 304   Bad.push_back(S);
 305 }
 306
 307 /// InitialMatch - Incorporate loop-variant parts of S into this Formula,
 308 /// attempting to keep all loop-invariant and loop-computable values in a
 309 /// single base register.
 310 void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
 311   SmallVector<const SCEV *, 4> Good;
 312   SmallVector<const SCEV *, 4> Bad;
 313   DoInitialMatch(S, L, Good, Bad, SE);
 314   if (!Good.empty()) {
 315     const SCEV *Sum = SE.getAddExpr(Good);
 316     if (!Sum->isZero())
 317       BaseRegs.push_back(Sum);
 318     AM.HasBaseReg = true;
 319   }
 320   if (!Bad.empty()) {
 321     const SCEV *Sum = SE.getAddExpr(Bad);
 322     if (!Sum->isZero())
 323       BaseRegs.push_back(Sum);
 324     AM.HasBaseReg = true;
 325   }
 326 }
 327
 328 /// getNumRegs - Return the total number of register operands used by this
 329 /// formula. This does not include register uses implied by non-constant
 330 /// addrec strides.
 331 unsigned Formula::getNumRegs() const {
 332   return !!ScaledReg + BaseRegs.size();
 333 }
 334
 335 /// getType - Return the type of this formula, if it has one, or null
 336 /// otherwise. This type is meaningless except for the bit size.
 337 Type *Formula::getType() const {
 338   return !BaseRegs.empty() ? BaseRegs.front()->getType() :
 339          ScaledReg ? ScaledReg->getType() :
 340          AM.BaseGV ? AM.BaseGV->getType() :
 341          0;
 342 }
 343
 344 /// DeleteBaseReg - Delete the given base reg from the BaseRegs list.
 345 void Formula::DeleteBaseReg(const SCEV *&S) {
 346   if (&S != &BaseRegs.back())
 347     std::swap(S, BaseRegs.back());
 348   BaseRegs.pop_back();
 349 }
 350
 351 /// referencesReg - Test if this formula references the given register.
 352 bool Formula::referencesReg(const SCEV *S) const {
 353   return S == ScaledReg ||
 354          std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
 355 }
 356
 357 /// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
 358 /// which are used by uses other than the use with the given index.
 359 bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
 360                                          const RegUseTracker &RegUses) const {
 361   if (ScaledReg)
 362     if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
 363       return true;
 364   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
 365        E = BaseRegs.end(); I != E; ++I)
 366     if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
 367       return true;
 368   return false;
 369 }
 370
 371 void Formula::print(raw_ostream &OS) const {
 372   bool First = true;
 373   if (AM.BaseGV) {
 374     if (!First) OS << " + "; else First = false;
 375     WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false);
 376   }
 377   if (AM.BaseOffs != 0) {
 378     if (!First) OS << " + "; else First = false;
 379     OS << AM.BaseOffs;
 380   }
 381   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
 382        E = BaseRegs.end(); I != E; ++I) {
 383     if (!First) OS << " + "; else First = false;
 384     OS << "reg(" << **I << ')';
 385   }
 386   if (AM.HasBaseReg && BaseRegs.empty()) {
 387     if (!First) OS << " + "; else First = false;
 388     OS << "**error: HasBaseReg**";
 389   } else if (!AM.HasBaseReg && !BaseRegs.empty()) {
 390     if (!First) OS << " + "; else First = false;
 391     OS << "**error: !HasBaseReg**";
 392   }
 393   if (AM.Scale != 0) {
 394     if (!First) OS << " + "; else First = false;
 395     OS << AM.Scale << "*reg(";
 396     if (ScaledReg)
 397       OS << *ScaledReg;
 398     else
 399       OS << "<unknown>";
 400     OS << ')';
 401   }
 402   if (UnfoldedOffset != 0) {
 403     if (!First) OS << " + "; else First = false;
 404     OS << "imm(" << UnfoldedOffset << ')';
 405   }
 406 }
 407
 408 void Formula::dump() const {
 409   print(errs()); errs() << '\n';
 410 }
 411
 412 /// isAddRecSExtable - Return true if the given addrec can be sign-extended
 413 /// without changing its value.
 414 static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
 415   Type *WideTy =
 416     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
 417   return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
 418 }
 419
 420 /// isAddSExtable - Return true if the given add can be sign-extended
 421 /// without changing its value.
 422 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
 423   Type *WideTy =
 424     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
 425   return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
 426 }
 427
 428 /// isMulSExtable - Return true if the given mul can be sign-extended
 429 /// without changing its value.
 430 static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
 431   Type *WideTy =
 432     IntegerType::get(SE.getContext(),
 433                      SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
 434   return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
 435 }
 436
 437 /// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
 438 /// and if the remainder is known to be zero,  or null otherwise. If
 439 /// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
 440 /// to Y, ignoring that the multiplication may overflow, which is useful when
 441 /// the result will be used in a context where the most significant bits are
 442 /// ignored.
 443 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
 444                                 ScalarEvolution &SE,
 445                                 bool IgnoreSignificantBits = false) {
 446   // Handle the trivial case, which works for any SCEV type.
 447   if (LHS == RHS)
 448     return SE.getConstant(LHS->getType(), 1);
 449
 450   // Handle a few RHS special cases.
 451   const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
 452   if (RC) {
 453     const APInt &RA = RC->getValue()->getValue();
 454     // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
 455     // some folding.
 456     if (RA.isAllOnesValue())
 457       return SE.getMulExpr(LHS, RC);
 458     // Handle x /s 1 as x.
 459     if (RA == 1)
 460       return LHS;
 461   }
 462
 463   // Check for a division of a constant by a constant.
 464   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
 465     if (!RC)
 466       return 0;
 467     const APInt &LA = C->getValue()->getValue();
 468     const APInt &RA = RC->getValue()->getValue();
 469     if (LA.srem(RA) != 0)
 470       return 0;
 471     return SE.getConstant(LA.sdiv(RA));
 472   }
 473
 474   // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
 475   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
 476     if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
 477       const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
 478                                       IgnoreSignificantBits);
 479       if (!Step) return 0;
 480       const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
 481                                        IgnoreSignificantBits);
 482       if (!Start) return 0;
 483       // FlagNW is independent of the start value, step direction, and is
 484       // preserved with smaller magnitude steps.
 485       // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 486       return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
 487     }
 488     return 0;
 489   }
 490
 491   // Distribute the sdiv over add operands, if the add doesn't overflow.
 492   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
 493     if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
 494       SmallVector<const SCEV *, 8> Ops;
 495       for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
 496            I != E; ++I) {
 497         const SCEV *Op = getExactSDiv(*I, RHS, SE,
 498                                       IgnoreSignificantBits);
 499         if (!Op) return 0;
 500         Ops.push_back(Op);
 501       }
 502       return SE.getAddExpr(Ops);
 503     }
 504     return 0;
 505   }
 506
 507   // Check for a multiply operand that we can pull RHS out of.
 508   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
 509     if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
 510       SmallVector<const SCEV *, 4> Ops;
 511       bool Found = false;
 512       for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
 513            I != E; ++I) {
 514         const SCEV *S = *I;
 515         if (!Found)
 516           if (const SCEV *Q = getExactSDiv(S, RHS, SE,
 517                                            IgnoreSignificantBits)) {
 518             S = Q;
 519             Found = true;
 520           }
 521         Ops.push_back(S);
 522       }
 523       return Found ? SE.getMulExpr(Ops) : 0;
 524     }
 525     return 0;
 526   }
 527
 528   // Otherwise we don't know.
 529   return 0;
 530 }
 531
 532 /// ExtractImmediate - If S involves the addition of a constant integer value,
 533 /// return that integer value, and mutate S to point to a new SCEV with that
 534 /// value excluded.
 535 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
 536   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
 537     if (C->getValue()->getValue().getMinSignedBits() <= 64) {
 538       S = SE.getConstant(C->getType(), 0);
 539       return C->getValue()->getSExtValue();
 540     }
 541   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 542     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
 543     int64_t Result = ExtractImmediate(NewOps.front(), SE);
 544     if (Result != 0)
 545       S = SE.getAddExpr(NewOps);
 546     return Result;
 547   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 548     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
 549     int64_t Result = ExtractImmediate(NewOps.front(), SE);
 550     if (Result != 0)
 551       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
 552                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 553                            SCEV::FlagAnyWrap);
 554     return Result;
 555   }
 556   return 0;
 557 }
 558
 559 /// ExtractSymbol - If S involves the addition of a GlobalValue address,
 560 /// return that symbol, and mutate S to point to a new SCEV with that
 561 /// value excluded.
 562 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
 563   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
 564     if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
 565       S = SE.getConstant(GV->getType(), 0);
 566       return GV;
 567     }
 568   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 569     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
 570     GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
 571     if (Result)
 572       S = SE.getAddExpr(NewOps);
 573     return Result;
 574   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 575     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
 576     GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
 577     if (Result)
 578       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
 579                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
 580                            SCEV::FlagAnyWrap);
 581     return Result;
 582   }
 583   return 0;
 584 }
 585
 586 /// isAddressUse - Returns true if the specified instruction is using the
 587 /// specified value as an address.
 588 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
 589   bool isAddress = isa<LoadInst>(Inst);
 590   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
 591     if (SI->getOperand(1) == OperandVal)
 592       isAddress = true;
 593   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
 594     // Addressing modes can also be folded into prefetches and a variety
 595     // of intrinsics.
 596     switch (II->getIntrinsicID()) {
 597       default: break;
 598       case Intrinsic::prefetch:
 599       case Intrinsic::x86_sse_storeu_ps:
 600       case Intrinsic::x86_sse2_storeu_pd:
 601       case Intrinsic::x86_sse2_storeu_dq:
 602       case Intrinsic::x86_sse2_storel_dq:
 603         if (II->getArgOperand(0) == OperandVal)
 604           isAddress = true;
 605         break;
 606     }
 607   }
 608   return isAddress;
 609 }
 610
 611 /// getAccessType - Return the type of the memory being accessed.
 612 static Type *getAccessType(const Instruction *Inst) {
 613   Type *AccessTy = Inst->getType();
 614   if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
 615     AccessTy = SI->getOperand(0)->getType();
 616   else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
 617     // Addressing modes can also be folded into prefetches and a variety
 618     // of intrinsics.
 619     switch (II->getIntrinsicID()) {
 620     default: break;
 621     case Intrinsic::x86_sse_storeu_ps:
 622     case Intrinsic::x86_sse2_storeu_pd:
 623     case Intrinsic::x86_sse2_storeu_dq:
 624     case Intrinsic::x86_sse2_storel_dq:
 625       AccessTy = II->getArgOperand(0)->getType();
 626       break;
 627     }
 628   }
 629
 630   // All pointers have the same requirements, so canonicalize them to an
 631   // arbitrary pointer type to minimize variation.
 632   if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
 633     AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
 634                                 PTy->getAddressSpace());
 635
 636   return AccessTy;
 637 }
 638
 639 /// DeleteTriviallyDeadInstructions - If any of the instructions is the
 640 /// specified set are trivially dead, delete them and see if this makes any of
 641 /// their operands subsequently dead.
 642 static bool
 643 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
 644   bool Changed = false;
 645
 646   while (!DeadInsts.empty()) {
 647     Instruction *I = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val());
 648
 649     if (I == 0 || !isInstructionTriviallyDead(I))
 650       continue;
 651
 652     for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
 653       if (Instruction *U = dyn_cast<Instruction>(*OI)) {
 654         *OI = 0;
 655         if (U->use_empty())
 656           DeadInsts.push_back(U);
 657       }
 658
 659     I->eraseFromParent();
 660     Changed = true;
 661   }
 662
 663   return Changed;
 664 }
 665
 666 namespace {
 667
 668 /// Cost - This class is used to measure and compare candidate formulae.
 669 class Cost {
 670   /// TODO: Some of these could be merged. Also, a lexical ordering
 671   /// isn't always optimal.
 672   unsigned NumRegs;
 673   unsigned AddRecCost;
 674   unsigned NumIVMuls;
 675   unsigned NumBaseAdds;
 676   unsigned ImmCost;
 677   unsigned SetupCost;
 678
 679 public:
 680   Cost()
 681     : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
 682       SetupCost(0) {}
 683
 684   bool operator<(const Cost &Other) const;
 685
 686   void Loose();
 687
 688 #ifndef NDEBUG
 689   // Once any of the metrics loses, they must all remain losers.
 690   bool isValid() {
 691     return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
 692              | ImmCost | SetupCost) != ~0u)
 693       || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
 694            & ImmCost & SetupCost) == ~0u);
 695   }
 696 #endif
 697
 698   bool isLoser() {
 699     assert(isValid() && "invalid cost");
 700     return NumRegs == ~0u;
 701   }
 702
 703   void RateFormula(const Formula &F,
 704                    SmallPtrSet<const SCEV *, 16> &Regs,
 705                    const DenseSet<const SCEV *> &VisitedRegs,
 706                    const Loop *L,
 707                    const SmallVectorImpl<int64_t> &Offsets,
 708                    ScalarEvolution &SE, DominatorTree &DT);
 709
 710   void print(raw_ostream &OS) const;
 711   void dump() const;
 712
 713 private:
 714   void RateRegister(const SCEV *Reg,
 715                     SmallPtrSet<const SCEV *, 16> &Regs,
 716                     const Loop *L,
 717                     ScalarEvolution &SE, DominatorTree &DT);
 718   void RatePrimaryRegister(const SCEV *Reg,
 719                            SmallPtrSet<const SCEV *, 16> &Regs,
 720                            const Loop *L,
 721                            ScalarEvolution &SE, DominatorTree &DT);
 722 };
 723
 724 }
 725
 726 /// RateRegister - Tally up interesting quantities from the given register.
 727 void Cost::RateRegister(const SCEV *Reg,
 728                         SmallPtrSet<const SCEV *, 16> &Regs,
 729                         const Loop *L,
 730                         ScalarEvolution &SE, DominatorTree &DT) {
 731   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
 732     if (AR->getLoop() == L)
 733       AddRecCost += 1; /// TODO: This should be a function of the stride.
 734
 735     // If this is an addrec for another loop, don't second-guess its addrec phi
 736     // nodes. LSR isn't currently smart enough to reason about more than one
 737     // loop at a time. LSR has either already run on inner loops, will not run
 738     // on other loops, and cannot be expected to change sibling loops. If the
 739     // AddRec exists, consider it's register free and leave it alone. Otherwise,
 740     // do not consider this formula at all.
 741     // FIXME: why do we need to generate such fomulae?
 742     else if (!EnableNested || L->contains(AR->getLoop()) ||
 743              (!AR->getLoop()->contains(L) &&
 744               DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
 745       for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
 746            PHINode *PN = dyn_cast<PHINode>(I); ++I) {
 747         if (SE.isSCEVable(PN->getType()) &&
 748             (SE.getEffectiveSCEVType(PN->getType()) ==
 749              SE.getEffectiveSCEVType(AR->getType())) &&
 750             SE.getSCEV(PN) == AR)
 751           return;
 752       }
 753       if (!EnableNested) {
 754         Loose();
 755         return;
 756       }
 757       // If this isn't one of the addrecs that the loop already has, it
 758       // would require a costly new phi and add. TODO: This isn't
 759       // precisely modeled right now.
 760       ++NumBaseAdds;
 761       if (!Regs.count(AR->getStart())) {
 762         RateRegister(AR->getStart(), Regs, L, SE, DT);
 763         if (isLoser())
 764           return;
 765       }
 766     }
 767
 768     // Add the step value register, if it needs one.
 769     // TODO: The non-affine case isn't precisely modeled here.
 770     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
 771       if (!Regs.count(AR->getOperand(1))) {
 772         RateRegister(AR->getOperand(1), Regs, L, SE, DT);
 773         if (isLoser())
 774           return;
 775       }
 776     }
 777   }
 778   ++NumRegs;
 779
 780   // Rough heuristic; favor registers which don't require extra setup
 781   // instructions in the preheader.
 782   if (!isa<SCEVUnknown>(Reg) &&
 783       !isa<SCEVConstant>(Reg) &&
 784       !(isa<SCEVAddRecExpr>(Reg) &&
 785         (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
 786          isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
 787     ++SetupCost;
 788
 789     NumIVMuls += isa<SCEVMulExpr>(Reg) &&
 790                  SE.hasComputableLoopEvolution(Reg, L);
 791 }
 792
 793 /// RatePrimaryRegister - Record this register in the set. If we haven't seen it
 794 /// before, rate it.
 795 void Cost::RatePrimaryRegister(const SCEV *Reg,
 796                                SmallPtrSet<const SCEV *, 16> &Regs,
 797                                const Loop *L,
 798                                ScalarEvolution &SE, DominatorTree &DT) {
 799   if (Regs.insert(Reg))
 800     RateRegister(Reg, Regs, L, SE, DT);
 801 }
 802
 803 void Cost::RateFormula(const Formula &F,
 804                        SmallPtrSet<const SCEV *, 16> &Regs,
 805                        const DenseSet<const SCEV *> &VisitedRegs,
 806                        const Loop *L,
 807                        const SmallVectorImpl<int64_t> &Offsets,
 808                        ScalarEvolution &SE, DominatorTree &DT) {
 809   // Tally up the registers.
 810   if (const SCEV *ScaledReg = F.ScaledReg) {
 811     if (VisitedRegs.count(ScaledReg)) {
 812       Loose();
 813       return;
 814     }
 815     RatePrimaryRegister(ScaledReg, Regs, L, SE, DT);
 816     if (isLoser())
 817       return;
 818   }
 819   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
 820        E = F.BaseRegs.end(); I != E; ++I) {
 821     const SCEV *BaseReg = *I;
 822     if (VisitedRegs.count(BaseReg)) {
 823       Loose();
 824       return;
 825     }
 826     RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
 827     if (isLoser())
 828       return;
 829   }
 830
 831   // Determine how many (unfolded) adds we'll need inside the loop.
 832   size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
 833   if (NumBaseParts > 1)
 834     NumBaseAdds += NumBaseParts - 1;
 835
 836   // Tally up the non-zero immediates.
 837   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
 838        E = Offsets.end(); I != E; ++I) {
 839     int64_t Offset = (uint64_t)*I + F.AM.BaseOffs;
 840     if (F.AM.BaseGV)
 841       ImmCost += 64; // Handle symbolic values conservatively.
 842                      // TODO: This should probably be the pointer size.
 843     else if (Offset != 0)
 844       ImmCost += APInt(64, Offset, true).getMinSignedBits();
 845   }
 846   assert(isValid() && "invalid cost");
 847 }
 848
 849 /// Loose - Set this cost to a losing value.
 850 void Cost::Loose() {
 851   NumRegs = ~0u;
 852   AddRecCost = ~0u;
 853   NumIVMuls = ~0u;
 854   NumBaseAdds = ~0u;
 855   ImmCost = ~0u;
 856   SetupCost = ~0u;
 857 }
 858
 859 /// operator< - Choose the lower cost.
 860 bool Cost::operator<(const Cost &Other) const {
 861   if (NumRegs != Other.NumRegs)
 862     return NumRegs < Other.NumRegs;
 863   if (AddRecCost != Other.AddRecCost)
 864     return AddRecCost < Other.AddRecCost;
 865   if (NumIVMuls != Other.NumIVMuls)
 866     return NumIVMuls < Other.NumIVMuls;
 867   if (NumBaseAdds != Other.NumBaseAdds)
 868     return NumBaseAdds < Other.NumBaseAdds;
 869   if (ImmCost != Other.ImmCost)
 870     return ImmCost < Other.ImmCost;
 871   if (SetupCost != Other.SetupCost)
 872     return SetupCost < Other.SetupCost;
 873   return false;
 874 }
 875
 876 void Cost::print(raw_ostream &OS) const {
 877   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
 878   if (AddRecCost != 0)
 879     OS << ", with addrec cost " << AddRecCost;
 880   if (NumIVMuls != 0)
 881     OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
 882   if (NumBaseAdds != 0)
 883     OS << ", plus " << NumBaseAdds << " base add"
 884        << (NumBaseAdds == 1 ? "" : "s");
 885   if (ImmCost != 0)
 886     OS << ", plus " << ImmCost << " imm cost";
 887   if (SetupCost != 0)
 888     OS << ", plus " << SetupCost << " setup cost";
 889 }
 890
 891 void Cost::dump() const {
 892   print(errs()); errs() << '\n';
 893 }
 894
 895 namespace {
 896
 897 /// LSRFixup - An operand value in an instruction which is to be replaced
 898 /// with some equivalent, possibly strength-reduced, replacement.
 899 struct LSRFixup {
 900   /// UserInst - The instruction which will be updated.
 901   Instruction *UserInst;
 902
 903   /// OperandValToReplace - The operand of the instruction which will
 904   /// be replaced. The operand may be used more than once; every instance
 905   /// will be replaced.
 906   Value *OperandValToReplace;
 907
 908   /// PostIncLoops - If this user is to use the post-incremented value of an
 909   /// induction variable, this variable is non-null and holds the loop
 910   /// associated with the induction variable.
 911   PostIncLoopSet PostIncLoops;
 912
 913   /// LUIdx - The index of the LSRUse describing the expression which
 914   /// this fixup needs, minus an offset (below).
 915   size_t LUIdx;
 916
 917   /// Offset - A constant offset to be added to the LSRUse expression.
 918   /// This allows multiple fixups to share the same LSRUse with different
 919   /// offsets, for example in an unrolled loop.
 920   int64_t Offset;
 921
 922   bool isUseFullyOutsideLoop(const Loop *L) const;
 923
 924   LSRFixup();
 925
 926   void print(raw_ostream &OS) const;
 927   void dump() const;
 928 };
 929
 930 }
 931
 932 LSRFixup::LSRFixup()
 933   : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {}
 934
 935 /// isUseFullyOutsideLoop - Test whether this fixup always uses its
 936 /// value outside of the given loop.
 937 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
 938   // PHI nodes use their value in their incoming blocks.
 939   if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
 940     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
 941       if (PN->getIncomingValue(i) == OperandValToReplace &&
 942           L->contains(PN->getIncomingBlock(i)))
 943         return false;
 944     return true;
 945   }
 946
 947   return !L->contains(UserInst);
 948 }
 949
 950 void LSRFixup::print(raw_ostream &OS) const {
 951   OS << "UserInst=";
 952   // Store is common and interesting enough to be worth special-casing.
 953   if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
 954     OS << "store ";
 955     WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
 956   } else if (UserInst->getType()->isVoidTy())
 957     OS << UserInst->getOpcodeName();
 958   else
 959     WriteAsOperand(OS, UserInst, /*PrintType=*/false);
 960
 961   OS << ", OperandValToReplace=";
 962   WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
 963
 964   for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
 965        E = PostIncLoops.end(); I != E; ++I) {
 966     OS << ", PostIncLoop=";
 967     WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false);
 968   }
 969
 970   if (LUIdx != ~size_t(0))
 971     OS << ", LUIdx=" << LUIdx;
 972
 973   if (Offset != 0)
 974     OS << ", Offset=" << Offset;
 975 }
 976
 977 void LSRFixup::dump() const {
 978   print(errs()); errs() << '\n';
 979 }
 980
 981 namespace {
 982
 983 /// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
 984 /// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
 985 struct UniquifierDenseMapInfo {
 986   static SmallVector<const SCEV *, 2> getEmptyKey() {
 987     SmallVector<const SCEV *, 2> V;
 988     V.push_back(reinterpret_cast<const SCEV *>(-1));
 989     return V;
 990   }
 991
 992   static SmallVector<const SCEV *, 2> getTombstoneKey() {
 993     SmallVector<const SCEV *, 2> V;
 994     V.push_back(reinterpret_cast<const SCEV *>(-2));
 995     return V;
 996   }
 997
 998   static unsigned getHashValue(const SmallVector<const SCEV *, 2> &V) {
 999     unsigned Result = 0;
1000     for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(),
1001          E = V.end(); I != E; ++I)
1002       Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I);
1003     return Result;
1004   }
1005
1006   static bool isEqual(const SmallVector<const SCEV *, 2> &LHS,
1007                       const SmallVector<const SCEV *, 2> &RHS) {
1008     return LHS == RHS;
1009   }
1010 };
1011
1012 /// LSRUse - This class holds the state that LSR keeps for each use in
1013 /// IVUsers, as well as uses invented by LSR itself. It includes information
1014 /// about what kinds of things can be folded into the user, information about
1015 /// the user itself, and information about how the use may be satisfied.
1016 /// TODO: Represent multiple users of the same expression in common?
1017 class LSRUse {
1018   DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier;
1019
1020 public:
1021   /// KindType - An enum for a kind of use, indicating what types of
1022   /// scaled and immediate operands it might support.
1023   enum KindType {
1024     Basic,   ///< A normal use, with no folding.
1025     Special, ///< A special case of basic, allowing -1 scales.
1026     Address, ///< An address use; folding according to TargetLowering
1027     ICmpZero ///< An equality icmp with both operands folded into one.
1028     // TODO: Add a generic icmp too?
1029   };
1030
1031   KindType Kind;
1032   Type *AccessTy;
1033
1034   SmallVector<int64_t, 8> Offsets;
1035   int64_t MinOffset;
1036   int64_t MaxOffset;
1037
1038   /// AllFixupsOutsideLoop - This records whether all of the fixups using this
1039   /// LSRUse are outside of the loop, in which case some special-case heuristics
1040   /// may be used.
1041   bool AllFixupsOutsideLoop;
1042
1043   /// WidestFixupType - This records the widest use type for any fixup using
1044   /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
1045   /// max fixup widths to be equivalent, because the narrower one may be relying
1046   /// on the implicit truncation to truncate away bogus bits.
1047   Type *WidestFixupType;
1048
1049   /// Formulae - A list of ways to build a value that can satisfy this user.
1050   /// After the list is populated, one of these is selected heuristically and
1051   /// used to formulate a replacement for OperandValToReplace in UserInst.
1052   SmallVector<Formula, 12> Formulae;
1053
1054   /// Regs - The set of register candidates used by all formulae in this LSRUse.
1055   SmallPtrSet<const SCEV *, 4> Regs;
1056
1057   LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
1058                                       MinOffset(INT64_MAX),
1059                                       MaxOffset(INT64_MIN),
1060                                       AllFixupsOutsideLoop(true),
1061                                       WidestFixupType(0) {}
1062
1063   bool HasFormulaWithSameRegs(const Formula &F) const;
1064   bool InsertFormula(const Formula &F);
1065   void DeleteFormula(Formula &F);
1066   void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1067
1068   void print(raw_ostream &OS) const;
1069   void dump() const;
1070 };
1071
1072 }
1073
1074 /// HasFormula - Test whether this use as a formula which has the same
1075 /// registers as the given formula.
1076 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1077   SmallVector<const SCEV *, 2> Key = F.BaseRegs;
1078   if (F.ScaledReg) Key.push_back(F.ScaledReg);
1079   // Unstable sort by host order ok, because this is only used for uniquifying.
1080   std::sort(Key.begin(), Key.end());
1081   return Uniquifier.count(Key);
1082 }
1083
1084 /// InsertFormula - If the given formula has not yet been inserted, add it to
1085 /// the list, and return true. Return false otherwise.
1086 bool LSRUse::InsertFormula(const Formula &F) {
1087   SmallVector<const SCEV *, 2> Key = F.BaseRegs;
1088   if (F.ScaledReg) Key.push_back(F.ScaledReg);
1089   // Unstable sort by host order ok, because this is only used for uniquifying.
1090   std::sort(Key.begin(), Key.end());
1091
1092   if (!Uniquifier.insert(Key).second)
1093     return false;
1094
1095   // Using a register to hold the value of 0 is not profitable.
1096   assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1097          "Zero allocated in a scaled register!");
1098 #ifndef NDEBUG
1099   for (SmallVectorImpl<const SCEV *>::const_iterator I =
1100        F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
1101     assert(!(*I)->isZero() && "Zero allocated in a base register!");
1102 #endif
1103
1104   // Add the formula to the list.
1105   Formulae.push_back(F);
1106
1107   // Record registers now being used by this use.
1108   if (F.ScaledReg) Regs.insert(F.ScaledReg);
1109   Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1110
1111   return true;
1112 }
1113
1114 /// DeleteFormula - Remove the given formula from this use's list.
1115 void LSRUse::DeleteFormula(Formula &F) {
1116   if (&F != &Formulae.back())
1117     std::swap(F, Formulae.back());
1118   Formulae.pop_back();
1119   assert(!Formulae.empty() && "LSRUse has no formulae left!");
1120 }
1121
1122 /// RecomputeRegs - Recompute the Regs field, and update RegUses.
1123 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1124   // Now that we've filtered out some formulae, recompute the Regs set.
1125   SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
1126   Regs.clear();
1127   for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
1128        E = Formulae.end(); I != E; ++I) {
1129     const Formula &F = *I;
1130     if (F.ScaledReg) Regs.insert(F.ScaledReg);
1131     Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1132   }
1133
1134   // Update the RegTracker.
1135   for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
1136        E = OldRegs.end(); I != E; ++I)
1137     if (!Regs.count(*I))
1138       RegUses.DropRegister(*I, LUIdx);
1139 }
1140
1141 void LSRUse::print(raw_ostream &OS) const {
1142   OS << "LSR Use: Kind=";
1143   switch (Kind) {
1144   case Basic:    OS << "Basic"; break;
1145   case Special:  OS << "Special"; break;
1146   case ICmpZero: OS << "ICmpZero"; break;
1147   case Address:
1148     OS << "Address of ";
1149     if (AccessTy->isPointerTy())
1150       OS << "pointer"; // the full pointer type could be really verbose
1151     else
1152       OS << *AccessTy;
1153   }
1154
1155   OS << ", Offsets={";
1156   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
1157        E = Offsets.end(); I != E; ++I) {
1158     OS << *I;
1159     if (llvm::next(I) != E)
1160       OS << ',';
1161   }
1162   OS << '}';
1163
1164   if (AllFixupsOutsideLoop)
1165     OS << ", all-fixups-outside-loop";
1166
1167   if (WidestFixupType)
1168     OS << ", widest fixup type: " << *WidestFixupType;
1169 }
1170
1171 void LSRUse::dump() const {
1172   print(errs()); errs() << '\n';
1173 }
1174
1175 /// isLegalUse - Test whether the use described by AM is "legal", meaning it can
1176 /// be completely folded into the user instruction at isel time. This includes
1177 /// address-mode folding and special icmp tricks.
1178 static bool isLegalUse(const TargetLowering::AddrMode &AM,
1179                        LSRUse::KindType Kind, Type *AccessTy,
1180                        const TargetLowering *TLI) {
1181   switch (Kind) {
1182   case LSRUse::Address:
1183     // If we have low-level target information, ask the target if it can
1184     // completely fold this address.
1185     if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);
1186
1187     // Otherwise, just guess that reg+reg addressing is legal.
1188     return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;
1189
1190   case LSRUse::ICmpZero:
1191     // There's not even a target hook for querying whether it would be legal to
1192     // fold a GV into an ICmp.
1193     if (AM.BaseGV)
1194       return false;
1195
1196     // ICmp only has two operands; don't allow more than two non-trivial parts.
1197     if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
1198       return false;
1199
1200     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1201     // putting the scaled register in the other operand of the icmp.
1202     if (AM.Scale != 0 && AM.Scale != -1)
1203       return false;
1204
1205     // If we have low-level target information, ask the target if it can fold an
1206     // integer immediate on an icmp.
1207     if (AM.BaseOffs != 0) {
1208       if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs);
1209       return false;
1210     }
1211
1212     return true;
1213
1214   case LSRUse::Basic:
1215     // Only handle single-register values.
1216     return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;
1217
1218   case LSRUse::Special:
1219     // Only handle -1 scales, or no scale.
1220     return AM.Scale == 0 || AM.Scale == -1;
1221   }
1222
1223   return false;
1224 }
1225
1226 static bool isLegalUse(TargetLowering::AddrMode AM,
1227                        int64_t MinOffset, int64_t MaxOffset,
1228                        LSRUse::KindType Kind, Type *AccessTy,
1229                        const TargetLowering *TLI) {
1230   // Check for overflow.
1231   if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
1232       (MinOffset > 0))
1233     return false;
1234   AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset;
1235   if (isLegalUse(AM, Kind, AccessTy, TLI)) {
1236     AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset;
1237     // Check for overflow.
1238     if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) !=
1239         (MaxOffset > 0))
1240       return false;
1241     AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset;
1242     return isLegalUse(AM, Kind, AccessTy, TLI);
1243   }
1244   return false;
1245 }
1246
1247 static bool isAlwaysFoldable(int64_t BaseOffs,
1248                              GlobalValue *BaseGV,
1249                              bool HasBaseReg,
1250                              LSRUse::KindType Kind, Type *AccessTy,
1251                              const TargetLowering *TLI) {
1252   // Fast-path: zero is always foldable.
1253   if (BaseOffs == 0 && !BaseGV) return true;
1254
1255   // Conservatively, create an address with an immediate and a
1256   // base and a scale.
1257   TargetLowering::AddrMode AM;
1258   AM.BaseOffs = BaseOffs;
1259   AM.BaseGV = BaseGV;
1260   AM.HasBaseReg = HasBaseReg;
1261   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1262
1263   // Canonicalize a scale of 1 to a base register if the formula doesn't
1264   // already have a base register.
1265   if (!AM.HasBaseReg && AM.Scale == 1) {
1266     AM.Scale = 0;
1267     AM.HasBaseReg = true;
1268   }
1269
1270   return isLegalUse(AM, Kind, AccessTy, TLI);
1271 }
1272
1273 static bool isAlwaysFoldable(const SCEV *S,
1274                              int64_t MinOffset, int64_t MaxOffset,
1275                              bool HasBaseReg,
1276                              LSRUse::KindType Kind, Type *AccessTy,
1277                              const TargetLowering *TLI,
1278                              ScalarEvolution &SE) {
1279   // Fast-path: zero is always foldable.
1280   if (S->isZero()) return true;
1281
1282   // Conservatively, create an address with an immediate and a
1283   // base and a scale.
1284   int64_t BaseOffs = ExtractImmediate(S, SE);
1285   GlobalValue *BaseGV = ExtractSymbol(S, SE);
1286
1287   // If there's anything else involved, it's not foldable.
1288   if (!S->isZero()) return false;
1289
1290   // Fast-path: zero is always foldable.
1291   if (BaseOffs == 0 && !BaseGV) return true;
1292
1293   // Conservatively, create an address with an immediate and a
1294   // base and a scale.
1295   TargetLowering::AddrMode AM;
1296   AM.BaseOffs = BaseOffs;
1297   AM.BaseGV = BaseGV;
1298   AM.HasBaseReg = HasBaseReg;
1299   AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1300
1301   return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
1302 }
1303
1304 namespace {
1305
1306 /// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
1307 /// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
1308 struct UseMapDenseMapInfo {
1309   static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
1310     return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
1311   }
1312
1313   static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
1314     return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
1315   }
1316
1317   static unsigned
1318   getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
1319     unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
1320     Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
1321     return Result;
1322   }
1323
1324   static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
1325                       const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
1326     return LHS == RHS;
1327   }
1328 };
1329
1330 /// LSRInstance - This class holds state for the main loop strength reduction
1331 /// logic.
1332 class LSRInstance {
1333   IVUsers &IU;
1334   ScalarEvolution &SE;
1335   DominatorTree &DT;
1336   LoopInfo &LI;
1337   const TargetLowering *const TLI;
1338   Loop *const L;
1339   bool Changed;
1340
1341   /// IVIncInsertPos - This is the insert position that the current loop's
1342   /// induction variable increment should be placed. In simple loops, this is
1343   /// the latch block's terminator. But in more complicated cases, this is a
1344   /// position which will dominate all the in-loop post-increment users.
1345   Instruction *IVIncInsertPos;
1346
1347   /// Factors - Interesting factors between use strides.
1348   SmallSetVector<int64_t, 8> Factors;
1349
1350   /// Types - Interesting use types, to facilitate truncation reuse.
1351   SmallSetVector<Type *, 4> Types;
1352
1353   /// Fixups - The list of operands which are to be replaced.
1354   SmallVector<LSRFixup, 16> Fixups;
1355
1356   /// Uses - The list of interesting uses.
1357   SmallVector<LSRUse, 16> Uses;
1358
1359   /// RegUses - Track which uses use which register candidates.
1360   RegUseTracker RegUses;
1361
1362   void OptimizeShadowIV();
1363   bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
1364   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
1365   void OptimizeLoopTermCond();
1366
1367   void CollectInterestingTypesAndFactors();
1368   void CollectFixupsAndInitialFormulae();
1369
1370   LSRFixup &getNewFixup() {
1371     Fixups.push_back(LSRFixup());
1372     return Fixups.back();
1373   }
1374
1375   // Support for sharing of LSRUses between LSRFixups.
1376   typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>,
1377                    size_t,
1378                    UseMapDenseMapInfo> UseMapTy;
1379   UseMapTy UseMap;
1380
1381   bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
1382                           LSRUse::KindType Kind, Type *AccessTy);
1383
1384   std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
1385                                     LSRUse::KindType Kind,
1386                                     Type *AccessTy);
1387
1388   void DeleteUse(LSRUse &LU, size_t LUIdx);
1389
1390   LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
1391
1392 public:
1393   void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1394   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1395   void CountRegisters(const Formula &F, size_t LUIdx);
1396   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
1397
1398   void CollectLoopInvariantFixupsAndFormulae();
1399
1400   void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
1401                               unsigned Depth = 0);
1402   void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
1403   void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1404   void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1405   void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1406   void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1407   void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
1408   void GenerateCrossUseConstantOffsets();
1409   void GenerateAllReuseFormulae();
1410
1411   void FilterOutUndesirableDedicatedRegisters();
1412
1413   size_t EstimateSearchSpaceComplexity() const;
1414   void NarrowSearchSpaceByDetectingSupersets();
1415   void NarrowSearchSpaceByCollapsingUnrolledCode();
1416   void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
1417   void NarrowSearchSpaceByPickingWinnerRegs();
1418   void NarrowSearchSpaceUsingHeuristics();
1419
1420   void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
1421                     Cost &SolutionCost,
1422                     SmallVectorImpl<const Formula *> &Workspace,
1423                     const Cost &CurCost,
1424                     const SmallPtrSet<const SCEV *, 16> &CurRegs,
1425                     DenseSet<const SCEV *> &VisitedRegs) const;
1426   void Solve(SmallVectorImpl<const Formula *> &Solution) const;
1427
1428   BasicBlock::iterator
1429     HoistInsertPosition(BasicBlock::iterator IP,
1430                         const SmallVectorImpl<Instruction *> &Inputs) const;
1431   BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
1432                                                      const LSRFixup &LF,
1433                                                      const LSRUse &LU) const;
1434
1435   Value *Expand(const LSRFixup &LF,
1436                 const Formula &F,
1437                 BasicBlock::iterator IP,
1438                 SCEVExpander &Rewriter,
1439                 SmallVectorImpl<WeakVH> &DeadInsts) const;
1440   void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
1441                      const Formula &F,
1442                      SCEVExpander &Rewriter,
1443                      SmallVectorImpl<WeakVH> &DeadInsts,
1444                      Pass *P) const;
1445   void Rewrite(const LSRFixup &LF,
1446                const Formula &F,
1447                SCEVExpander &Rewriter,
1448                SmallVectorImpl<WeakVH> &DeadInsts,
1449                Pass *P) const;
1450   void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
1451                          Pass *P);
1452
1453   LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);
1454
1455   bool getChanged() const { return Changed; }
1456
1457   void print_factors_and_types(raw_ostream &OS) const;
1458   void print_fixups(raw_ostream &OS) const;
1459   void print_uses(raw_ostream &OS) const;
1460   void print(raw_ostream &OS) const;
1461   void dump() const;
1462 };
1463
1464 }
1465
1466 /// OptimizeShadowIV - If IV is used in a int-to-float cast
1467 /// inside the loop then try to eliminate the cast operation.
1468 void LSRInstance::OptimizeShadowIV() {
1469   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1470   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1471     return;
1472
1473   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
1474        UI != E; /* empty */) {
1475     IVUsers::const_iterator CandidateUI = UI;
1476     ++UI;
1477     Instruction *ShadowUse = CandidateUI->getUser();
1478     Type *DestTy = NULL;
1479     bool IsSigned = false;
1480
1481     /* If shadow use is a int->float cast then insert a second IV
1482        to eliminate this cast.
1483
1484          for (unsigned i = 0; i < n; ++i)
1485            foo((double)i);
1486
1487        is transformed into
1488
1489          double d = 0.0;
1490          for (unsigned i = 0; i < n; ++i, ++d)
1491            foo(d);
1492     */
1493     if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
1494       IsSigned = false;
1495       DestTy = UCast->getDestTy();
1496     }
1497     else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
1498       IsSigned = true;
1499       DestTy = SCast->getDestTy();
1500     }
1501     if (!DestTy) continue;
1502
1503     if (TLI) {
1504       // If target does not support DestTy natively then do not apply
1505       // this transformation.
1506       EVT DVT = TLI->getValueType(DestTy);
1507       if (!TLI->isTypeLegal(DVT)) continue;
1508     }
1509
1510     PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
1511     if (!PH) continue;
1512     if (PH->getNumIncomingValues() != 2) continue;
1513
1514     Type *SrcTy = PH->getType();
1515     int Mantissa = DestTy->getFPMantissaWidth();
1516     if (Mantissa == -1) continue;
1517     if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
1518       continue;
1519
1520     unsigned Entry, Latch;
1521     if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
1522       Entry = 0;
1523       Latch = 1;
1524     } else {
1525       Entry = 1;
1526       Latch = 0;
1527     }
1528
1529     ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
1530     if (!Init) continue;
1531     Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
1532                                         (double)Init->getSExtValue() :
1533                                         (double)Init->getZExtValue());
1534
1535     BinaryOperator *Incr =
1536       dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
1537     if (!Incr) continue;
1538     if (Incr->getOpcode() != Instruction::Add
1539         && Incr->getOpcode() != Instruction::Sub)
1540       continue;
1541
1542     /* Initialize new IV, double d = 0.0 in above example. */
1543     ConstantInt *C = NULL;
1544     if (Incr->getOperand(0) == PH)
1545       C = dyn_cast<ConstantInt>(Incr->getOperand(1));
1546     else if (Incr->getOperand(1) == PH)
1547       C = dyn_cast<ConstantInt>(Incr->getOperand(0));
1548     else
1549       continue;
1550
1551     if (!C) continue;
1552
1553     // Ignore negative constants, as the code below doesn't handle them
1554     // correctly. TODO: Remove this restriction.
1555     if (!C->getValue().isStrictlyPositive()) continue;
1556
1557     /* Add new PHINode. */
1558     PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
1559
1560     /* create new increment. '++d' in above example. */
1561     Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
1562     BinaryOperator *NewIncr =
1563       BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
1564                                Instruction::FAdd : Instruction::FSub,
1565                              NewPH, CFP, "IV.S.next.", Incr);
1566
1567     NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
1568     NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
1569
1570     /* Remove cast operation */
1571     ShadowUse->replaceAllUsesWith(NewPH);
1572     ShadowUse->eraseFromParent();
1573     Changed = true;
1574     break;
1575   }
1576 }
1577
1578 /// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
1579 /// set the IV user and stride information and return true, otherwise return
1580 /// false.
1581 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
1582   for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
1583     if (UI->getUser() == Cond) {
1584       // NOTE: we could handle setcc instructions with multiple uses here, but
1585       // InstCombine does it as well for simple uses, it's not clear that it
1586       // occurs enough in real life to handle.
1587       CondUse = UI;
1588       return true;
1589     }
1590   return false;
1591 }
1592
1593 /// OptimizeMax - Rewrite the loop's terminating condition if it uses
1594 /// a max computation.
1595 ///
1596 /// This is a narrow solution to a specific, but acute, problem. For loops
1597 /// like this:
1598 ///
1599 ///   i = 0;
1600 ///   do {
1601 ///     p[i] = 0.0;
1602 ///   } while (++i < n);
1603 ///
1604 /// the trip count isn't just 'n', because 'n' might not be positive. And
1605 /// unfortunately this can come up even for loops where the user didn't use
1606 /// a C do-while loop. For example, seemingly well-behaved top-test loops
1607 /// will commonly be lowered like this:
1608 //
1609 ///   if (n > 0) {
1610 ///     i = 0;
1611 ///     do {
1612 ///       p[i] = 0.0;
1613 ///     } while (++i < n);
1614 ///   }
1615 ///
1616 /// and then it's possible for subsequent optimization to obscure the if
1617 /// test in such a way that indvars can't find it.
1618 ///
1619 /// When indvars can't find the if test in loops like this, it creates a
1620 /// max expression, which allows it to give the loop a canonical
1621 /// induction variable:
1622 ///
1623 ///   i = 0;
1624 ///   max = n < 1 ? 1 : n;
1625 ///   do {
1626 ///     p[i] = 0.0;
1627 ///   } while (++i != max);
1628 ///
1629 /// Canonical induction variables are necessary because the loop passes
1630 /// are designed around them. The most obvious example of this is the
1631 /// LoopInfo analysis, which doesn't remember trip count values. It
1632 /// expects to be able to rediscover the trip count each time it is
1633 /// needed, and it does this using a simple analysis that only succeeds if
1634 /// the loop has a canonical induction variable.
1635 ///
1636 /// However, when it comes time to generate code, the maximum operation
1637 /// can be quite costly, especially if it's inside of an outer loop.
1638 ///
1639 /// This function solves this problem by detecting this type of loop and
1640 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
1641 /// the instructions for the maximum computation.
1642 ///
1643 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
1644   // Check that the loop matches the pattern we're looking for.
1645   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
1646       Cond->getPredicate() != CmpInst::ICMP_NE)
1647     return Cond;
1648
1649   SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
1650   if (!Sel || !Sel->hasOneUse()) return Cond;
1651
1652   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1653   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1654     return Cond;
1655   const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
1656
1657   // Add one to the backedge-taken count to get the trip count.
1658   const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
1659   if (IterationCount != SE.getSCEV(Sel)) return Cond;
1660
1661   // Check for a max calculation that matches the pattern. There's no check
1662   // for ICMP_ULE here because the comparison would be with zero, which
1663   // isn't interesting.
1664   CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
1665   const SCEVNAryExpr *Max = 0;
1666   if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
1667     Pred = ICmpInst::ICMP_SLE;
1668     Max = S;
1669   } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
1670     Pred = ICmpInst::ICMP_SLT;
1671     Max = S;
1672   } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
1673     Pred = ICmpInst::ICMP_ULT;
1674     Max = U;
1675   } else {
1676     // No match; bail.
1677     return Cond;
1678   }
1679
1680   // To handle a max with more than two operands, this optimization would
1681   // require additional checking and setup.
1682   if (Max->getNumOperands() != 2)
1683     return Cond;
1684
1685   const SCEV *MaxLHS = Max->getOperand(0);
1686   const SCEV *MaxRHS = Max->getOperand(1);
1687
1688   // ScalarEvolution canonicalizes constants to the left. For < and >, look
1689   // for a comparison with 1. For <= and >=, a comparison with zero.
1690   if (!MaxLHS ||
1691       (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
1692     return Cond;
1693
1694   // Check the relevant induction variable for conformance to
1695   // the pattern.
1696   const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
1697   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
1698   if (!AR || !AR->isAffine() ||
1699       AR->getStart() != One ||
1700       AR->getStepRecurrence(SE) != One)
1701     return Cond;
1702
1703   assert(AR->getLoop() == L &&
1704          "Loop condition operand is an addrec in a different loop!");
1705
1706   // Check the right operand of the select, and remember it, as it will
1707   // be used in the new comparison instruction.
1708   Value *NewRHS = 0;
1709   if (ICmpInst::isTrueWhenEqual(Pred)) {
1710     // Look for n+1, and grab n.
1711     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
1712       if (isa<ConstantInt>(BO->getOperand(1)) &&
1713           cast<ConstantInt>(BO->getOperand(1))->isOne() &&
1714           SE.getSCEV(BO->getOperand(0)) == MaxRHS)
1715         NewRHS = BO->getOperand(0);
1716     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
1717       if (isa<ConstantInt>(BO->getOperand(1)) &&
1718           cast<ConstantInt>(BO->getOperand(1))->isOne() &&
1719           SE.getSCEV(BO->getOperand(0)) == MaxRHS)
1720         NewRHS = BO->getOperand(0);
1721     if (!NewRHS)
1722       return Cond;
1723   } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
1724     NewRHS = Sel->getOperand(1);
1725   else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
1726     NewRHS = Sel->getOperand(2);
1727   else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
1728     NewRHS = SU->getValue();
1729   else
1730     // Max doesn't match expected pattern.
1731     return Cond;
1732
1733   // Determine the new comparison opcode. It may be signed or unsigned,
1734   // and the original comparison may be either equality or inequality.
1735   if (Cond->getPredicate() == CmpInst::ICMP_EQ)
1736     Pred = CmpInst::getInversePredicate(Pred);
1737
1738   // Ok, everything looks ok to change the condition into an SLT or SGE and
1739   // delete the max calculation.
1740   ICmpInst *NewCond =
1741     new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
1742
1743   // Delete the max calculation instructions.
1744   Cond->replaceAllUsesWith(NewCond);
1745   CondUse->setUser(NewCond);
1746   Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
1747   Cond->eraseFromParent();
1748   Sel->eraseFromParent();
1749   if (Cmp->use_empty())
1750     Cmp->eraseFromParent();
1751   return NewCond;
1752 }
1753
1754 /// OptimizeLoopTermCond - Change loop terminating condition to use the
1755 /// postinc iv when possible.
1756 void
1757 LSRInstance::OptimizeLoopTermCond() {
1758   SmallPtrSet<Instruction *, 4> PostIncs;
1759
1760   BasicBlock *LatchBlock = L->getLoopLatch();
1761   SmallVector<BasicBlock*, 8> ExitingBlocks;
1762   L->getExitingBlocks(ExitingBlocks);
1763
1764   for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
1765     BasicBlock *ExitingBlock = ExitingBlocks[i];
1766
1767     // Get the terminating condition for the loop if possible.  If we
1768     // can, we want to change it to use a post-incremented version of its
1769     // induction variable, to allow coalescing the live ranges for the IV into
1770     // one register value.
1771
1772     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
1773     if (!TermBr)
1774       continue;
1775     // FIXME: Overly conservative, termination condition could be an 'or' etc..
1776     if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
1777       continue;
1778
1779     // Search IVUsesByStride to find Cond's IVUse if there is one.
1780     IVStrideUse *CondUse = 0;
1781     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
1782     if (!FindIVUserForCond(Cond, CondUse))
1783       continue;
1784
1785     // If the trip count is computed in terms of a max (due to ScalarEvolution
1786     // being unable to find a sufficient guard, for example), change the loop
1787     // comparison to use SLT or ULT instead of NE.
1788     // One consequence of doing this now is that it disrupts the count-down
1789     // optimization. That's not always a bad thing though, because in such
1790     // cases it may still be worthwhile to avoid a max.
1791     Cond = OptimizeMax(Cond, CondUse);
1792
1793     // If this exiting block dominates the latch block, it may also use
1794     // the post-inc value if it won't be shared with other uses.
1795     // Check for dominance.
1796     if (!DT.dominates(ExitingBlock, LatchBlock))
1797       continue;
1798
1799     // Conservatively avoid trying to use the post-inc value in non-latch
1800     // exits if there may be pre-inc users in intervening blocks.
1801     if (LatchBlock != ExitingBlock)
1802       for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
1803         // Test if the use is reachable from the exiting block. This dominator
1804         // query is a conservative approximation of reachability.
1805         if (&*UI != CondUse &&
1806             !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
1807           // Conservatively assume there may be reuse if the quotient of their
1808           // strides could be a legal scale.
1809           const SCEV *A = IU.getStride(*CondUse, L);
1810           const SCEV *B = IU.getStride(*UI, L);
1811           if (!A || !B) continue;
1812           if (SE.getTypeSizeInBits(A->getType()) !=
1813               SE.getTypeSizeInBits(B->getType())) {
1814             if (SE.getTypeSizeInBits(A->getType()) >
1815                 SE.getTypeSizeInBits(B->getType()))
1816               B = SE.getSignExtendExpr(B, A->getType());
1817             else
1818               A = SE.getSignExtendExpr(A, B->getType());
1819           }
1820           if (const SCEVConstant *D =
1821                 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
1822             const ConstantInt *C = D->getValue();
1823             // Stride of one or negative one can have reuse with non-addresses.
1824             if (C->isOne() || C->isAllOnesValue())
1825               goto decline_post_inc;
1826             // Avoid weird situations.
1827             if (C->getValue().getMinSignedBits() >= 64 ||
1828                 C->getValue().isMinSignedValue())
1829               goto decline_post_inc;
1830             // Without TLI, assume that any stride might be valid, and so any
1831             // use might be shared.
1832             if (!TLI)
1833               goto decline_post_inc;
1834             // Check for possible scaled-address reuse.
1835             Type *AccessTy = getAccessType(UI->getUser());
1836             TargetLowering::AddrMode AM;
1837             AM.Scale = C->getSExtValue();
1838             if (TLI->isLegalAddressingMode(AM, AccessTy))
1839               goto decline_post_inc;
1840             AM.Scale = -AM.Scale;
1841             if (TLI->isLegalAddressingMode(AM, AccessTy))
1842               goto decline_post_inc;
1843           }
1844         }
1845
1846     DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
1847                  << *Cond << '\n');
1848
1849     // It's possible for the setcc instruction to be anywhere in the loop, and
1850     // possible for it to have multiple users.  If it is not immediately before
1851     // the exiting block branch, move it.
1852     if (&*++BasicBlock::iterator(Cond) != TermBr) {
1853       if (Cond->hasOneUse()) {
1854         Cond->moveBefore(TermBr);
1855       } else {
1856         // Clone the terminating condition and insert into the loopend.
1857         ICmpInst *OldCond = Cond;
1858         Cond = cast<ICmpInst>(Cond->clone());
1859         Cond->setName(L->getHeader()->getName() + ".termcond");
1860         ExitingBlock->getInstList().insert(TermBr, Cond);
1861
1862         // Clone the IVUse, as the old use still exists!
1863         CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
1864         TermBr->replaceUsesOfWith(OldCond, Cond);
1865       }
1866     }
1867
1868     // If we get to here, we know that we can transform the setcc instruction to
1869     // use the post-incremented version of the IV, allowing us to coalesce the
1870     // live ranges for the IV correctly.
1871     CondUse->transformToPostInc(L);
1872     Changed = true;
1873
1874     PostIncs.insert(Cond);
1875   decline_post_inc:;
1876   }
1877
1878   // Determine an insertion point for the loop induction variable increment. It
1879   // must dominate all the post-inc comparisons we just set up, and it must
1880   // dominate the loop latch edge.
1881   IVIncInsertPos = L->getLoopLatch()->getTerminator();
1882   for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
1883        E = PostIncs.end(); I != E; ++I) {
1884     BasicBlock *BB =
1885       DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
1886                                     (*I)->getParent());
1887     if (BB == (*I)->getParent())
1888       IVIncInsertPos = *I;
1889     else if (BB != IVIncInsertPos->getParent())
1890       IVIncInsertPos = BB->getTerminator();
1891   }
1892 }
1893
1894 /// reconcileNewOffset - Determine if the given use can accommodate a fixup
1895 /// at the given offset and other details. If so, update the use and
1896 /// return true.
1897 bool
1898 LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
1899                                 LSRUse::KindType Kind, Type *AccessTy) {
1900   int64_t NewMinOffset = LU.MinOffset;
1901   int64_t NewMaxOffset = LU.MaxOffset;
1902   Type *NewAccessTy = AccessTy;
1903
1904   // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
1905   // something conservative, however this can pessimize in the case that one of
1906   // the uses will have all its uses outside the loop, for example.
1907   if (LU.Kind != Kind)
1908     return false;
1909   // Conservatively assume HasBaseReg is true for now.
1910   if (NewOffset < LU.MinOffset) {
1911     if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg,
1912                           Kind, AccessTy, TLI))
1913       return false;
1914     NewMinOffset = NewOffset;
1915   } else if (NewOffset > LU.MaxOffset) {
1916     if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg,
1917                           Kind, AccessTy, TLI))
1918       return false;
1919     NewMaxOffset = NewOffset;
1920   }
1921   // Check for a mismatched access type, and fall back conservatively as needed.
1922   // TODO: Be less conservative when the type is similar and can use the same
1923   // addressing modes.
1924   if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
1925     NewAccessTy = Type::getVoidTy(AccessTy->getContext());
1926
1927   // Update the use.
1928   LU.MinOffset = NewMinOffset;
1929   LU.MaxOffset = NewMaxOffset;
1930   LU.AccessTy = NewAccessTy;
1931   if (NewOffset != LU.Offsets.back())
1932     LU.Offsets.push_back(NewOffset);
1933   return true;
1934 }
1935
1936 /// getUse - Return an LSRUse index and an offset value for a fixup which
1937 /// needs the given expression, with the given kind and optional access type.
1938 /// Either reuse an existing use or create a new one, as needed.
1939 std::pair<size_t, int64_t>
1940 LSRInstance::getUse(const SCEV *&Expr,
1941                     LSRUse::KindType Kind, Type *AccessTy) {
1942   const SCEV *Copy = Expr;
1943   int64_t Offset = ExtractImmediate(Expr, SE);
1944
1945   // Basic uses can't accept any offset, for example.
1946   if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
1947     Expr = Copy;
1948     Offset = 0;
1949   }
1950
1951   std::pair<UseMapTy::iterator, bool> P =
1952     UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0));
1953   if (!P.second) {
1954     // A use already existed with this base.
1955     size_t LUIdx = P.first->second;
1956     LSRUse &LU = Uses[LUIdx];
1957     if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
1958       // Reuse this use.
1959       return std::make_pair(LUIdx, Offset);
1960   }
1961
1962   // Create a new use.
1963   size_t LUIdx = Uses.size();
1964   P.first->second = LUIdx;
1965   Uses.push_back(LSRUse(Kind, AccessTy));
1966   LSRUse &LU = Uses[LUIdx];
1967
1968   // We don't need to track redundant offsets, but we don't need to go out
1969   // of our way here to avoid them.
1970   if (LU.Offsets.empty() || Offset != LU.Offsets.back())
1971     LU.Offsets.push_back(Offset);
1972
1973   LU.MinOffset = Offset;
1974   LU.MaxOffset = Offset;
1975   return std::make_pair(LUIdx, Offset);
1976 }
1977
1978 /// DeleteUse - Delete the given use from the Uses list.
1979 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
1980   if (&LU != &Uses.back())
1981     std::swap(LU, Uses.back());
1982   Uses.pop_back();
1983
1984   // Update RegUses.
1985   RegUses.SwapAndDropUse(LUIdx, Uses.size());
1986 }
1987
1988 /// FindUseWithFormula - Look for a use distinct from OrigLU which is has
1989 /// a formula that has the same registers as the given formula.
1990 LSRUse *
1991 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
1992                                        const LSRUse &OrigLU) {
1993   // Search all uses for the formula. This could be more clever.
1994   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
1995     LSRUse &LU = Uses[LUIdx];
1996     // Check whether this use is close enough to OrigLU, to see whether it's
1997     // worthwhile looking through its formulae.
1998     // Ignore ICmpZero uses because they may contain formulae generated by
1999     // GenerateICmpZeroScales, in which case adding fixup offsets may
2000     // be invalid.
2001     if (&LU != &OrigLU &&
2002         LU.Kind != LSRUse::ICmpZero &&
2003         LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2004         LU.WidestFixupType == OrigLU.WidestFixupType &&
2005         LU.HasFormulaWithSameRegs(OrigF)) {
2006       // Scan through this use's formulae.
2007       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
2008            E = LU.Formulae.end(); I != E; ++I) {
2009         const Formula &F = *I;
2010         // Check to see if this formula has the same registers and symbols
2011         // as OrigF.
2012         if (F.BaseRegs == OrigF.BaseRegs &&
2013             F.ScaledReg == OrigF.ScaledReg &&
2014             F.AM.BaseGV == OrigF.AM.BaseGV &&
2015             F.AM.Scale == OrigF.AM.Scale &&
2016             F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2017           if (F.AM.BaseOffs == 0)
2018             return &LU;
2019           // This is the formula where all the registers and symbols matched;
2020           // there aren't going to be any others. Since we declined it, we
2021           // can skip the rest of the formulae and procede to the next LSRUse.
2022           break;
2023         }
2024       }
2025     }
2026   }
2027
2028   // Nothing looked good.
2029   return 0;
2030 }
2031
2032 void LSRInstance::CollectInterestingTypesAndFactors() {
2033   SmallSetVector<const SCEV *, 4> Strides;
2034
2035   // Collect interesting types and strides.
2036   SmallVector<const SCEV *, 4> Worklist;
2037   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
2038     const SCEV *Expr = IU.getExpr(*UI);
2039
2040     // Collect interesting types.
2041     Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2042
2043     // Add strides for mentioned loops.
2044     Worklist.push_back(Expr);
2045     do {
2046       const SCEV *S = Worklist.pop_back_val();
2047       if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2048         Strides.insert(AR->getStepRecurrence(SE));
2049         Worklist.push_back(AR->getStart());
2050       } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2051         Worklist.append(Add->op_begin(), Add->op_end());
2052       }
2053     } while (!Worklist.empty());
2054   }
2055
2056   // Compute interesting factors from the set of interesting strides.
2057   for (SmallSetVector<const SCEV *, 4>::const_iterator
2058        I = Strides.begin(), E = Strides.end(); I != E; ++I)
2059     for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2060          llvm::next(I); NewStrideIter != E; ++NewStrideIter) {
2061       const SCEV *OldStride = *I;
2062       const SCEV *NewStride = *NewStrideIter;
2063
2064       if (SE.getTypeSizeInBits(OldStride->getType()) !=
2065           SE.getTypeSizeInBits(NewStride->getType())) {
2066         if (SE.getTypeSizeInBits(OldStride->getType()) >
2067             SE.getTypeSizeInBits(NewStride->getType()))
2068           NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2069         else
2070           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2071       }
2072       if (const SCEVConstant *Factor =
2073             dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2074                                                         SE, true))) {
2075         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
2076           Factors.insert(Factor->getValue()->getValue().getSExtValue());
2077       } else if (const SCEVConstant *Factor =
2078                    dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2079                                                                NewStride,
2080                                                                SE, true))) {
2081         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
2082           Factors.insert(Factor->getValue()->getValue().getSExtValue());
2083       }
2084     }
2085
2086   // If all uses use the same type, don't bother looking for truncation-based
2087   // reuse.
2088   if (Types.size() == 1)
2089     Types.clear();
2090
2091   DEBUG(print_factors_and_types(dbgs()));
2092 }
2093
2094 void LSRInstance::CollectFixupsAndInitialFormulae() {
2095   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
2096     // Record the uses.
2097     LSRFixup &LF = getNewFixup();
2098     LF.UserInst = UI->getUser();
2099     LF.OperandValToReplace = UI->getOperandValToReplace();
2100     LF.PostIncLoops = UI->getPostIncLoops();
2101
2102     LSRUse::KindType Kind = LSRUse::Basic;
2103     Type *AccessTy = 0;
2104     if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
2105       Kind = LSRUse::Address;
2106       AccessTy = getAccessType(LF.UserInst);
2107     }
2108
2109     const SCEV *S = IU.getExpr(*UI);
2110
2111     // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
2112     // (N - i == 0), and this allows (N - i) to be the expression that we work
2113     // with rather than just N or i, so we can consider the register
2114     // requirements for both N and i at the same time. Limiting this code to
2115     // equality icmps is not a problem because all interesting loops use
2116     // equality icmps, thanks to IndVarSimplify.
2117     if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
2118       if (CI->isEquality()) {
2119         // Swap the operands if needed to put the OperandValToReplace on the
2120         // left, for consistency.
2121         Value *NV = CI->getOperand(1);
2122         if (NV == LF.OperandValToReplace) {
2123           CI->setOperand(1, CI->getOperand(0));
2124           CI->setOperand(0, NV);
2125           NV = CI->getOperand(1);
2126           Changed = true;
2127         }
2128
2129         // x == y  -->  x - y == 0
2130         const SCEV *N = SE.getSCEV(NV);
2131         if (SE.isLoopInvariant(N, L)) {
2132           // S is normalized, so normalize N before folding it into S
2133           // to keep the result normalized.
2134           N = TransformForPostIncUse(Normalize, N, CI, 0,
2135                                      LF.PostIncLoops, SE, DT);
2136           Kind = LSRUse::ICmpZero;
2137           S = SE.getMinusSCEV(N, S);
2138         }
2139
2140         // -1 and the negations of all interesting strides (except the negation
2141         // of -1) are now also interesting.
2142         for (size_t i = 0, e = Factors.size(); i != e; ++i)
2143           if (Factors[i] != -1)
2144             Factors.insert(-(uint64_t)Factors[i]);
2145         Factors.insert(-1);
2146       }
2147
2148     // Set up the initial formula for this use.
2149     std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
2150     LF.LUIdx = P.first;
2151     LF.Offset = P.second;
2152     LSRUse &LU = Uses[LF.LUIdx];
2153     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
2154     if (!LU.WidestFixupType ||
2155         SE.getTypeSizeInBits(LU.WidestFixupType) <
2156         SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
2157       LU.WidestFixupType = LF.OperandValToReplace->getType();
2158
2159     // If this is the first use of this LSRUse, give it a formula.
2160     if (LU.Formulae.empty()) {
2161       InsertInitialFormula(S, LU, LF.LUIdx);
2162       CountRegisters(LU.Formulae.back(), LF.LUIdx);
2163     }
2164   }
2165
2166   DEBUG(print_fixups(dbgs()));
2167 }
2168
2169 /// InsertInitialFormula - Insert a formula for the given expression into
2170 /// the given use, separating out loop-variant portions from loop-invariant
2171 /// and loop-computable portions.
2172 void
2173 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
2174   Formula F;
2175   F.InitialMatch(S, L, SE);
2176   bool Inserted = InsertFormula(LU, LUIdx, F);
2177   assert(Inserted && "Initial formula already exists!"); (void)Inserted;
2178 }
2179
2180 /// InsertSupplementalFormula - Insert a simple single-register formula for
2181 /// the given expression into the given use.
2182 void
2183 LSRInstance::InsertSupplementalFormula(const SCEV *S,
2184                                        LSRUse &LU, size_t LUIdx) {
2185   Formula F;
2186   F.BaseRegs.push_back(S);
2187   F.AM.HasBaseReg = true;
2188   bool Inserted = InsertFormula(LU, LUIdx, F);
2189   assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
2190 }
2191
2192 /// CountRegisters - Note which registers are used by the given formula,
2193 /// updating RegUses.
2194 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
2195   if (F.ScaledReg)
2196     RegUses.CountRegister(F.ScaledReg, LUIdx);
2197   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
2198        E = F.BaseRegs.end(); I != E; ++I)
2199     RegUses.CountRegister(*I, LUIdx);
2200 }
2201
2202 /// InsertFormula - If the given formula has not yet been inserted, add it to
2203 /// the list, and return true. Return false otherwise.
2204 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
2205   if (!LU.InsertFormula(F))
2206     return false;
2207
2208   CountRegisters(F, LUIdx);
2209   return true;
2210 }
2211
2212 /// CollectLoopInvariantFixupsAndFormulae - Check for other uses of
2213 /// loop-invariant values which we're tracking. These other uses will pin these
2214 /// values in registers, making them less profitable for elimination.
2215 /// TODO: This currently misses non-constant addrec step registers.
2216 /// TODO: Should this give more weight to users inside the loop?
2217 void
2218 LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
2219   SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
2220   SmallPtrSet<const SCEV *, 8> Inserted;
2221
2222   while (!Worklist.empty()) {
2223     const SCEV *S = Worklist.pop_back_val();
2224
2225     if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
2226       Worklist.append(N->op_begin(), N->op_end());
2227     else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
2228       Worklist.push_back(C->getOperand());
2229     else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
2230       Worklist.push_back(D->getLHS());
2231       Worklist.push_back(D->getRHS());
2232     } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
2233       if (!Inserted.insert(U)) continue;
2234       const Value *V = U->getValue();
2235       if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
2236         // Look for instructions defined outside the loop.
2237         if (L->contains(Inst)) continue;
2238       } else if (isa<UndefValue>(V))
2239         // Undef doesn't have a live range, so it doesn't matter.
2240         continue;
2241       for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
2242            UI != UE; ++UI) {
2243         const Instruction *UserInst = dyn_cast<Instruction>(*UI);
2244         // Ignore non-instructions.
2245         if (!UserInst)
2246           continue;
2247         // Ignore instructions in other functions (as can happen with
2248         // Constants).
2249         if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
2250           continue;
2251         // Ignore instructions not dominated by the loop.
2252         const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
2253           UserInst->getParent() :
2254           cast<PHINode>(UserInst)->getIncomingBlock(
2255             PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
2256         if (!DT.dominates(L->getHeader(), UseBB))
2257           continue;
2258         // Ignore uses which are part of other SCEV expressions, to avoid
2259         // analyzing them multiple times.
2260         if (SE.isSCEVable(UserInst->getType())) {
2261           const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
2262           // If the user is a no-op, look through to its uses.
2263           if (!isa<SCEVUnknown>(UserS))
2264             continue;
2265           if (UserS == U) {
2266             Worklist.push_back(
2267               SE.getUnknown(const_cast<Instruction *>(UserInst)));
2268             continue;
2269           }
2270         }
2271         // Ignore icmp instructions which are already being analyzed.
2272         if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
2273           unsigned OtherIdx = !UI.getOperandNo();
2274           Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
2275           if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
2276             continue;
2277         }
2278
2279         LSRFixup &LF = getNewFixup();
2280         LF.UserInst = const_cast<Instruction *>(UserInst);
2281         LF.OperandValToReplace = UI.getUse();
2282         std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0);
2283         LF.LUIdx = P.first;
2284         LF.Offset = P.second;
2285         LSRUse &LU = Uses[LF.LUIdx];
2286         LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
2287         if (!LU.WidestFixupType ||
2288             SE.getTypeSizeInBits(LU.WidestFixupType) <
2289             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
2290           LU.WidestFixupType = LF.OperandValToReplace->getType();
2291         InsertSupplementalFormula(U, LU, LF.LUIdx);
2292         CountRegisters(LU.Formulae.back(), Uses.size() - 1);
2293         break;
2294       }
2295     }
2296   }
2297 }
2298
2299 /// CollectSubexprs - Split S into subexpressions which can be pulled out into
2300 /// separate registers. If C is non-null, multiply each subexpression by C.
2301 static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
2302                             SmallVectorImpl<const SCEV *> &Ops,
2303                             const Loop *L,
2304                             ScalarEvolution &SE) {
2305   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2306     // Break out add operands.
2307     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
2308          I != E; ++I)
2309       CollectSubexprs(*I, C, Ops, L, SE);
2310     return;
2311   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2312     // Split a non-zero base out of an addrec.
2313     if (!AR->getStart()->isZero()) {
2314       CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
2315                                        AR->getStepRecurrence(SE),
2316                                        AR->getLoop(),
2317                                        //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
2318                                        SCEV::FlagAnyWrap),
2319                       C, Ops, L, SE);
2320       CollectSubexprs(AR->getStart(), C, Ops, L, SE);
2321       return;
2322     }
2323   } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
2324     // Break (C * (a + b + c)) into C*a + C*b + C*c.
2325     if (Mul->getNumOperands() == 2)
2326       if (const SCEVConstant *Op0 =
2327             dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
2328         CollectSubexprs(Mul->getOperand(1),
2329                         C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0,
2330                         Ops, L, SE);
2331         return;
2332       }
2333   }
2334
2335   // Otherwise use the value itself, optionally with a scale applied.
2336   Ops.push_back(C ? SE.getMulExpr(C, S) : S);
2337 }
2338
2339 /// GenerateReassociations - Split out subexpressions from adds and the bases of
2340 /// addrecs.
2341 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
2342                                          Formula Base,
2343                                          unsigned Depth) {
2344   // Arbitrarily cap recursion to protect compile time.
2345   if (Depth >= 3) return;
2346
2347   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2348     const SCEV *BaseReg = Base.BaseRegs[i];
2349
2350     SmallVector<const SCEV *, 8> AddOps;
2351     CollectSubexprs(BaseReg, 0, AddOps, L, SE);
2352
2353     if (AddOps.size() == 1) continue;
2354
2355     for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
2356          JE = AddOps.end(); J != JE; ++J) {
2357
2358       // Loop-variant "unknown" values are uninteresting; we won't be able to
2359       // do anything meaningful with them.
2360       if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
2361         continue;
2362
2363       // Don't pull a constant into a register if the constant could be folded
2364       // into an immediate field.
2365       if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset,
2366                            Base.getNumRegs() > 1,
2367                            LU.Kind, LU.AccessTy, TLI, SE))
2368         continue;
2369
2370       // Collect all operands except *J.
2371       SmallVector<const SCEV *, 8> InnerAddOps
2372         (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
2373       InnerAddOps.append
2374         (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end());
2375
2376       // Don't leave just a constant behind in a register if the constant could
2377       // be folded into an immediate field.
2378       if (InnerAddOps.size() == 1 &&
2379           isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset,
2380                            Base.getNumRegs() > 1,
2381                            LU.Kind, LU.AccessTy, TLI, SE))
2382         continue;
2383
2384       const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
2385       if (InnerSum->isZero())
2386         continue;
2387       Formula F = Base;
2388
2389       // Add the remaining pieces of the add back into the new formula.
2390       const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
2391       if (TLI && InnerSumSC &&
2392           SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
2393           TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
2394                                    InnerSumSC->getValue()->getZExtValue())) {
2395         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
2396                            InnerSumSC->getValue()->getZExtValue();
2397         F.BaseRegs.erase(F.BaseRegs.begin() + i);
2398       } else
2399         F.BaseRegs[i] = InnerSum;
2400
2401       // Add J as its own register, or an unfolded immediate.
2402       const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
2403       if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
2404           TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
2405                                    SC->getValue()->getZExtValue()))
2406         F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
2407                            SC->getValue()->getZExtValue();
2408       else
2409         F.BaseRegs.push_back(*J);
2410
2411       if (InsertFormula(LU, LUIdx, F))
2412         // If that formula hadn't been seen before, recurse to find more like
2413         // it.
2414         GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1);
2415     }
2416   }
2417 }
2418
2419 /// GenerateCombinations - Generate a formula consisting of all of the
2420 /// loop-dominating registers added into a single register.
2421 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
2422                                        Formula Base) {
2423   // This method is only interesting on a plurality of registers.
2424   if (Base.BaseRegs.size() <= 1) return;
2425
2426   Formula F = Base;
2427   F.BaseRegs.clear();
2428   SmallVector<const SCEV *, 4> Ops;
2429   for (SmallVectorImpl<const SCEV *>::const_iterator
2430        I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
2431     const SCEV *BaseReg = *I;
2432     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
2433         !SE.hasComputableLoopEvolution(BaseReg, L))
2434       Ops.push_back(BaseReg);
2435     else
2436       F.BaseRegs.push_back(BaseReg);
2437   }
2438   if (Ops.size() > 1) {
2439     const SCEV *Sum = SE.getAddExpr(Ops);
2440     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
2441     // opportunity to fold something. For now, just ignore such cases
2442     // rather than proceed with zero in a register.
2443     if (!Sum->isZero()) {
2444       F.BaseRegs.push_back(Sum);
2445       (void)InsertFormula(LU, LUIdx, F);
2446     }
2447   }
2448 }
2449
2450 /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
2451 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
2452                                           Formula Base) {
2453   // We can't add a symbolic offset if the address already contains one.
2454   if (Base.AM.BaseGV) return;
2455
2456   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2457     const SCEV *G = Base.BaseRegs[i];
2458     GlobalValue *GV = ExtractSymbol(G, SE);
2459     if (G->isZero() || !GV)
2460       continue;
2461     Formula F = Base;
2462     F.AM.BaseGV = GV;
2463     if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
2464                     LU.Kind, LU.AccessTy, TLI))
2465       continue;
2466     F.BaseRegs[i] = G;
2467     (void)InsertFormula(LU, LUIdx, F);
2468   }
2469 }
2470
2471 /// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
2472 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
2473                                           Formula Base) {
2474   // TODO: For now, just add the min and max offset, because it usually isn't
2475   // worthwhile looking at everything inbetween.
2476   SmallVector<int64_t, 2> Worklist;
2477   Worklist.push_back(LU.MinOffset);
2478   if (LU.MaxOffset != LU.MinOffset)
2479     Worklist.push_back(LU.MaxOffset);
2480
2481   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
2482     const SCEV *G = Base.BaseRegs[i];
2483
2484     for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
2485          E = Worklist.end(); I != E; ++I) {
2486       Formula F = Base;
2487       F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
2488       if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
2489                      LU.Kind, LU.AccessTy, TLI)) {
2490         // Add the offset to the base register.
2491         const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
2492         // If it cancelled out, drop the base register, otherwise update it.
2493         if (NewG->isZero()) {
2494           std::swap(F.BaseRegs[i], F.BaseRegs.back());
2495           F.BaseRegs.pop_back();
2496         } else
2497           F.BaseRegs[i] = NewG;
2498
2499         (void)InsertFormula(LU, LUIdx, F);
2500       }
2501     }
2502
2503     int64_t Imm = ExtractImmediate(G, SE);
2504     if (G->isZero() || Imm == 0)
2505       continue;
2506     Formula F = Base;
2507     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm;
2508     if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
2509                     LU.Kind, LU.AccessTy, TLI))
2510       continue;
2511     F.BaseRegs[i] = G;
2512     (void)InsertFormula(LU, LUIdx, F);
2513   }
2514 }
2515
2516 /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
2517 /// the comparison. For example, x == y -> x*c == y*c.
2518 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
2519                                          Formula Base) {
2520   if (LU.Kind != LSRUse::ICmpZero) return;
2521
2522   // Determine the integer type for the base formula.
2523   Type *IntTy = Base.getType();
2524   if (!IntTy) return;
2525   if (SE.getTypeSizeInBits(IntTy) > 64) return;
2526
2527   // Don't do this if there is more than one offset.
2528   if (LU.MinOffset != LU.MaxOffset) return;
2529
2530   assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
2531
2532   // Check each interesting stride.
2533   for (SmallSetVector<int64_t, 8>::const_iterator
2534        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2535     int64_t Factor = *I;
2536
2537     // Check that the multiplication doesn't overflow.
2538     if (Base.AM.BaseOffs == INT64_MIN && Factor == -1)
2539       continue;
2540     int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
2541     if (NewBaseOffs / Factor != Base.AM.BaseOffs)
2542       continue;
2543
2544     // Check that multiplying with the use offset doesn't overflow.
2545     int64_t Offset = LU.MinOffset;
2546     if (Offset == INT64_MIN && Factor == -1)
2547       continue;
2548     Offset = (uint64_t)Offset * Factor;
2549     if (Offset / Factor != LU.MinOffset)
2550       continue;
2551
2552     Formula F = Base;
2553     F.AM.BaseOffs = NewBaseOffs;
2554
2555     // Check that this scale is legal.
2556     if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
2557       continue;
2558
2559     // Compensate for the use having MinOffset built into it.
2560     F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset;
2561
2562     const SCEV *FactorS = SE.getConstant(IntTy, Factor);
2563
2564     // Check that multiplying with each base register doesn't overflow.
2565     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
2566       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
2567       if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
2568         goto next;
2569     }
2570
2571     // Check that multiplying with the scaled register doesn't overflow.
2572     if (F.ScaledReg) {
2573       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
2574       if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
2575         continue;
2576     }
2577
2578     // Check that multiplying with the unfolded offset doesn't overflow.
2579     if (F.UnfoldedOffset != 0) {
2580       if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
2581         continue;
2582       F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
2583       if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
2584         continue;
2585     }
2586
2587     // If we make it here and it's legal, add it.
2588     (void)InsertFormula(LU, LUIdx, F);
2589   next:;
2590   }
2591 }
2592
2593 /// GenerateScales - Generate stride factor reuse formulae by making use of
2594 /// scaled-offset address modes, for example.
2595 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
2596   // Determine the integer type for the base formula.
2597   Type *IntTy = Base.getType();
2598   if (!IntTy) return;
2599
2600   // If this Formula already has a scaled register, we can't add another one.
2601   if (Base.AM.Scale != 0) return;
2602
2603   // Check each interesting stride.
2604   for (SmallSetVector<int64_t, 8>::const_iterator
2605        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
2606     int64_t Factor = *I;
2607
2608     Base.AM.Scale = Factor;
2609     Base.AM.HasBaseReg = Base.BaseRegs.size() > 1;
2610     // Check whether this scale is going to be legal.
2611     if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
2612                     LU.Kind, LU.AccessTy, TLI)) {
2613       // As a special-case, handle special out-of-loop Basic users specially.
2614       // TODO: Reconsider this special case.
2615       if (LU.Kind == LSRUse::Basic &&
2616           isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
2617                      LSRUse::Special, LU.AccessTy, TLI) &&
2618           LU.AllFixupsOutsideLoop)
2619         LU.Kind = LSRUse::Special;
2620       else
2621         continue;
2622     }
2623     // For an ICmpZero, negating a solitary base register won't lead to
2624     // new solutions.
2625     if (LU.Kind == LSRUse::ICmpZero &&
2626         !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV)
2627       continue;
2628     // For each addrec base reg, apply the scale, if possible.
2629     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
2630       if (const SCEVAddRecExpr *AR =
2631             dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
2632         const SCEV *FactorS = SE.getConstant(IntTy, Factor);
2633         if (FactorS->isZero())
2634           continue;
2635         // Divide out the factor, ignoring high bits, since we'll be
2636         // scaling the value back up in the end.
2637         if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
2638           // TODO: This could be optimized to avoid all the copying.
2639           Formula F = Base;
2640           F.ScaledReg = Quotient;
2641           F.DeleteBaseReg(F.BaseRegs[i]);
2642           (void)InsertFormula(LU, LUIdx, F);
2643         }
2644       }
2645   }
2646 }
2647
2648 /// GenerateTruncates - Generate reuse formulae from different IV types.
2649 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
2650   // This requires TargetLowering to tell us which truncates are free.
2651   if (!TLI) return;
2652
2653   // Don't bother truncating symbolic values.
2654   if (Base.AM.BaseGV) return;
2655
2656   // Determine the integer type for the base formula.
2657   Type *DstTy = Base.getType();
2658   if (!DstTy) return;
2659   DstTy = SE.getEffectiveSCEVType(DstTy);
2660
2661   for (SmallSetVector<Type *, 4>::const_iterator
2662        I = Types.begin(), E = Types.end(); I != E; ++I) {
2663     Type *SrcTy = *I;
2664     if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
2665       Formula F = Base;
2666
2667       if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
2668       for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
2669            JE = F.BaseRegs.end(); J != JE; ++J)
2670         *J = SE.getAnyExtendExpr(*J, SrcTy);
2671
2672       // TODO: This assumes we've done basic processing on all uses and
2673       // have an idea what the register usage is.
2674       if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
2675         continue;
2676
2677       (void)InsertFormula(LU, LUIdx, F);
2678     }
2679   }
2680 }
2681
2682 namespace {
2683
2684 /// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
2685 /// defer modifications so that the search phase doesn't have to worry about
2686 /// the data structures moving underneath it.
2687 struct WorkItem {
2688   size_t LUIdx;
2689   int64_t Imm;
2690   const SCEV *OrigReg;
2691
2692   WorkItem(size_t LI, int64_t I, const SCEV *R)
2693     : LUIdx(LI), Imm(I), OrigReg(R) {}
2694
2695   void print(raw_ostream &OS) const;
2696   void dump() const;
2697 };
2698
2699 }
2700
2701 void WorkItem::print(raw_ostream &OS) const {
2702   OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
2703      << " , add offset " << Imm;
2704 }
2705
2706 void WorkItem::dump() const {
2707   print(errs()); errs() << '\n';
2708 }
2709
2710 /// GenerateCrossUseConstantOffsets - Look for registers which are a constant
2711 /// distance apart and try to form reuse opportunities between them.
2712 void LSRInstance::GenerateCrossUseConstantOffsets() {
2713   // Group the registers by their value without any added constant offset.
2714   typedef std::map<int64_t, const SCEV *> ImmMapTy;
2715   typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
2716   RegMapTy Map;
2717   DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
2718   SmallVector<const SCEV *, 8> Sequence;
2719   for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
2720        I != E; ++I) {
2721     const SCEV *Reg = *I;
2722     int64_t Imm = ExtractImmediate(Reg, SE);
2723     std::pair<RegMapTy::iterator, bool> Pair =
2724       Map.insert(std::make_pair(Reg, ImmMapTy()));
2725     if (Pair.second)
2726       Sequence.push_back(Reg);
2727     Pair.first->second.insert(std::make_pair(Imm, *I));
2728     UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
2729   }
2730
2731   // Now examine each set of registers with the same base value. Build up
2732   // a list of work to do and do the work in a separate step so that we're
2733   // not adding formulae and register counts while we're searching.
2734   SmallVector<WorkItem, 32> WorkItems;
2735   SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
2736   for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
2737        E = Sequence.end(); I != E; ++I) {
2738     const SCEV *Reg = *I;
2739     const ImmMapTy &Imms = Map.find(Reg)->second;
2740
2741     // It's not worthwhile looking for reuse if there's only one offset.
2742     if (Imms.size() == 1)
2743       continue;
2744
2745     DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
2746           for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
2747                J != JE; ++J)
2748             dbgs() << ' ' << J->first;
2749           dbgs() << '\n');
2750
2751     // Examine each offset.
2752     for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
2753          J != JE; ++J) {
2754       const SCEV *OrigReg = J->second;
2755
2756       int64_t JImm = J->first;
2757       const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
2758
2759       if (!isa<SCEVConstant>(OrigReg) &&
2760           UsedByIndicesMap[Reg].count() == 1) {
2761         DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
2762         continue;
2763       }
2764
2765       // Conservatively examine offsets between this orig reg a few selected
2766       // other orig regs.
2767       ImmMapTy::const_iterator OtherImms[] = {
2768         Imms.begin(), prior(Imms.end()),
2769         Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
2770       };
2771       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
2772         ImmMapTy::const_iterator M = OtherImms[i];
2773         if (M == J || M == JE) continue;
2774
2775         // Compute the difference between the two.
2776         int64_t Imm = (uint64_t)JImm - M->first;
2777         for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
2778              LUIdx = UsedByIndices.find_next(LUIdx))
2779           // Make a memo of this use, offset, and register tuple.
2780           if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
2781             WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
2782       }
2783     }
2784   }
2785
2786   Map.clear();
2787   Sequence.clear();
2788   UsedByIndicesMap.clear();
2789   UniqueItems.clear();
2790
2791   // Now iterate through the worklist and add new formulae.
2792   for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
2793        E = WorkItems.end(); I != E; ++I) {
2794     const WorkItem &WI = *I;
2795     size_t LUIdx = WI.LUIdx;
2796     LSRUse &LU = Uses[LUIdx];
2797     int64_t Imm = WI.Imm;
2798     const SCEV *OrigReg = WI.OrigReg;
2799
2800     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
2801     const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
2802     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
2803
2804     // TODO: Use a more targeted data structure.
2805     for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
2806       const Formula &F = LU.Formulae[L];
2807       // Use the immediate in the scaled register.
2808       if (F.ScaledReg == OrigReg) {
2809         int64_t Offs = (uint64_t)F.AM.BaseOffs +
2810                        Imm * (uint64_t)F.AM.Scale;
2811         // Don't create 50 + reg(-50).
2812         if (F.referencesReg(SE.getSCEV(
2813                    ConstantInt::get(IntTy, -(uint64_t)Offs))))
2814           continue;
2815         Formula NewF = F;
2816         NewF.AM.BaseOffs = Offs;
2817         if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
2818                         LU.Kind, LU.AccessTy, TLI))
2819           continue;
2820         NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
2821
2822         // If the new scale is a constant in a register, and adding the constant
2823         // value to the immediate would produce a value closer to zero than the
2824         // immediate itself, then the formula isn't worthwhile.
2825         if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
2826           if (C->getValue()->isNegative() !=
2827                 (NewF.AM.BaseOffs < 0) &&
2828               (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale))
2829                 .ule(abs64(NewF.AM.BaseOffs)))
2830             continue;
2831
2832         // OK, looks good.
2833         (void)InsertFormula(LU, LUIdx, NewF);
2834       } else {
2835         // Use the immediate in a base register.
2836         for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
2837           const SCEV *BaseReg = F.BaseRegs[N];
2838           if (BaseReg != OrigReg)
2839             continue;
2840           Formula NewF = F;
2841           NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
2842           if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
2843                           LU.Kind, LU.AccessTy, TLI)) {
2844             if (!TLI ||
2845                 !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
2846               continue;
2847             NewF = F;
2848             NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
2849           }
2850           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
2851
2852           // If the new formula has a constant in a register, and adding the
2853           // constant value to the immediate would produce a value closer to
2854           // zero than the immediate itself, then the formula isn't worthwhile.
2855           for (SmallVectorImpl<const SCEV *>::const_iterator
2856                J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
2857                J != JE; ++J)
2858             if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
2859               if ((C->getValue()->getValue() + NewF.AM.BaseOffs).abs().slt(
2860                    abs64(NewF.AM.BaseOffs)) &&
2861                   (C->getValue()->getValue() +
2862                    NewF.AM.BaseOffs).countTrailingZeros() >=
2863                    CountTrailingZeros_64(NewF.AM.BaseOffs))
2864                 goto skip_formula;
2865
2866           // Ok, looks good.
2867           (void)InsertFormula(LU, LUIdx, NewF);
2868           break;
2869         skip_formula:;
2870         }
2871       }
2872     }
2873   }
2874 }
2875
2876 /// GenerateAllReuseFormulae - Generate formulae for each use.
2877 void
2878 LSRInstance::GenerateAllReuseFormulae() {
2879   // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
2880   // queries are more precise.
2881   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2882     LSRUse &LU = Uses[LUIdx];
2883     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2884       GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
2885     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2886       GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
2887   }
2888   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2889     LSRUse &LU = Uses[LUIdx];
2890     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2891       GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
2892     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2893       GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
2894     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2895       GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
2896     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2897       GenerateScales(LU, LUIdx, LU.Formulae[i]);
2898   }
2899   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2900     LSRUse &LU = Uses[LUIdx];
2901     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
2902       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
2903   }
2904
2905   GenerateCrossUseConstantOffsets();
2906
2907   DEBUG(dbgs() << "\n"
2908                   "After generating reuse formulae:\n";
2909         print_uses(dbgs()));
2910 }
2911
2912 /// If there are multiple formulae with the same set of registers used
2913 /// by other uses, pick the best one and delete the others.
2914 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
2915   DenseSet<const SCEV *> VisitedRegs;
2916   SmallPtrSet<const SCEV *, 16> Regs;
2917 #ifndef NDEBUG
2918   bool ChangedFormulae = false;
2919 #endif
2920
2921   // Collect the best formula for each unique set of shared registers. This
2922   // is reset for each use.
2923   typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo>
2924     BestFormulaeTy;
2925   BestFormulaeTy BestFormulae;
2926
2927   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2928     LSRUse &LU = Uses[LUIdx];
2929     DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
2930
2931     bool Any = false;
2932     for (size_t FIdx = 0, NumForms = LU.Formulae.size();
2933          FIdx != NumForms; ++FIdx) {
2934       Formula &F = LU.Formulae[FIdx];
2935
2936       SmallVector<const SCEV *, 2> Key;
2937       for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
2938            JE = F.BaseRegs.end(); J != JE; ++J) {
2939         const SCEV *Reg = *J;
2940         if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
2941           Key.push_back(Reg);
2942       }
2943       if (F.ScaledReg &&
2944           RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
2945         Key.push_back(F.ScaledReg);
2946       // Unstable sort by host order ok, because this is only used for
2947       // uniquifying.
2948       std::sort(Key.begin(), Key.end());
2949
2950       std::pair<BestFormulaeTy::const_iterator, bool> P =
2951         BestFormulae.insert(std::make_pair(Key, FIdx));
2952       if (!P.second) {
2953         Formula &Best = LU.Formulae[P.first->second];
2954
2955         Cost CostF;
2956         CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
2957         Regs.clear();
2958         Cost CostBest;
2959         CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
2960         Regs.clear();
2961         if (CostF < CostBest)
2962           std::swap(F, Best);
2963         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
2964               dbgs() << "\n"
2965                         "    in favor of formula "; Best.print(dbgs());
2966               dbgs() << '\n');
2967 #ifndef NDEBUG
2968         ChangedFormulae = true;
2969 #endif
2970         LU.DeleteFormula(F);
2971         --FIdx;
2972         --NumForms;
2973         Any = true;
2974         continue;
2975       }
2976     }
2977
2978     // Now that we've filtered out some formulae, recompute the Regs set.
2979     if (Any)
2980       LU.RecomputeRegs(LUIdx, RegUses);
2981
2982     // Reset this to prepare for the next use.
2983     BestFormulae.clear();
2984   }
2985
2986   DEBUG(if (ChangedFormulae) {
2987           dbgs() << "\n"
2988                     "After filtering out undesirable candidates:\n";
2989           print_uses(dbgs());
2990         });
2991 }
2992
2993 // This is a rough guess that seems to work fairly well.
2994 static const size_t ComplexityLimit = UINT16_MAX;
2995
2996 /// EstimateSearchSpaceComplexity - Estimate the worst-case number of
2997 /// solutions the solver might have to consider. It almost never considers
2998 /// this many solutions because it prune the search space, but the pruning
2999 /// isn't always sufficient.
3000 size_t LSRInstance::EstimateSearchSpaceComplexity() const {
3001   size_t Power = 1;
3002   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
3003        E = Uses.end(); I != E; ++I) {
3004     size_t FSize = I->Formulae.size();
3005     if (FSize >= ComplexityLimit) {
3006       Power = ComplexityLimit;
3007       break;
3008     }
3009     Power *= FSize;
3010     if (Power >= ComplexityLimit)
3011       break;
3012   }
3013   return Power;
3014 }
3015
3016 /// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset
3017 /// of the registers of another formula, it won't help reduce register
3018 /// pressure (though it may not necessarily hurt register pressure); remove
3019 /// it to simplify the system.
3020 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
3021   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3022     DEBUG(dbgs() << "The search space is too complex.\n");
3023
3024     DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
3025                     "which use a superset of registers used by other "
3026                     "formulae.\n");
3027
3028     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3029       LSRUse &LU = Uses[LUIdx];
3030       bool Any = false;
3031       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
3032         Formula &F = LU.Formulae[i];
3033         // Look for a formula with a constant or GV in a register. If the use
3034         // also has a formula with that same value in an immediate field,
3035         // delete the one that uses a register.
3036         for (SmallVectorImpl<const SCEV *>::const_iterator
3037              I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
3038           if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
3039             Formula NewF = F;
3040             NewF.AM.BaseOffs += C->getValue()->getSExtValue();
3041             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
3042                                 (I - F.BaseRegs.begin()));
3043             if (LU.HasFormulaWithSameRegs(NewF)) {
3044               DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
3045               LU.DeleteFormula(F);
3046               --i;
3047               --e;
3048               Any = true;
3049               break;
3050             }
3051           } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
3052             if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
3053               if (!F.AM.BaseGV) {
3054                 Formula NewF = F;
3055                 NewF.AM.BaseGV = GV;
3056                 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
3057                                     (I - F.BaseRegs.begin()));
3058                 if (LU.HasFormulaWithSameRegs(NewF)) {
3059                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
3060                         dbgs() << '\n');
3061                   LU.DeleteFormula(F);
3062                   --i;
3063                   --e;
3064                   Any = true;
3065                   break;
3066                 }
3067               }
3068           }
3069         }
3070       }
3071       if (Any)
3072         LU.RecomputeRegs(LUIdx, RegUses);
3073     }
3074
3075     DEBUG(dbgs() << "After pre-selection:\n";
3076           print_uses(dbgs()));
3077   }
3078 }
3079
3080 /// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers
3081 /// for expressions like A, A+1, A+2, etc., allocate a single register for
3082 /// them.
3083 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
3084   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3085     DEBUG(dbgs() << "The search space is too complex.\n");
3086
3087     DEBUG(dbgs() << "Narrowing the search space by assuming that uses "
3088                     "separated by a constant offset will use the same "
3089                     "registers.\n");
3090
3091     // This is especially useful for unrolled loops.
3092
3093     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3094       LSRUse &LU = Uses[LUIdx];
3095       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
3096            E = LU.Formulae.end(); I != E; ++I) {
3097         const Formula &F = *I;
3098         if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) {
3099           if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) {
3100             if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs,
3101                                    /*HasBaseReg=*/false,
3102                                    LU.Kind, LU.AccessTy)) {
3103               DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs());
3104                     dbgs() << '\n');
3105
3106               LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
3107
3108               // Update the relocs to reference the new use.
3109               for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(),
3110                    E = Fixups.end(); I != E; ++I) {
3111                 LSRFixup &Fixup = *I;
3112                 if (Fixup.LUIdx == LUIdx) {
3113                   Fixup.LUIdx = LUThatHas - &Uses.front();
3114                   Fixup.Offset += F.AM.BaseOffs;
3115                   // Add the new offset to LUThatHas' offset list.
3116                   if (LUThatHas->Offsets.back() != Fixup.Offset) {
3117                     LUThatHas->Offsets.push_back(Fixup.Offset);
3118                     if (Fixup.Offset > LUThatHas->MaxOffset)
3119                       LUThatHas->MaxOffset = Fixup.Offset;
3120                     if (Fixup.Offset < LUThatHas->MinOffset)
3121                       LUThatHas->MinOffset = Fixup.Offset;
3122                   }
3123                   DEBUG(dbgs() << "New fixup has offset "
3124                                << Fixup.Offset << '\n');
3125                 }
3126                 if (Fixup.LUIdx == NumUses-1)
3127                   Fixup.LUIdx = LUIdx;
3128               }
3129
3130               // Delete formulae from the new use which are no longer legal.
3131               bool Any = false;
3132               for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
3133                 Formula &F = LUThatHas->Formulae[i];
3134                 if (!isLegalUse(F.AM,
3135                                 LUThatHas->MinOffset, LUThatHas->MaxOffset,
3136                                 LUThatHas->Kind, LUThatHas->AccessTy, TLI)) {
3137                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
3138                         dbgs() << '\n');
3139                   LUThatHas->DeleteFormula(F);
3140                   --i;
3141                   --e;
3142                   Any = true;
3143                 }
3144               }
3145               if (Any)
3146                 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
3147
3148               // Delete the old use.
3149               DeleteUse(LU, LUIdx);
3150               --LUIdx;
3151               --NumUses;
3152               break;
3153             }
3154           }
3155         }
3156       }
3157     }
3158
3159     DEBUG(dbgs() << "After pre-selection:\n";
3160           print_uses(dbgs()));
3161   }
3162 }
3163
3164 /// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
3165 /// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
3166 /// we've done more filtering, as it may be able to find more formulae to
3167 /// eliminate.
3168 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
3169   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3170     DEBUG(dbgs() << "The search space is too complex.\n");
3171
3172     DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
3173                     "undesirable dedicated registers.\n");
3174
3175     FilterOutUndesirableDedicatedRegisters();
3176
3177     DEBUG(dbgs() << "After pre-selection:\n";
3178           print_uses(dbgs()));
3179   }
3180 }
3181
3182 /// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
3183 /// to be profitable, and then in any use which has any reference to that
3184 /// register, delete all formulae which do not reference that register.
3185 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
3186   // With all other options exhausted, loop until the system is simple
3187   // enough to handle.
3188   SmallPtrSet<const SCEV *, 4> Taken;
3189   while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
3190     // Ok, we have too many of formulae on our hands to conveniently handle.
3191     // Use a rough heuristic to thin out the list.
3192     DEBUG(dbgs() << "The search space is too complex.\n");
3193
3194     // Pick the register which is used by the most LSRUses, which is likely
3195     // to be a good reuse register candidate.
3196     const SCEV *Best = 0;
3197     unsigned BestNum = 0;
3198     for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
3199          I != E; ++I) {
3200       const SCEV *Reg = *I;
3201       if (Taken.count(Reg))
3202         continue;
3203       if (!Best)
3204         Best = Reg;
3205       else {
3206         unsigned Count = RegUses.getUsedByIndices(Reg).count();
3207         if (Count > BestNum) {
3208           Best = Reg;
3209           BestNum = Count;
3210         }
3211       }
3212     }
3213
3214     DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
3215                  << " will yield profitable reuse.\n");
3216     Taken.insert(Best);
3217
3218     // In any use with formulae which references this register, delete formulae
3219     // which don't reference it.
3220     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3221       LSRUse &LU = Uses[LUIdx];
3222       if (!LU.Regs.count(Best)) continue;
3223
3224       bool Any = false;
3225       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
3226         Formula &F = LU.Formulae[i];
3227         if (!F.referencesReg(Best)) {
3228           DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
3229           LU.DeleteFormula(F);
3230           --e;
3231           --i;
3232           Any = true;
3233           assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
3234           continue;
3235         }
3236       }
3237
3238       if (Any)
3239         LU.RecomputeRegs(LUIdx, RegUses);
3240     }
3241
3242     DEBUG(dbgs() << "After pre-selection:\n";
3243           print_uses(dbgs()));
3244   }
3245 }
3246
3247 /// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
3248 /// formulae to choose from, use some rough heuristics to prune down the number
3249 /// of formulae. This keeps the main solver from taking an extraordinary amount
3250 /// of time in some worst-case scenarios.
3251 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
3252   NarrowSearchSpaceByDetectingSupersets();
3253   NarrowSearchSpaceByCollapsingUnrolledCode();
3254   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
3255   NarrowSearchSpaceByPickingWinnerRegs();
3256 }
3257
3258 /// SolveRecurse - This is the recursive solver.
3259 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
3260                                Cost &SolutionCost,
3261                                SmallVectorImpl<const Formula *> &Workspace,
3262                                const Cost &CurCost,
3263                                const SmallPtrSet<const SCEV *, 16> &CurRegs,
3264                                DenseSet<const SCEV *> &VisitedRegs) const {
3265   // Some ideas:
3266   //  - prune more:
3267   //    - use more aggressive filtering
3268   //    - sort the formula so that the most profitable solutions are found first
3269   //    - sort the uses too
3270   //  - search faster:
3271   //    - don't compute a cost, and then compare. compare while computing a cost
3272   //      and bail early.
3273   //    - track register sets with SmallBitVector
3274
3275   const LSRUse &LU = Uses[Workspace.size()];
3276
3277   // If this use references any register that's already a part of the
3278   // in-progress solution, consider it a requirement that a formula must
3279   // reference that register in order to be considered. This prunes out
3280   // unprofitable searching.
3281   SmallSetVector<const SCEV *, 4> ReqRegs;
3282   for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
3283        E = CurRegs.end(); I != E; ++I)
3284     if (LU.Regs.count(*I))
3285       ReqRegs.insert(*I);
3286
3287   bool AnySatisfiedReqRegs = false;
3288   SmallPtrSet<const SCEV *, 16> NewRegs;
3289   Cost NewCost;
3290 retry:
3291   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
3292        E = LU.Formulae.end(); I != E; ++I) {
3293     const Formula &F = *I;
3294
3295     // Ignore formulae which do not use any of the required registers.
3296     for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
3297          JE = ReqRegs.end(); J != JE; ++J) {
3298       const SCEV *Reg = *J;
3299       if ((!F.ScaledReg || F.ScaledReg != Reg) &&
3300           std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
3301           F.BaseRegs.end())
3302         goto skip;
3303     }
3304     AnySatisfiedReqRegs = true;
3305
3306     // Evaluate the cost of the current formula. If it's already worse than
3307     // the current best, prune the search at that point.
3308     NewCost = CurCost;
3309     NewRegs = CurRegs;
3310     NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
3311     if (NewCost < SolutionCost) {
3312       Workspace.push_back(&F);
3313       if (Workspace.size() != Uses.size()) {
3314         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
3315                      NewRegs, VisitedRegs);
3316         if (F.getNumRegs() == 1 && Workspace.size() == 1)
3317           VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
3318       } else {
3319         DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
3320               dbgs() << ". Regs:";
3321               for (SmallPtrSet<const SCEV *, 16>::const_iterator
3322                    I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
3323                 dbgs() << ' ' << **I;
3324               dbgs() << '\n');
3325
3326         SolutionCost = NewCost;
3327         Solution = Workspace;
3328       }
3329       Workspace.pop_back();
3330     }
3331   skip:;
3332   }
3333
3334   if (!EnableRetry && !AnySatisfiedReqRegs)
3335     return;
3336
3337   // If none of the formulae had all of the required registers, relax the
3338   // constraint so that we don't exclude all formulae.
3339   if (!AnySatisfiedReqRegs) {
3340     assert(!ReqRegs.empty() && "Solver failed even without required registers");
3341     ReqRegs.clear();
3342     goto retry;
3343   }
3344 }
3345
3346 /// Solve - Choose one formula from each use. Return the results in the given
3347 /// Solution vector.
3348 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
3349   SmallVector<const Formula *, 8> Workspace;
3350   Cost SolutionCost;
3351   SolutionCost.Loose();
3352   Cost CurCost;
3353   SmallPtrSet<const SCEV *, 16> CurRegs;
3354   DenseSet<const SCEV *> VisitedRegs;
3355   Workspace.reserve(Uses.size());
3356
3357   // SolveRecurse does all the work.
3358   SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
3359                CurRegs, VisitedRegs);
3360   if (Solution.empty()) {
3361     DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
3362     return;
3363   }
3364
3365   // Ok, we've now made all our decisions.
3366   DEBUG(dbgs() << "\n"
3367                   "The chosen solution requires "; SolutionCost.print(dbgs());
3368         dbgs() << ":\n";
3369         for (size_t i = 0, e = Uses.size(); i != e; ++i) {
3370           dbgs() << "  ";
3371           Uses[i].print(dbgs());
3372           dbgs() << "\n"
3373                     "    ";
3374           Solution[i]->print(dbgs());
3375           dbgs() << '\n';
3376         });
3377
3378   assert(Solution.size() == Uses.size() && "Malformed solution!");
3379 }
3380
3381 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
3382 /// the dominator tree far as we can go while still being dominated by the
3383 /// input positions. This helps canonicalize the insert position, which
3384 /// encourages sharing.
3385 BasicBlock::iterator
3386 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
3387                                  const SmallVectorImpl<Instruction *> &Inputs)
3388                                                                          const {
3389   for (;;) {
3390     const Loop *IPLoop = LI.getLoopFor(IP->getParent());
3391     unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
3392
3393     BasicBlock *IDom;
3394     for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
3395       if (!Rung) return IP;
3396       Rung = Rung->getIDom();
3397       if (!Rung) return IP;
3398       IDom = Rung->getBlock();
3399
3400       // Don't climb into a loop though.
3401       const Loop *IDomLoop = LI.getLoopFor(IDom);
3402       unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
3403       if (IDomDepth <= IPLoopDepth &&
3404           (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
3405         break;
3406     }
3407
3408     bool AllDominate = true;
3409     Instruction *BetterPos = 0;
3410     Instruction *Tentative = IDom->getTerminator();
3411     for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
3412          E = Inputs.end(); I != E; ++I) {
3413       Instruction *Inst = *I;
3414       if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
3415         AllDominate = false;
3416         break;
3417       }
3418       // Attempt to find an insert position in the middle of the block,
3419       // instead of at the end, so that it can be used for other expansions.
3420       if (IDom == Inst->getParent() &&
3421           (!BetterPos || DT.dominates(BetterPos, Inst)))
3422         BetterPos = llvm::next(BasicBlock::iterator(Inst));
3423     }
3424     if (!AllDominate)
3425       break;
3426     if (BetterPos)
3427       IP = BetterPos;
3428     else
3429       IP = Tentative;
3430   }
3431
3432   return IP;
3433 }
3434
3435 /// AdjustInsertPositionForExpand - Determine an input position which will be
3436 /// dominated by the operands and which will dominate the result.
3437 BasicBlock::iterator
3438 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP,
3439                                            const LSRFixup &LF,
3440                                            const LSRUse &LU) const {
3441   // Collect some instructions which must be dominated by the
3442   // expanding replacement. These must be dominated by any operands that
3443   // will be required in the expansion.
3444   SmallVector<Instruction *, 4> Inputs;
3445   if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
3446     Inputs.push_back(I);
3447   if (LU.Kind == LSRUse::ICmpZero)
3448     if (Instruction *I =
3449           dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
3450       Inputs.push_back(I);
3451   if (LF.PostIncLoops.count(L)) {
3452     if (LF.isUseFullyOutsideLoop(L))
3453       Inputs.push_back(L->getLoopLatch()->getTerminator());
3454     else
3455       Inputs.push_back(IVIncInsertPos);
3456   }
3457   // The expansion must also be dominated by the increment positions of any
3458   // loops it for which it is using post-inc mode.
3459   for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
3460        E = LF.PostIncLoops.end(); I != E; ++I) {
3461     const Loop *PIL = *I;
3462     if (PIL == L) continue;
3463
3464     // Be dominated by the loop exit.
3465     SmallVector<BasicBlock *, 4> ExitingBlocks;
3466     PIL->getExitingBlocks(ExitingBlocks);
3467     if (!ExitingBlocks.empty()) {
3468       BasicBlock *BB = ExitingBlocks[0];
3469       for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
3470         BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
3471       Inputs.push_back(BB->getTerminator());
3472     }
3473   }
3474
3475   // Then, climb up the immediate dominator tree as far as we can go while
3476   // still being dominated by the input positions.
3477   IP = HoistInsertPosition(IP, Inputs);
3478
3479   // Don't insert instructions before PHI nodes.
3480   while (isa<PHINode>(IP)) ++IP;
3481
3482   // Ignore landingpad instructions.
3483   while (isa<LandingPadInst>(IP)) ++IP;
3484
3485   // Ignore debug intrinsics.
3486   while (isa<DbgInfoIntrinsic>(IP)) ++IP;
3487
3488   return IP;
3489 }
3490
3491 /// Expand - Emit instructions for the leading candidate expression for this
3492 /// LSRUse (this is called "expanding").
3493 Value *LSRInstance::Expand(const LSRFixup &LF,
3494                            const Formula &F,
3495                            BasicBlock::iterator IP,
3496                            SCEVExpander &Rewriter,
3497                            SmallVectorImpl<WeakVH> &DeadInsts) const {
3498   const LSRUse &LU = Uses[LF.LUIdx];
3499
3500   // Determine an input position which will be dominated by the operands and
3501   // which will dominate the result.
3502   IP = AdjustInsertPositionForExpand(IP, LF, LU);
3503
3504   // Inform the Rewriter if we have a post-increment use, so that it can
3505   // perform an advantageous expansion.
3506   Rewriter.setPostInc(LF.PostIncLoops);
3507
3508   // This is the type that the user actually needs.
3509   Type *OpTy = LF.OperandValToReplace->getType();
3510   // This will be the type that we'll initially expand to.
3511   Type *Ty = F.getType();
3512   if (!Ty)
3513     // No type known; just expand directly to the ultimate type.
3514     Ty = OpTy;
3515   else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
3516     // Expand directly to the ultimate type if it's the right size.
3517     Ty = OpTy;
3518   // This is the type to do integer arithmetic in.
3519   Type *IntTy = SE.getEffectiveSCEVType(Ty);
3520
3521   // Build up a list of operands to add together to form the full base.
3522   SmallVector<const SCEV *, 8> Ops;
3523
3524   // Expand the BaseRegs portion.
3525   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
3526        E = F.BaseRegs.end(); I != E; ++I) {
3527     const SCEV *Reg = *I;
3528     assert(!Reg->isZero() && "Zero allocated in a base register!");
3529
3530     // If we're expanding for a post-inc user, make the post-inc adjustment.
3531     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
3532     Reg = TransformForPostIncUse(Denormalize, Reg,
3533                                  LF.UserInst, LF.OperandValToReplace,
3534                                  Loops, SE, DT);
3535
3536     Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
3537   }
3538
3539   // Flush the operand list to suppress SCEVExpander hoisting.
3540   if (!Ops.empty()) {
3541     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
3542     Ops.clear();
3543     Ops.push_back(SE.getUnknown(FullV));
3544   }
3545
3546   // Expand the ScaledReg portion.
3547   Value *ICmpScaledV = 0;
3548   if (F.AM.Scale != 0) {
3549     const SCEV *ScaledS = F.ScaledReg;
3550
3551     // If we're expanding for a post-inc user, make the post-inc adjustment.
3552     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
3553     ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
3554                                      LF.UserInst, LF.OperandValToReplace,
3555                                      Loops, SE, DT);
3556
3557     if (LU.Kind == LSRUse::ICmpZero) {
3558       // An interesting way of "folding" with an icmp is to use a negated
3559       // scale, which we'll implement by inserting it into the other operand
3560       // of the icmp.
3561       assert(F.AM.Scale == -1 &&
3562              "The only scale supported by ICmpZero uses is -1!");
3563       ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
3564     } else {
3565       // Otherwise just expand the scaled register and an explicit scale,
3566       // which is expected to be matched as part of the address.
3567       ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
3568       ScaledS = SE.getMulExpr(ScaledS,
3569                               SE.getConstant(ScaledS->getType(), F.AM.Scale));
3570       Ops.push_back(ScaledS);
3571
3572       // Flush the operand list to suppress SCEVExpander hoisting.
3573       Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
3574       Ops.clear();
3575       Ops.push_back(SE.getUnknown(FullV));
3576     }
3577   }
3578
3579   // Expand the GV portion.
3580   if (F.AM.BaseGV) {
3581     Ops.push_back(SE.getUnknown(F.AM.BaseGV));
3582
3583     // Flush the operand list to suppress SCEVExpander hoisting.
3584     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
3585     Ops.clear();
3586     Ops.push_back(SE.getUnknown(FullV));
3587   }
3588
3589   // Expand the immediate portion.
3590   int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset;
3591   if (Offset != 0) {
3592     if (LU.Kind == LSRUse::ICmpZero) {
3593       // The other interesting way of "folding" with an ICmpZero is to use a
3594       // negated immediate.
3595       if (!ICmpScaledV)
3596         ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
3597       else {
3598         Ops.push_back(SE.getUnknown(ICmpScaledV));
3599         ICmpScaledV = ConstantInt::get(IntTy, Offset);
3600       }
3601     } else {
3602       // Just add the immediate values. These again are expected to be matched
3603       // as part of the address.
3604       Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
3605     }
3606   }
3607
3608   // Expand the unfolded offset portion.
3609   int64_t UnfoldedOffset = F.UnfoldedOffset;
3610   if (UnfoldedOffset != 0) {
3611     // Just add the immediate values.
3612     Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
3613                                                        UnfoldedOffset)));
3614   }
3615
3616   // Emit instructions summing all the operands.
3617   const SCEV *FullS = Ops.empty() ?
3618                       SE.getConstant(IntTy, 0) :
3619                       SE.getAddExpr(Ops);
3620   Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
3621
3622   // We're done expanding now, so reset the rewriter.
3623   Rewriter.clearPostInc();
3624
3625   // An ICmpZero Formula represents an ICmp which we're handling as a
3626   // comparison against zero. Now that we've expanded an expression for that
3627   // form, update the ICmp's other operand.
3628   if (LU.Kind == LSRUse::ICmpZero) {
3629     ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
3630     DeadInsts.push_back(CI->getOperand(1));
3631     assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
3632                            "a scale at the same time!");
3633     if (F.AM.Scale == -1) {
3634       if (ICmpScaledV->getType() != OpTy) {
3635         Instruction *Cast =
3636           CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
3637                                                    OpTy, false),
3638                            ICmpScaledV, OpTy, "tmp", CI);
3639         ICmpScaledV = Cast;
3640       }
3641       CI->setOperand(1, ICmpScaledV);
3642     } else {
3643       assert(F.AM.Scale == 0 &&
3644              "ICmp does not support folding a global value and "
3645              "a scale at the same time!");
3646       Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
3647                                            -(uint64_t)Offset);
3648       if (C->getType() != OpTy)
3649         C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
3650                                                           OpTy, false),
3651                                   C, OpTy);
3652
3653       CI->setOperand(1, C);
3654     }
3655   }
3656
3657   return FullV;
3658 }
3659
3660 /// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
3661 /// of their operands effectively happens in their predecessor blocks, so the
3662 /// expression may need to be expanded in multiple places.
3663 void LSRInstance::RewriteForPHI(PHINode *PN,
3664                                 const LSRFixup &LF,
3665                                 const Formula &F,
3666                                 SCEVExpander &Rewriter,
3667                                 SmallVectorImpl<WeakVH> &DeadInsts,
3668                                 Pass *P) const {
3669   DenseMap<BasicBlock *, Value *> Inserted;
3670   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
3671     if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
3672       BasicBlock *BB = PN->getIncomingBlock(i);
3673
3674       // If this is a critical edge, split the edge so that we do not insert
3675       // the code on all predecessor/successor paths.  We do this unless this
3676       // is the canonical backedge for this loop, which complicates post-inc
3677       // users.
3678       if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
3679           !isa<IndirectBrInst>(BB->getTerminator())) {
3680         BasicBlock *Parent = PN->getParent();
3681         Loop *PNLoop = LI.getLoopFor(Parent);
3682         if (!PNLoop || Parent != PNLoop->getHeader()) {
3683           // Split the critical edge.
3684           BasicBlock *NewBB = 0;
3685           if (!Parent->isLandingPad()) {
3686             NewBB = SplitCriticalEdge(BB, Parent, P,
3687                                       /*MergeIdenticalEdges=*/true,
3688                                       /*DontDeleteUselessPhis=*/true);
3689           } else {
3690             SmallVector<BasicBlock*, 2> NewBBs;
3691             SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
3692             NewBB = NewBBs[0];
3693           }
3694
3695           // If PN is outside of the loop and BB is in the loop, we want to
3696           // move the block to be immediately before the PHI block, not
3697           // immediately after BB.
3698           if (L->contains(BB) && !L->contains(PN))
3699             NewBB->moveBefore(PN->getParent());
3700
3701           // Splitting the edge can reduce the number of PHI entries we have.
3702           e = PN->getNumIncomingValues();
3703           BB = NewBB;
3704           i = PN->getBasicBlockIndex(BB);
3705         }
3706       }
3707
3708       std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
3709         Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
3710       if (!Pair.second)
3711         PN->setIncomingValue(i, Pair.first->second);
3712       else {
3713         Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
3714
3715         // If this is reuse-by-noop-cast, insert the noop cast.
3716         Type *OpTy = LF.OperandValToReplace->getType();
3717         if (FullV->getType() != OpTy)
3718           FullV =
3719             CastInst::Create(CastInst::getCastOpcode(FullV, false,
3720                                                      OpTy, false),
3721                              FullV, LF.OperandValToReplace->getType(),
3722                              "tmp", BB->getTerminator());
3723
3724         PN->setIncomingValue(i, FullV);
3725         Pair.first->second = FullV;
3726       }
3727     }
3728 }
3729
3730 /// Rewrite - Emit instructions for the leading candidate expression for this
3731 /// LSRUse (this is called "expanding"), and update the UserInst to reference
3732 /// the newly expanded value.
3733 void LSRInstance::Rewrite(const LSRFixup &LF,
3734                           const Formula &F,
3735                           SCEVExpander &Rewriter,
3736                           SmallVectorImpl<WeakVH> &DeadInsts,
3737                           Pass *P) const {
3738   // First, find an insertion point that dominates UserInst. For PHI nodes,
3739   // find the nearest block which dominates all the relevant uses.
3740   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
3741     RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
3742   } else {
3743     Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
3744
3745     // If this is reuse-by-noop-cast, insert the noop cast.
3746     Type *OpTy = LF.OperandValToReplace->getType();
3747     if (FullV->getType() != OpTy) {
3748       Instruction *Cast =
3749         CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
3750                          FullV, OpTy, "tmp", LF.UserInst);
3751       FullV = Cast;
3752     }
3753
3754     // Update the user. ICmpZero is handled specially here (for now) because
3755     // Expand may have updated one of the operands of the icmp already, and
3756     // its new value may happen to be equal to LF.OperandValToReplace, in
3757     // which case doing replaceUsesOfWith leads to replacing both operands
3758     // with the same value. TODO: Reorganize this.
3759     if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
3760       LF.UserInst->setOperand(0, FullV);
3761     else
3762       LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
3763   }
3764
3765   DeadInsts.push_back(LF.OperandValToReplace);
3766 }
3767
3768 /// ImplementSolution - Rewrite all the fixup locations with new values,
3769 /// following the chosen solution.
3770 void
3771 LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
3772                                Pass *P) {
3773   // Keep track of instructions we may have made dead, so that
3774   // we can remove them after we are done working.
3775   SmallVector<WeakVH, 16> DeadInsts;
3776
3777   SCEVExpander Rewriter(SE, "lsr");
3778   Rewriter.disableCanonicalMode();
3779   Rewriter.enableLSRMode();
3780   Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
3781
3782   // Expand the new value definitions and update the users.
3783   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
3784        E = Fixups.end(); I != E; ++I) {
3785     const LSRFixup &Fixup = *I;
3786
3787     Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P);
3788
3789     Changed = true;
3790   }
3791
3792   // Clean up after ourselves. This must be done before deleting any
3793   // instructions.
3794   Rewriter.clear();
3795
3796   Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
3797 }
3798
3799 LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
3800   : IU(P->getAnalysis<IVUsers>()),
3801     SE(P->getAnalysis<ScalarEvolution>()),
3802     DT(P->getAnalysis<DominatorTree>()),
3803     LI(P->getAnalysis<LoopInfo>()),
3804     TLI(tli), L(l), Changed(false), IVIncInsertPos(0) {
3805
3806   // If LoopSimplify form is not available, stay out of trouble.
3807   if (!L->isLoopSimplifyForm()) return;
3808
3809   // If there's no interesting work to be done, bail early.
3810   if (IU.empty()) return;
3811
3812   DEBUG(dbgs() << "\nLSR on loop ";
3813         WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
3814         dbgs() << ":\n");
3815
3816   // First, perform some low-level loop optimizations.
3817   OptimizeShadowIV();
3818   OptimizeLoopTermCond();
3819
3820   // If loop preparation eliminates all interesting IV users, bail.
3821   if (IU.empty()) return;
3822
3823   // Skip nested loops until we can model them better with formulae.
3824   if (!EnableNested && !L->empty()) {
3825
3826     if (EnablePhiElim) {
3827       // Remove any extra phis created by processing inner loops.
3828       SmallVector<WeakVH, 16> DeadInsts;
3829       SCEVExpander Rewriter(SE, "lsr");
3830       Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts);
3831       Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
3832     }
3833     DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
3834     return;
3835   }
3836
3837   // Start collecting data and preparing for the solver.
3838   CollectInterestingTypesAndFactors();
3839   CollectFixupsAndInitialFormulae();
3840   CollectLoopInvariantFixupsAndFormulae();
3841
3842   DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
3843         print_uses(dbgs()));
3844
3845   // Now use the reuse data to generate a bunch of interesting ways
3846   // to formulate the values needed for the uses.
3847   GenerateAllReuseFormulae();
3848
3849   FilterOutUndesirableDedicatedRegisters();
3850   NarrowSearchSpaceUsingHeuristics();
3851
3852   SmallVector<const Formula *, 8> Solution;
3853   Solve(Solution);
3854
3855   // Release memory that is no longer needed.
3856   Factors.clear();
3857   Types.clear();
3858   RegUses.clear();
3859
3860   if (Solution.empty())
3861     return;
3862
3863 #ifndef NDEBUG
3864   // Formulae should be legal.
3865   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
3866        E = Uses.end(); I != E; ++I) {
3867      const LSRUse &LU = *I;
3868      for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
3869           JE = LU.Formulae.end(); J != JE; ++J)
3870         assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset,
3871                           LU.Kind, LU.AccessTy, TLI) &&
3872                "Illegal formula generated!");
3873   };
3874 #endif
3875
3876   // Now that we've decided what we want, make it so.
3877   ImplementSolution(Solution, P);
3878
3879   if (EnablePhiElim) {
3880     // Remove any extra phis created by processing inner loops.
3881     SmallVector<WeakVH, 16> DeadInsts;
3882     SCEVExpander Rewriter(SE, "lsr");
3883     Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts);
3884     Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
3885   }
3886 }
3887
3888 void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
3889   if (Factors.empty() && Types.empty()) return;
3890
3891   OS << "LSR has identified the following interesting factors and types: ";
3892   bool First = true;
3893
3894   for (SmallSetVector<int64_t, 8>::const_iterator
3895        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
3896     if (!First) OS << ", ";
3897     First = false;
3898     OS << '*' << *I;
3899   }
3900
3901   for (SmallSetVector<Type *, 4>::const_iterator
3902        I = Types.begin(), E = Types.end(); I != E; ++I) {
3903     if (!First) OS << ", ";
3904     First = false;
3905     OS << '(' << **I << ')';
3906   }
3907   OS << '\n';
3908 }
3909
3910 void LSRInstance::print_fixups(raw_ostream &OS) const {
3911   OS << "LSR is examining the following fixup sites:\n";
3912   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
3913        E = Fixups.end(); I != E; ++I) {
3914     dbgs() << "  ";
3915     I->print(OS);
3916     OS << '\n';
3917   }
3918 }
3919
3920 void LSRInstance::print_uses(raw_ostream &OS) const {
3921   OS << "LSR is examining the following uses:\n";
3922   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
3923        E = Uses.end(); I != E; ++I) {
3924     const LSRUse &LU = *I;
3925     dbgs() << "  ";
3926     LU.print(OS);
3927     OS << '\n';
3928     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
3929          JE = LU.Formulae.end(); J != JE; ++J) {
3930       OS << "    ";
3931       J->print(OS);
3932       OS << '\n';
3933     }
3934   }
3935 }
3936
3937 void LSRInstance::print(raw_ostream &OS) const {
3938   print_factors_and_types(OS);
3939   print_fixups(OS);
3940   print_uses(OS);
3941 }
3942
3943 void LSRInstance::dump() const {
3944   print(errs()); errs() << '\n';
3945 }
3946
3947 namespace {
3948
3949 class LoopStrengthReduce : public LoopPass {
3950   /// TLI - Keep a pointer of a TargetLowering to consult for determining
3951   /// transformation profitability.
3952   const TargetLowering *const TLI;
3953
3954 public:
3955   static char ID; // Pass ID, replacement for typeid
3956   explicit LoopStrengthReduce(const TargetLowering *tli = 0);
3957
3958 private:
3959   bool runOnLoop(Loop *L, LPPassManager &LPM);
3960   void getAnalysisUsage(AnalysisUsage &AU) const;
3961 };
3962
3963 }
3964
3965 char LoopStrengthReduce::ID = 0;
3966 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
3967                 "Loop Strength Reduction", false, false)
3968 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
3969 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
3970 INITIALIZE_PASS_DEPENDENCY(IVUsers)
3971 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
3972 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
3973 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
3974                 "Loop Strength Reduction", false, false)
3975
3976
3977 Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
3978   return new LoopStrengthReduce(TLI);
3979 }
3980
3981 LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
3982   : LoopPass(ID), TLI(tli) {
3983     initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
3984   }
3985
3986 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
3987   // We split critical edges, so we change the CFG.  However, we do update
3988   // many analyses if they are around.
3989   AU.addPreservedID(LoopSimplifyID);
3990
3991   AU.addRequired<LoopInfo>();
3992   AU.addPreserved<LoopInfo>();
3993   AU.addRequiredID(LoopSimplifyID);
3994   AU.addRequired<DominatorTree>();
3995   AU.addPreserved<DominatorTree>();
3996   AU.addRequired<ScalarEvolution>();
3997   AU.addPreserved<ScalarEvolution>();
3998   // Requiring LoopSimplify a second time here prevents IVUsers from running
3999   // twice, since LoopSimplify was invalidated by running ScalarEvolution.
4000   AU.addRequiredID(LoopSimplifyID);
4001   AU.addRequired<IVUsers>();
4002   AU.addPreserved<IVUsers>();
4003 }
4004
4005 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
4006   bool Changed = false;
4007
4008   // Run the main LSR transformation.
4009   Changed |= LSRInstance(TLI, L, this).getChanged();
4010
4011   // At this point, it is worth checking to see if any recurrence PHIs are also
4012   // dead, so that we can remove them as well.
4013   Changed |= DeleteDeadPHIs(L->getHeader());
4014
4015   return Changed;
4016 }