llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

   1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
  10 // and generates target-independent LLVM-IR.
  11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
  12 // of instructions in order to estimate the profitability of vectorization.
  13 //
  14 // The loop vectorizer combines consecutive loop iterations into a single
  15 // 'wide' iteration. After this transformation the index is incremented
  16 // by the SIMD vector width, and not by one.
  17 //
  18 // This pass has four parts:
  19 // 1. The main loop pass that drives the different parts.
  20 // 2. LoopVectorizationLegality - A unit that checks for the legality
  21 //    of the vectorization.
  22 // 3. InnerLoopVectorizer - A unit that performs the actual
  23 //    widening of instructions.
  24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
  25 //    of vectorization. It decides on the optimal vector width, which
  26 //    can be one, if vectorization is not profitable.
  27 //
  28 // There is a development effort going on to migrate loop vectorizer to the
  29 // VPlan infrastructure and to introduce outer loop vectorization support (see
  30 // docs/Proposal/VectorizationPlan.rst and
  31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
  32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
  33 // alternative vectorization path that is natively implemented on top of the
  34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
  35 //
  36 //===----------------------------------------------------------------------===//
  37 //
  38 // The reduction-variable vectorization is based on the paper:
  39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
  40 //
  41 // Variable uniformity checks are inspired by:
  42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
  43 //
  44 // The interleaved access vectorization is based on the paper:
  45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
  46 //  Data for SIMD
  47 //
  48 // Other ideas/concepts are from:
  49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
  50 //
  51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
  52 //  Vectorizing Compilers.
  53 //
  54 //===----------------------------------------------------------------------===//
  55
  56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
  57 #include "LoopVectorizationPlanner.h"
  58 #include "VPRecipeBuilder.h"
  59 #include "VPlan.h"
  60 #include "VPlanHCFGBuilder.h"
  61 #include "VPlanPredicator.h"
  62 #include "VPlanTransforms.h"
  63 #include "llvm/ADT/APInt.h"
  64 #include "llvm/ADT/ArrayRef.h"
  65 #include "llvm/ADT/DenseMap.h"
  66 #include "llvm/ADT/DenseMapInfo.h"
  67 #include "llvm/ADT/Hashing.h"
  68 #include "llvm/ADT/MapVector.h"
  69 #include "llvm/ADT/None.h"
  70 #include "llvm/ADT/Optional.h"
  71 #include "llvm/ADT/STLExtras.h"
  72 #include "llvm/ADT/SetVector.h"
  73 #include "llvm/ADT/SmallPtrSet.h"
  74 #include "llvm/ADT/SmallVector.h"
  75 #include "llvm/ADT/Statistic.h"
  76 #include "llvm/ADT/StringRef.h"
  77 #include "llvm/ADT/Twine.h"
  78 #include "llvm/ADT/iterator_range.h"
  79 #include "llvm/Analysis/AssumptionCache.h"
  80 #include "llvm/Analysis/BasicAliasAnalysis.h"
  81 #include "llvm/Analysis/BlockFrequencyInfo.h"
  82 #include "llvm/Analysis/CFG.h"
  83 #include "llvm/Analysis/CodeMetrics.h"
  84 #include "llvm/Analysis/DemandedBits.h"
  85 #include "llvm/Analysis/GlobalsModRef.h"
  86 #include "llvm/Analysis/LoopAccessAnalysis.h"
  87 #include "llvm/Analysis/LoopAnalysisManager.h"
  88 #include "llvm/Analysis/LoopInfo.h"
  89 #include "llvm/Analysis/LoopIterator.h"
  90 #include "llvm/Analysis/MemorySSA.h"
  91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  92 #include "llvm/Analysis/ProfileSummaryInfo.h"
  93 #include "llvm/Analysis/ScalarEvolution.h"
  94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
  95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  96 #include "llvm/Analysis/TargetLibraryInfo.h"
  97 #include "llvm/Analysis/TargetTransformInfo.h"
  98 #include "llvm/Analysis/VectorUtils.h"
  99 #include "llvm/IR/Attributes.h"
 100 #include "llvm/IR/BasicBlock.h"
 101 #include "llvm/IR/CFG.h"
 102 #include "llvm/IR/Constant.h"
 103 #include "llvm/IR/Constants.h"
 104 #include "llvm/IR/DataLayout.h"
 105 #include "llvm/IR/DebugInfoMetadata.h"
 106 #include "llvm/IR/DebugLoc.h"
 107 #include "llvm/IR/DerivedTypes.h"
 108 #include "llvm/IR/DiagnosticInfo.h"
 109 #include "llvm/IR/Dominators.h"
 110 #include "llvm/IR/Function.h"
 111 #include "llvm/IR/IRBuilder.h"
 112 #include "llvm/IR/InstrTypes.h"
 113 #include "llvm/IR/Instruction.h"
 114 #include "llvm/IR/Instructions.h"
 115 #include "llvm/IR/IntrinsicInst.h"
 116 #include "llvm/IR/Intrinsics.h"
 117 #include "llvm/IR/LLVMContext.h"
 118 #include "llvm/IR/Metadata.h"
 119 #include "llvm/IR/Module.h"
 120 #include "llvm/IR/Operator.h"
 121 #include "llvm/IR/Type.h"
 122 #include "llvm/IR/Use.h"
 123 #include "llvm/IR/User.h"
 124 #include "llvm/IR/Value.h"
 125 #include "llvm/IR/ValueHandle.h"
 126 #include "llvm/IR/Verifier.h"
 127 #include "llvm/InitializePasses.h"
 128 #include "llvm/Pass.h"
 129 #include "llvm/Support/Casting.h"
 130 #include "llvm/Support/CommandLine.h"
 131 #include "llvm/Support/Compiler.h"
 132 #include "llvm/Support/Debug.h"
 133 #include "llvm/Support/ErrorHandling.h"
 134 #include "llvm/Support/MathExtras.h"
 135 #include "llvm/Support/raw_ostream.h"
 136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 137 #include "llvm/Transforms/Utils/LoopSimplify.h"
 138 #include "llvm/Transforms/Utils/LoopUtils.h"
 139 #include "llvm/Transforms/Utils/LoopVersioning.h"
 140 #include "llvm/Transforms/Utils/SizeOpts.h"
 141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 142 #include <algorithm>
 143 #include <cassert>
 144 #include <cstdint>
 145 #include <cstdlib>
 146 #include <functional>
 147 #include <iterator>
 148 #include <limits>
 149 #include <memory>
 150 #include <string>
 151 #include <tuple>
 152 #include <utility>
 153
 154 using namespace llvm;
 155
 156 #define LV_NAME "loop-vectorize"
 157 #define DEBUG_TYPE LV_NAME
 158
 159 /// @{
 160 /// Metadata attribute names
     /// File-local string constants holding the three
     /// "llvm.loop.vectorize.followup_*" metadata attribute names (the "all",
     /// "vectorized" and "epilogue" variants).
 161 static const char *const LLVMLoopVectorizeFollowupAll =
 162     "llvm.loop.vectorize.followup_all";
 163 static const char *const LLVMLoopVectorizeFollowupVectorized =
 164     "llvm.loop.vectorize.followup_vectorized";
 165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
 166     "llvm.loop.vectorize.followup_epilogue";
 167 /// @}
 168
     // Pass statistics declared through the STATISTIC macro (see
     // llvm/ADT/Statistic.h); each takes a counter name and a description.
 169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
 170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 171
 172 /// Loops with a known constant trip count below this number are vectorized only
 173 /// if no scalar iteration overheads are incurred.
     /// Flag: -vectorizer-min-trip-count (default 16).
 174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
 175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
 176     cl::desc("Loops with a constant trip count that is smaller than this "
 177              "value are vectorized only if no scalar iteration overheads "
 178              "are incurred."));
 179
 180 // Indicates that an epilogue is undesired, predication is preferred.
 181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
 182 // into the loop and predicate the loop body accordingly.
     // Flag: -prefer-predicate-over-epilog (off by default).
 183 static cl::opt<bool> PreferPredicateOverEpilog(
 184     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
 185     cl::desc("Indicate that an epilogue is undesired, predication should be "
 186              "used instead."));
 187
     /// Flag: -vectorizer-maximize-bandwidth (off by default). Requests that the
     /// vectorization factor be selected to maximize bandwidth, i.e. determined
     /// by the smallest type in the loop.
 188 static cl::opt<bool> MaximizeBandwidth(
 189     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
 190     cl::desc("Maximize bandwidth when selecting vectorization factor which "
 191              "will be determined by the smallest type in loop."));
 192
     /// Flag: -enable-interleaved-mem-accesses (off by default). Enables
     /// vectorization of interleaved memory accesses in a loop.
 193 static cl::opt<bool> EnableInterleavedMemAccesses(
 194     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
 195     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
 196
 197 /// An interleave-group may need masking if it resides in a block that needs
 198 /// predication, or in order to mask away gaps.
     /// Flag: -enable-masked-interleaved-mem-accesses (off by default).
 199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
 200     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
 201     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
 202
     /// Flag: -tiny-trip-count-interleave-threshold (default 128). Loops whose
     /// estimated constant trip count is below this value are not interleaved.
 203 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
 204     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
 205     cl::desc("We don't interleave loops with a estimated constant trip count "
 206              "below this number"));
 207
     /// The ForceTarget* options below override values that would otherwise come
     /// from the target. Each defaults to 0.
     /// NOTE(review): 0 presumably means "no override"; the code consuming these
     /// options is not visible here -- confirm at the use sites.
     ///
     /// Flag: -force-target-num-scalar-regs. Overrides the target's number of
     /// scalar registers.
 208 static cl::opt<unsigned> ForceTargetNumScalarRegs(
 209     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
 210     cl::desc("A flag that overrides the target's number of scalar registers."));
 211
     /// Flag: -force-target-num-vector-regs. Overrides the target's number of
     /// vector registers.
 212 static cl::opt<unsigned> ForceTargetNumVectorRegs(
 213     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
 214     cl::desc("A flag that overrides the target's number of vector registers."));
 215
     /// Flag: -force-target-max-scalar-interleave. Overrides the target's maximum
     /// interleave factor for scalar loops.
 216 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
 217     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
 218     cl::desc("A flag that overrides the target's max interleave factor for "
 219              "scalar loops."));
 220
     /// Flag: -force-target-max-vector-interleave. Overrides the target's maximum
     /// interleave factor for vectorized loops.
 221 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
 222     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
 223     cl::desc("A flag that overrides the target's max interleave factor for "
 224              "vectorized loops."));
 225
     /// Flag: -force-target-instruction-cost. Replaces the target's expected cost
     /// of an instruction with a single constant value; intended mainly for
     /// getting consistent test results.
 226 static cl::opt<unsigned> ForceTargetInstructionCost(
 227     "force-target-instruction-cost", cl::init(0), cl::Hidden,
 228     cl::desc("A flag that overrides the target's expected cost for "
 229              "an instruction to a single constant value. Mostly "
 230              "useful for getting consistent testing."));
 231
     /// Flag: -small-loop-cost (default 20). The cost of a loop that the
     /// interleaver considers 'small'.
 232 static cl::opt<unsigned> SmallLoopCost(
 233     "small-loop-cost", cl::init(20), cl::Hidden,
 234     cl::desc(
 235         "The cost of a loop that is considered 'small' by the interleaver."));
 236
     /// Flag: -loop-vectorize-with-block-frequency (on by default). Enables the
     /// use of block frequency analysis to access PGO heuristics. In this file,
     /// getSmallBestKnownTC() checks it before consulting the profile-based
     /// estimated trip count.
 237 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
 238     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
 239     cl::desc("Enable the use of the block frequency analysis to access PGO "
 240              "heuristics minimizing code growth in cold regions and being more "
 241              "aggressive in hot regions."));
 242
 243 // Runtime interleave loops for load/store throughput.
     // Flag: -enable-loadstore-runtime-interleave (on by default).
 244 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
 245     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
 246     cl::desc(
 247         "Enable runtime interleaving until load/store ports are saturated"));
 248
 249 /// The number of stores in a loop that are allowed to need predication.
     /// Flag: -vectorize-num-stores-pred (default 1).
 250 static cl::opt<unsigned> NumberOfStoresToPredicate(
 251     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
 252     cl::desc("Max number of stores to be predicated behind an if."));
 253
     /// Flag: -enable-ind-var-reg-heur (on by default). Counts the induction
     /// variable only once when interleaving.
 254 static cl::opt<bool> EnableIndVarRegisterHeur(
 255     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
 256     cl::desc("Count the induction variable only once when interleaving"));
 257
     /// Flag: -enable-cond-stores-vec (on by default). Enables if-predication of
     /// stores during vectorization.
 258 static cl::opt<bool> EnableCondStoresVectorization(
 259     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
 260     cl::desc("Enable if predication of stores during vectorization."));
 261
     /// Flag: -max-nested-scalar-reduction-interleave (default 2). Upper bound on
     /// the interleave count used when interleaving a scalar reduction in a
     /// nested loop.
 262 static cl::opt<unsigned> MaxNestedScalarReductionIC(
 263     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
 264     cl::desc("The maximum interleave count to use when interleaving a scalar "
 265              "reduction in a nested loop."));
 266
     /// Flag: -enable-vplan-native-path (off by default). Enables the
     /// VPlan-native vectorization path with outer loop vectorization support.
     /// Unlike the options above, this one is not declared 'static'.
 267 cl::opt<bool> EnableVPlanNativePath(
 268     "enable-vplan-native-path", cl::init(false), cl::Hidden,
 269     cl::desc("Enable VPlan-native vectorization path with "
 270              "support for outer loop vectorization."));
 271
 272 // FIXME: Remove this switch once we have divergence analysis. Currently we
 273 // assume divergent non-backedge branches when this switch is true.
     // Flag: -enable-vplan-predication (off by default); also not 'static'.
 274 cl::opt<bool> EnableVPlanPredication(
 275     "enable-vplan-predication", cl::init(false), cl::Hidden,
 276     cl::desc("Enable VPlan-native vectorization path predicator with "
 277              "support for outer loop vectorization."));
 278
 279 // This flag enables the stress testing of the VPlan H-CFG construction in the
 280 // VPlan-native vectorization path. It must be used in conjunction with
 281 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
 282 // verification of the H-CFGs built.
 283 static cl::opt<bool> VPlanBuildStressTest(
 284     "vplan-build-stress-test", cl::init(false), cl::Hidden,
 285     cl::desc(
 286         "Build VPlan for every supported loop nest in the function and bail "
 287         "out right after the build (stress test the VPlan H-CFG construction "
 288         "in the VPlan-native vectorization path)."));
 289
     /// Flag: -interleave-loops (on by default). Enables loop interleaving in the
     /// loop vectorization passes. Defined here under its llvm::-qualified name.
 290 cl::opt<bool> llvm::EnableLoopInterleaving(
 291     "interleave-loops", cl::init(true), cl::Hidden,
 292     cl::desc("Enable loop interleaving in Loop vectorization passes"));
     /// Flag: -vectorize-loops (on by default). Controls whether the loop
     /// vectorization passes run. Defined here under its llvm::-qualified name.
 293 cl::opt<bool> llvm::EnableLoopVectorization(
 294     "vectorize-loops", cl::init(true), cl::Hidden,
 295     cl::desc("Run the Loop vectorization passes"));
 296
 297 /// A helper function for converting Scalar types to vector types.
 298 /// If the incoming type is void, we return void. If the VF is 1, we return
 299 /// the scalar type.
 300 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
 301   if (Scalar->isVoidTy() || VF == 1)
 302     return Scalar;
 303   return VectorType::get(Scalar, VF);
 304 }
 305
 306 /// A helper function that returns the type of loaded or stored value.
 307 static Type *getMemInstValueType(Value *I) {
 308   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
 309          "Expected Load or Store instruction");
 310   if (auto *LI = dyn_cast<LoadInst>(I))
 311     return LI->getType();
 312   return cast<StoreInst>(I)->getValueOperand()->getType();
 313 }
 314
 315 /// A helper function that returns true if the given type is irregular. The
 316 /// type is irregular if its allocated size doesn't equal the store size of an
 317 /// element of the corresponding vector type at the given vectorization factor.
 318 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
 319   // Determine if an array of VF elements of type Ty is "bitcast compatible"
 320   // with a <VF x Ty> vector.
 321   if (VF > 1) {
 322     auto *VectorTy = VectorType::get(Ty, VF);
 323     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
 324   }
 325
 326   // If the vectorization factor is one, we just check if an array of type Ty
 327   // requires padding between elements.
 328   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
 329 }
 330
 331 /// A helper function that returns the reciprocal of the block probability of
 332 /// predicated blocks. If we return X, we are assuming the predicated block
 333 /// will execute once for every X iterations of the loop header.
 334 ///
 335 /// TODO: We should use actual block probability here, if available. Currently,
 336 ///       we always assume predicated blocks have a 50% chance of executing.
 337 static unsigned getReciprocalPredBlockProb() { return 2; }
 338
 339 /// A helper function that adds a 'fast' flag to floating-point operations.
 340 static Value *addFastMathFlag(Value *V) {
 341   if (isa<FPMathOperator>(V))
 342     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
 343   return V;
 344 }
 345
 346 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
 347   if (isa<FPMathOperator>(V))
 348     cast<Instruction>(V)->setFastMathFlags(FMF);
 349   return V;
 350 }
 351
 352 /// A helper function that returns an integer or floating-point constant with
 353 /// value C.
 354 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
 355   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
 356                            : ConstantFP::get(Ty, C);
 357 }
 358
 359 /// Returns "best known" trip count for the specified loop \p L as defined by
 360 /// the following procedure:
 361 ///   1) Returns exact trip count if it is known.
 362 ///   2) Returns expected trip count according to profile data if any.
 363 ///   3) Returns upper bound estimate if it is known.
 364 ///   4) Returns None if all of the above failed.
 365 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
 366   // Check if exact trip count is known.
 367   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
 368     return ExpectedTC;
 369
 370   // Check if there is an expected trip count available from profile data.
 371   if (LoopVectorizeWithBlockFrequency)
 372     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
 373       return EstimatedTC;
 374
 375   // Check if upper bound estimate is known.
 376   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
 377     return ExpectedTC;
 378
 379   return None;
 380 }
 381
 382 namespace llvm {
 383
 384 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 385 /// block to a specified vectorization factor (VF).
 386 /// This class performs the widening of scalars into vectors, or multiple
 387 /// scalars. This class also implements the following features:
 388 /// * It inserts an epilogue loop for handling loops that don't have iteration
 389 ///   counts that are known to be a multiple of the vectorization factor.
 390 /// * It handles the code generation for reduction variables.
 391 /// * Scalarization (implementation using scalars) of un-vectorizable
 392 ///   instructions.
 393 /// InnerLoopVectorizer does not perform any vectorization-legality
 394 /// checks, and relies on the caller to check for the different legality
 395 /// aspects. The InnerLoopVectorizer relies on the
 396 /// LoopVectorizationLegality class to provide information about the induction
 397 /// and reduction variables that were found for a given vectorization factor.
 398 class InnerLoopVectorizer {
 399 public:
 400   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
 401                       LoopInfo *LI, DominatorTree *DT,
 402                       const TargetLibraryInfo *TLI,
 403                       const TargetTransformInfo *TTI, AssumptionCache *AC,
 404                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
 405                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
 406                       LoopVectorizationCostModel *CM)
 407       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
 408         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
 409         Builder(PSE.getSE()->getContext()),
 410         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
 411   virtual ~InnerLoopVectorizer() = default;
 412
 413   /// Create a new empty loop. Unlink the old loop and connect the new one.
 414   /// Return the pre-header block of the new loop.
 415   BasicBlock *createVectorizedLoopSkeleton();
 416
 417   /// Widen a single instruction within the innermost loop.
 418   void widenInstruction(Instruction &I);
 419
 420   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
 421   void fixVectorizedLoop();
 422
 423   // Return true if any runtime check is added.
 424   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
 425
 426   /// A type for vectorized values in the new loop. Each value from the
 427   /// original loop, when vectorized, is represented by UF vector values in the
 428   /// new unrolled loop, where UF is the unroll factor.
 429   using VectorParts = SmallVector<Value *, 2>;
 430
 431   /// Vectorize a single GetElementPtrInst based on information gathered and
 432   /// decisions taken during planning.
 433   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
 434                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
 435
 436   /// Vectorize a single PHINode in a block. This method handles the induction
 437   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
 438   /// arbitrary length vectors.
 439   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
 440
 441   /// A helper function to scalarize a single Instruction in the innermost loop.
 442   /// Generates a sequence of scalar instances for each lane between \p MinLane
 443   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
 444   /// inclusive.
 445   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
 446                             bool IfPredicateInstr);
 447
 448   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
 449   /// is provided, the integer induction variable will first be truncated to
 450   /// the corresponding type.
 451   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
 452
 453   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
 454   /// vector or scalar value on-demand if one is not yet available. When
 455   /// vectorizing a loop, we visit the definition of an instruction before its
 456   /// uses. When visiting the definition, we either vectorize or scalarize the
 457   /// instruction, creating an entry for it in the corresponding map. (In some
 458   /// cases, such as induction variables, we will create both vector and scalar
 459   /// entries.) Then, as we encounter uses of the definition, we derive values
 460   /// for each scalar or vector use unless such a value is already available.
 461   /// For example, if we scalarize a definition and one of its uses is vector,
 462   /// we build the required vector on-demand with an insertelement sequence
 463   /// when visiting the use. Otherwise, if the use is scalar, we can use the
 464   /// existing scalar definition.
 465   ///
 466   /// Return a value in the new loop corresponding to \p V from the original
 467   /// loop at unroll index \p Part. If the value has already been vectorized,
 468   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
 469   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
 470   /// a new vector value on-demand by inserting the scalar values into a vector
 471   /// with an insertelement sequence. If the value has been neither vectorized
 472   /// nor scalarized, it must be loop invariant, so we simply broadcast the
 473   /// value into a vector.
 474   Value *getOrCreateVectorValue(Value *V, unsigned Part);
 475
 476   /// Return a value in the new loop corresponding to \p V from the original
 477   /// loop at unroll and vector indices \p Instance. If the value has been
 478   /// vectorized but not scalarized, the necessary extractelement instruction
 479   /// will be generated.
 480   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
 481
 482   /// Construct the vector value of a scalarized value \p V one lane at a time.
 483   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
 484
 485   /// Try to vectorize the interleaved access group that \p Instr belongs to
 486   /// with the base address given in \p Addr, optionally masking the vector
 487   /// operations if \p BlockInMask is non-null. Use \p State to translate given
 488   /// VPValues to IR values in the vectorized loop.
 489   void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
 490                                 VPValue *Addr, VPValue *BlockInMask = nullptr);
 491
 492   /// Vectorize Load and Store instructions with the base address given in \p
 493   /// Addr, optionally masking the vector operations if \p BlockInMask is
 494   /// non-null. Use \p State to translate given VPValues to IR values in the
 495   /// vectorized loop.
 496   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
 497                                   VPValue *Addr,
 498                                   VPValue *BlockInMask = nullptr);
 499
 500   /// Set the debug location in the builder using the debug location in
 501   /// the instruction.
 502   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
 503
 504   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
 505   void fixNonInductionPHIs(void);
 506
 507 protected:
 508   friend class LoopVectorizationPlanner;
 509
 510   /// A small list of PHINodes.
 511   using PhiVector = SmallVector<PHINode *, 4>;
 512
 513   /// A type for scalarized values in the new loop. Each value from the
 514   /// original loop, when scalarized, is represented by UF x VF scalar values
 515   /// in the new unrolled loop, where UF is the unroll factor and VF is the
 516   /// vectorization factor.
 517   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
 518
 519   /// Set up the values of the IVs correctly when exiting the vector loop.
 520   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
 521                     Value *CountRoundDown, Value *EndValue,
 522                     BasicBlock *MiddleBlock);
 523
 524   /// Create a new induction variable inside L.
 525   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
 526                                    Value *Step, Instruction *DL);
 527
 528   /// Handle all cross-iteration phis in the header.
 529   void fixCrossIterationPHIs();
 530
 531   /// Fix a first-order recurrence. This is the second phase of vectorizing
 532   /// this phi node.
 533   void fixFirstOrderRecurrence(PHINode *Phi);
 534
 535   /// Fix a reduction cross-iteration phi. This is the second phase of
 536   /// vectorizing this phi node.
 537   void fixReduction(PHINode *Phi);
 538
 539   /// Clear NSW/NUW flags from reduction instructions if necessary.
 540   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
 541
 542   /// The Loop exit block may have single value PHI nodes with some
 543   /// incoming value. While vectorizing we only handled real values
 544   /// that were defined inside the loop and we should have one value for
 545   /// each predecessor of its parent basic block. See PR14725.
 546   void fixLCSSAPHIs();
 547
 548   /// Iteratively sink the scalarized operands of a predicated instruction into
 549   /// the block that was created for it.
 550   void sinkScalarOperands(Instruction *PredInst);
 551
 552   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
 553   /// represented as.
 554   void truncateToMinimalBitwidths();
 555
 556   /// Create a broadcast instruction. This method generates a broadcast
 557   /// instruction (shuffle) for loop invariant values and for the induction
 558   /// value. If this is the induction variable then we extend it to N, N+1, ...
 559   /// this is needed because each iteration in the loop corresponds to a SIMD
 560   /// element.
 561   virtual Value *getBroadcastInstrs(Value *V);
 562
 563   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
 564   /// to each vector element of Val. The sequence starts at StartIndex.
 565   /// \p Opcode is relevant for FP induction variable.
 566   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
 567                                Instruction::BinaryOps Opcode =
 568                                Instruction::BinaryOpsEnd);
 569
 570   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
 571   /// variable on which to base the steps, \p Step is the size of the step, and
 572   /// \p EntryVal is the value from the original loop that maps to the steps.
 573   /// Note that \p EntryVal doesn't have to be an induction variable - it
 574   /// can also be a truncate instruction.
 575   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
 576                         const InductionDescriptor &ID);
 577
 578   /// Create a vector induction phi node based on an existing scalar one. \p
 579   /// EntryVal is the value from the original loop that maps to the vector phi
 580   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
 581   /// truncate instruction, instead of widening the original IV, we widen a
 582   /// version of the IV truncated to \p EntryVal's type.
 583   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
 584                                        Value *Step, Instruction *EntryVal);
 585
 586   /// Returns true if an instruction \p I should be scalarized instead of
 587   /// vectorized for the chosen vectorization factor.
 588   bool shouldScalarizeInstruction(Instruction *I) const;
 589
 590   /// Returns true if we should generate a scalar version of \p IV.
 591   bool needsScalarInduction(Instruction *IV) const;
 592
 593   /// If there is a cast involved in the induction variable \p ID, which should
 594   /// be ignored in the vectorized loop body, this function records the
 595   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
 596   /// cast. We had already proved that the casted Phi is equal to the uncasted
 597   /// Phi in the vectorized loop (under a runtime guard), and therefore
 598   /// there is no need to vectorize the cast - the same value can be used in the
 599   /// vector loop for both the Phi and the cast.
 600   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
 601   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
 602   ///
 603   /// \p EntryVal is the value from the original loop that maps to the vector
 604   /// phi node and is used to distinguish what is the IV currently being
 605   /// processed - original one (if \p EntryVal is a phi corresponding to the
 606   /// original IV) or the "newly-created" one based on the proof mentioned above
 607   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
 608   /// latter case \p EntryVal is a TruncInst and we must not record anything for
 609   /// that IV, but it's error-prone to expect callers of this routine to care
 610   /// about that, hence this explicit parameter.
 611   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
 612                                              const Instruction *EntryVal,
 613                                              Value *VectorLoopValue,
 614                                              unsigned Part,
 615                                              unsigned Lane = UINT_MAX);
 616
  /// Generate a shuffle sequence that will reverse the vector \p Vec.
  /// Declared virtual so that a subclass can supply its own implementation.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns \p V bitcasted to the requested vector type \p DstVTy.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);
 630
  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  /// NOTE(review): \p Bypass is presumably the block that is branched to when
  /// the check fails - confirm against the definition.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
 641
  /// Compute the transformed value of \p Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// NOTE(review): StartValue and StepValue are not parameters of this
  /// function; presumably they are taken from \p ID - confirm against the
  /// definition.
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;
 651
  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks.  Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata).  Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of values. Only the elements of \p To that are instructions
  /// receive metadata.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);
 669
  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AliasAnalysis *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning.  It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  unsigned VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The IR builder that we use.
  IRBuilder<> Builder;
 716
  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  /// Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  /// Holds the end values for each induction variable. We save the end values
  /// so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// Vector of original scalar PHIs whose corresponding widened PHIs need to be
  /// fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
 777 };
 778
/// A specialization of InnerLoopVectorizer that is driven by an unroll factor
/// only. It overrides the broadcast, step-vector and reverse hooks of the base
/// class.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  /// Forwards all arguments to the InnerLoopVectorizer constructor, passing
  /// the literal 1 in the slot that precedes \p UnrollFactor.
  /// NOTE(review): that literal is presumably the vectorization factor (VF),
  /// i.e. no widening - confirm against the base-class constructor.
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
                            UnrollFactor, LVL, CM) {}

private:
  // Overrides of the base-class code generation hooks.
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};
 798
 799 } // end namespace llvm
 800
 801 /// Look for a meaningful debug location on the instruction or it's
 802 /// operands.
 803 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
 804   if (!I)
 805     return I;
 806
 807   DebugLoc Empty;
 808   if (I->getDebugLoc() != Empty)
 809     return I;
 810
 811   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
 812     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
 813       if (OpInst->getDebugLoc() != Empty)
 814         return OpInst;
 815   }
 816
 817   return I;
 818 }
 819
 820 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
 821   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
 822     const DILocation *DIL = Inst->getDebugLoc();
 823     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
 824         !isa<DbgInfoIntrinsic>(Inst)) {
 825       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
 826       if (NewDIL)
 827         B.SetCurrentDebugLocation(NewDIL.getValue());
 828       else
 829         LLVM_DEBUG(dbgs()
 830                    << "Failed to create new discriminator: "
 831                    << DIL->getFilename() << " Line: " << DIL->getLine());
 832     }
 833     else
 834       B.SetCurrentDebugLocation(DIL);
 835   } else
 836     B.SetCurrentDebugLocation(DebugLoc());
 837 }
 838
 839 /// Write a record \p DebugMsg about vectorization failure to the debug
 840 /// output stream. If \p I is passed, it is an instruction that prevents
 841 /// vectorization.
 842 #ifndef NDEBUG
 843 static void debugVectorizationFailure(const StringRef DebugMsg,
 844     Instruction *I) {
 845   dbgs() << "LV: Not vectorizing: " << DebugMsg;
 846   if (I != nullptr)
 847     dbgs() << " " << *I;
 848   else
 849     dbgs() << '.';
 850   dbgs() << '\n';
 851 }
 852 #endif
 853
 854 /// Create an analysis remark that explains why vectorization failed
 855 ///
 856 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
 857 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
 858 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
 859 /// the location of the remark.  \return the remark object that can be
 860 /// streamed to.
 861 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
 862     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
 863   Value *CodeRegion = TheLoop->getHeader();
 864   DebugLoc DL = TheLoop->getStartLoc();
 865
 866   if (I) {
 867     CodeRegion = I->getParent();
 868     // If there is no debug location attached to the instruction, revert back to
 869     // using the loop's.
 870     if (I->getDebugLoc())
 871       DL = I->getDebugLoc();
 872   }
 873
 874   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
 875   R << "loop not vectorized: ";
 876   return R;
 877 }
 878
 879 namespace llvm {
 880
 881 void reportVectorizationFailure(const StringRef DebugMsg,
 882     const StringRef OREMsg, const StringRef ORETag,
 883     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
 884   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
 885   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
 886   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
 887                 ORETag, TheLoop, I) << OREMsg);
 888 }
 889
 890 } // end namespace llvm
 891
 892 #ifndef NDEBUG
 893 /// \return string containing a file name and a line # for the given loop.
 894 static std::string getDebugLocString(const Loop *L) {
 895   std::string Result;
 896   if (L) {
 897     raw_string_ostream OS(Result);
 898     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
 899       LoopDbgLoc.print(OS);
 900     else
 901       // Just print the module name.
 902       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
 903     OS.flush();
 904   }
 905   return Result;
 906 }
 907 #endif
 908
 909 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
 910                                          const Instruction *Orig) {
 911   // If the loop was versioned with memchecks, add the corresponding no-alias
 912   // metadata.
 913   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
 914     LVer->annotateInstWithNoAlias(To, Orig);
 915 }
 916
/// Give \p To the metadata of \p From: first propagate the metadata carried
/// by \p From, then add the extra annotations via addNewMetadata().
void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}
 922
 923 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
 924                                       Instruction *From) {
 925   for (Value *V : To) {
 926     if (Instruction *I = dyn_cast<Instruction>(V))
 927       addMetadata(I, From);
 928   }
 929 }
 930
 931 namespace llvm {
 932
/// Loop vectorization cost-model hints describing how the scalar epilogue
/// loop should be lowered.
enum ScalarEpilogueLowering {

  /// The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  /// Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  /// A special case of vectorization with OptForSize: loops with a very small
  /// trip count are considered for vectorization under OptForSize, thereby
  /// making sure the cost of their loop body is dominant, free of runtime
  /// guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  /// Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};
 952
 953 /// LoopVectorizationCostModel - estimates the expected speedups due to
 954 /// vectorization.
 955 /// In many cases vectorization is not profitable. This can happen because of
 956 /// a number of reasons. In this class we mainly attempt to predict the
 957 /// expected speedup/slowdowns due to the supported instruction set. We use the
 958 /// TargetTransformInfo to query the different backends for the cost of
 959 /// different operations.
 960 class LoopVectorizationCostModel {
 961 public:
  /// Construct a cost model for loop \p L. The constructor only stores the
  /// given analyses, hints and the scalar-epilogue policy \p SEL in the
  /// corresponding members; no analysis is run here.
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}
 974
  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF();

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to \p MaxVF. If UserVF is not
  /// ZERO then this vectorization factor will be selected if vectorization is
  /// possible.
  /// NOTE(review): this function takes only \p MaxVF; the sentence about
  /// UserVF may be stale (see selectUserVectorizationFactor) - confirm.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
 988
  /// Set up cost-based decisions for the user vectorization factor \p UserVF:
  /// collect the uniform and scalar values first, then the instructions to
  /// scalarize.
  void selectUserVectorizationFactor(unsigned UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }
 994
  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. \p VF and
  /// \p LoopCost are the selected vectorization factor and the cost of the
  /// selected VF.
  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(unsigned VF);
1014
  /// A struct that represents some properties of the register usage
  /// of a loop. Both maps are keyed by the ClassID of a target-provided
  /// register class.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };
1025
  /// \return Information about the register usages of the loop for the
  /// given vectorization factors \p VFs.
  /// NOTE(review): presumably one RegisterUsage entry per element of \p VFs,
  /// in the same order - confirm against the definition.
  SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();
1032
  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type. The result is a read-only reference to the MinBWs member.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
1039
1040   /// \returns True if it is more profitable to scalarize instruction \p I for
1041   /// vectorization factor \p VF.
1042   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1043     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1044
1045     // Cost model is not run in the VPlan-native path - return conservative
1046     // result until this changes.
1047     if (EnableVPlanNativePath)
1048       return false;
1049
1050     auto Scalars = InstsToScalarize.find(VF);
1051     assert(Scalars != InstsToScalarize.end() &&
1052            "VF not yet analyzed for scalarization profitability");
1053     return Scalars->second.find(I) != Scalars->second.end();
1054   }
1055
1056   /// Returns true if \p I is known to be uniform after vectorization.
1057   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1058     if (VF == 1)
1059       return true;
1060
1061     // Cost model is not run in the VPlan-native path - return conservative
1062     // result until this changes.
1063     if (EnableVPlanNativePath)
1064       return false;
1065
1066     auto UniformsPerVF = Uniforms.find(VF);
1067     assert(UniformsPerVF != Uniforms.end() &&
1068            "VF not yet analyzed for uniformity");
1069     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1070   }
1071
1072   /// Returns true if \p I is known to be scalar after vectorization.
1073   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1074     if (VF == 1)
1075       return true;
1076
1077     // Cost model is not run in the VPlan-native path - return conservative
1078     // result until this changes.
1079     if (EnableVPlanNativePath)
1080       return false;
1081
1082     auto ScalarsPerVF = Scalars.find(VF);
1083     assert(ScalarsPerVF != Scalars.end() &&
1084            "Scalar values are not calculated for VF");
1085     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1086   }
1087
1088   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1089   /// for vectorization factor \p VF.
1090   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1091     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1092            !isProfitableToScalarize(I, VF) &&
1093            !isScalarAfterVectorization(I, VF);
1094   }
1095
  /// Decision that was taken during cost calculation for a memory
  /// instruction.
  enum InstWidening {
    CM_Unknown,       // No decision has been recorded for the instruction.
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // Vectorized as part of an interleave group.
    CM_GatherScatter, // Vectorized as a gather or scatter.
    CM_Scalarize      // Scalarized.
  };
1105
1106   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1107   /// instruction \p I and vector width \p VF.
1108   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1109                            unsigned Cost) {
1110     assert(VF >= 2 && "Expected VF >=2");
1111     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1112   }
1113
1114   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1115   /// interleaving group \p Grp and vector width \p VF.
1116   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1117                            InstWidening W, unsigned Cost) {
1118     assert(VF >= 2 && "Expected VF >=2");
1119     /// Broadcast this decicion to all instructions inside the group.
1120     /// But the cost will be assigned to one instruction only.
1121     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1122       if (auto *I = Grp->getMember(i)) {
1123         if (Grp->getInsertPos() == I)
1124           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1125         else
1126           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1127       }
1128     }
1129   }
1130
1131   /// Return the cost model decision for the given instruction \p I and vector
1132   /// width \p VF. Return CM_Unknown if this instruction did not pass
1133   /// through the cost modeling.
1134   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1135     assert(VF >= 2 && "Expected VF >=2");
1136
1137     // Cost model is not run in the VPlan-native path - return conservative
1138     // result until this changes.
1139     if (EnableVPlanNativePath)
1140       return CM_GatherScatter;
1141
1142     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1143     auto Itr = WideningDecisions.find(InstOnVF);
1144     if (Itr == WideningDecisions.end())
1145       return CM_Unknown;
1146     return Itr->second.first;
1147   }
1148
1149   /// Return the vectorization cost for the given instruction \p I and vector
1150   /// width \p VF.
1151   unsigned getWideningCost(Instruction *I, unsigned VF) {
1152     assert(VF >= 2 && "Expected VF >=2");
1153     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1154     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1155            "The cost is not calculated");
1156     return WideningDecisions[InstOnVF].second;
1157   }
1158
1159   /// Return True if instruction \p I is an optimizable truncate whose operand
1160   /// is an induction variable. Such a truncate will be removed by adding a new
1161   /// induction variable with the destination type.
1162   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1163     // If the instruction is not a truncate, return false.
1164     auto *Trunc = dyn_cast<TruncInst>(I);
1165     if (!Trunc)
1166       return false;
1167
1168     // Get the source and destination types of the truncate.
1169     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1170     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1171
1172     // If the truncate is free for the given types, return false. Replacing a
1173     // free truncate with an induction variable would add an induction variable
1174     // update instruction to each iteration of the loop. We exclude from this
1175     // check the primary induction variable since it will need an update
1176     // instruction regardless.
1177     Value *Op = Trunc->getOperand(0);
1178     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1179       return false;
1180
1181     // If the truncated value is not an induction variable, return false.
1182     return Legal->isInductionPhi(Op);
1183   }
1184
  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop, for vectorization factor \p VF.
  void collectInstsToScalarize(unsigned VF);
1188
1189   /// Collect Uniform and Scalar values for the given \p VF.
1190   /// The sets depend on CM decision for Load/Store instructions
1191   /// that may be vectorized as interleave, gather-scatter or scalarized.
1192   void collectUniformsAndScalars(unsigned VF) {
1193     // Do the analysis once.
1194     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1195       return;
1196     setCostBasedWideningDecision(VF);
1197     collectLoopUniforms(VF);
1198     collectLoopScalars(VF);
1199   }
1200
1201   /// Returns true if the target machine supports masked store operation
1202   /// for the given \p DataType and kind of access to \p Ptr.
1203   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1204     return Legal->isConsecutivePtr(Ptr) &&
1205            TTI.isLegalMaskedStore(DataType, Alignment);
1206   }
1207
1208   /// Returns true if the target machine supports masked load operation
1209   /// for the given \p DataType and kind of access to \p Ptr.
1210   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1211     return Legal->isConsecutivePtr(Ptr) &&
1212            TTI.isLegalMaskedLoad(DataType, Alignment);
1213   }
1214
  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType and \p Alignment. The query is forwarded
  /// unchanged to the target transform info.
  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }
1220
  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType and \p Alignment. The query is forwarded
  /// unchanged to the target transform info.
  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }
1226
1227   /// Returns true if the target machine can represent \p V as a masked gather
1228   /// or scatter operation.
1229   bool isLegalGatherOrScatter(Value *V) {
1230     bool LI = isa<LoadInst>(V);
1231     bool SI = isa<StoreInst>(V);
1232     if (!LI && !SI)
1233       return false;
1234     auto *Ty = getMemInstValueType(V);
1235     MaybeAlign Align = getLoadStoreAlignment(V);
1236     return (LI && isLegalMaskedGather(Ty, Align)) ||
1237            (SI && isLegalMaskedScatter(Ty, Align));
1238   }
1239
  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if \p I will be scalarized
  /// with predication for that VF. \p VF defaults to 1.
  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1246
1247   // Returns true if \p I is an instruction that will be predicated either
1248   // through scalar predication or masked load/store or masked gather/scatter.
1249   // Superset of instructions that return true for isScalarWithPredication.
1250   bool isPredicatedInst(Instruction *I) {
1251     if (!blockNeedsPredication(I->getParent()))
1252       return false;
1253     // Loads and stores that need some form of masked operation are predicated
1254     // instructions.
1255     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1256       return Legal->isMaskRequired(I);
1257     return isScalarWithPredication(I);
1258   }
1259
  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened. \p VF defaults to 1.
  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles. \p VF defaults to 1.
  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1268
  /// Check if \p Instr belongs to any interleaved access group, as reported
  /// by the interleaved access info this cost model was constructed with.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }
1273
  /// Get the interleaved access group that \p Instr belongs to, as reported
  /// by the interleaved access info this cost model was constructed with.
  /// NOTE(review): presumably null when \p Instr is not in any group -
  /// confirm against InterleavedAccessInfo::getInterleaveGroup.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }
1279
1280   /// Returns true if an interleaved group requires a scalar iteration
1281   /// to handle accesses with gaps, and there is nothing preventing us from
1282   /// creating a scalar epilogue.
1283   bool requiresScalarEpilogue() const {
1284     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1285   }
1286
  /// Returns true if a scalar epilogue is allowed, i.e. the scalar epilogue
  /// status is CM_ScalarEpilogueAllowed. Returns false when the epilogue is
  /// not allowed, e.g. due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }
1292
  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }
1295
1296   bool blockNeedsPredication(BasicBlock *BB) {
1297     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1298   }
1299
  /// Estimate cost of an intrinsic call instruction \p CI if it were
  /// vectorized with factor \p VF.  Return the cost of the instruction,
  /// including scalarization overhead if it's needed.
  unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);

  /// Estimate cost of a call instruction \p CI if it were vectorized with
  /// factor \p VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed. The flag \p NeedToScalarize shows
  /// if the call needs to be scalarized - i.e. either a vector version isn't
  /// available, or it is too expensive.
  unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1311
1312 private:
  /// NOTE(review): undocumented in the original; presumably the number of
  /// predicated stores seen by the cost model - confirm against its uses.
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, larger than zero.
  /// One is returned if vectorization should best be avoided due to cost.
  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1318
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<unsigned, bool>;
1327
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter (\p VectorTy).
  unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);

  /// The cost computation for a scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);

  /// The cost computation for an interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);

  /// The cost computation for a Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, unsigned VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1367
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1370   bool isConsecutiveLoadOrStore(Instruction *I);
1371
1372   /// Returns true if an artificially high cost for emulated masked memrefs
1373   /// should be used.
1374   bool useEmulatedMaskMemRefHack(Instruction *I);
1375
1376   /// Map of scalar integer values to the smallest bitwidth they can be legally
1377   /// represented as. The vector equivalents of these values should be truncated
1378   /// to this type.
1379   MapVector<Instruction *, uint64_t> MinBWs;
1380
1381   /// A type representing the costs for instructions if they were to be
1382   /// scalarized rather than vectorized. The entries are Instruction-Cost
1383   /// pairs.
1384   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1385
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1388   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1389
1390   /// Records whether it is allowed to have the original scalar loop execute at
1391   /// least once. This may be needed as a fallback loop in case runtime
1392   /// aliasing/dependence checks fail, or to handle the tail/remainder
1393   /// iterations when the trip count is unknown or doesn't divide by the VF,
1394   /// or as a peel-loop to handle gaps in interleave-groups.
1395   /// Under optsize and when the trip count is very small we don't allow any
1396   /// iterations to execute in the scalar loop.
1397   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1398
1399   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1400   bool FoldTailByMasking = false;
1401
  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostsTy pairs.
1406   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1407
1408   /// Holds the instructions known to be uniform after vectorization.
1409   /// The data is collected per VF.
1410   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1411
1412   /// Holds the instructions known to be scalar after vectorization.
1413   /// The data is collected per VF.
1414   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1415
1416   /// Holds the instructions (address computations) that are forced to be
1417   /// scalarized.
1418   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1419
1420   /// Returns the expected difference in cost from scalarizing the expression
1421   /// feeding a predicated instruction \p PredInst. The instructions to
1422   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1423   /// non-negative return value implies the expression will be scalarized.
1424   /// Currently, only single-use chains are considered for scalarization.
1425   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1426                               unsigned VF);
1427
1428   /// Collect the instructions that are uniform after vectorization. An
1429   /// instruction is uniform if we represent it with a single scalar value in
1430   /// the vectorized loop corresponding to each vector iteration. Examples of
1431   /// uniform instructions include pointer operands of consecutive or
1432   /// interleaved memory accesses. Note that although uniformity implies an
1433   /// instruction will be scalar, the reverse is not true. In general, a
1434   /// scalarized instruction will be represented by VF scalar values in the
1435   /// vectorized loop, each corresponding to an iteration of the original
1436   /// scalar loop.
1437   void collectLoopUniforms(unsigned VF);
1438
1439   /// Collect the instructions that are scalar after vectorization. An
1440   /// instruction is scalar if it is known to be uniform or will be scalarized
1441   /// during vectorization. Non-uniform scalarized instructions will be
1442   /// represented by VF values in the vectorized loop, each corresponding to an
1443   /// iteration of the original scalar loop.
1444   void collectLoopScalars(unsigned VF);
1445
1446   /// Keeps cost model vectorization decision and cost for instructions.
1447   /// Right now it is used for memory instructions only.
1448   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1449                                 std::pair<InstWidening, unsigned>>;
1450
1451   DecisionList WideningDecisions;
1452
1453   /// Returns true if \p V is expected to be vectorized and it needs to be
1454   /// extracted.
1455   bool needsExtract(Value *V, unsigned VF) const {
1456     Instruction *I = dyn_cast<Instruction>(V);
1457     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1458       return false;
1459
1460     // Assume we can vectorize V (and hence we need extraction) if the
1461     // scalars are not computed yet. This can happen, because it is called
1462     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1463     // the scalars are collected. That should be a safe assumption in most
1464     // cases, because we check if the operands have vectorizable types
1465     // beforehand in LoopVectorizationLegality.
1466     return Scalars.find(VF) == Scalars.end() ||
1467            !isScalarAfterVectorization(I, VF);
1468   };
1469
1470   /// Returns a range containing only operands needing to be extracted.
1471   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1472                                                    unsigned VF) {
1473     return SmallVector<Value *, 4>(make_filter_range(
1474         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1475   }
1476
1477 public:
1478   /// The loop that we evaluate.
1479   Loop *TheLoop;
1480
1481   /// Predicated scalar evolution analysis.
1482   PredicatedScalarEvolution &PSE;
1483
1484   /// Loop Info analysis.
1485   LoopInfo *LI;
1486
1487   /// Vectorization legality.
1488   LoopVectorizationLegality *Legal;
1489
1490   /// Vector target information.
1491   const TargetTransformInfo &TTI;
1492
1493   /// Target Library Info.
1494   const TargetLibraryInfo *TLI;
1495
1496   /// Demanded bits analysis.
1497   DemandedBits *DB;
1498
1499   /// Assumption cache.
1500   AssumptionCache *AC;
1501
1502   /// Interface to emit optimization remarks.
1503   OptimizationRemarkEmitter *ORE;
1504
1505   const Function *TheFunction;
1506
1507   /// Loop Vectorize Hint.
1508   const LoopVectorizeHints *Hints;
1509
1510   /// The interleave access information contains groups of interleaved accesses
1511   /// with the same stride and close to each other.
1512   InterleavedAccessInfo &InterleaveInfo;
1513
1514   /// Values to ignore in the cost model.
1515   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1516
1517   /// Values to ignore in the cost model when VF > 1.
1518   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1519 };
1520
1521 } // end namespace llvm
1522
1523 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1524 // vectorization. The loop needs to be annotated with #pragma omp simd
1525 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1526 // vector length information is not provided, vectorization is not considered
1527 // explicit. Interleave hints are not allowed either. These limitations will be
1528 // relaxed in the future.
1529 // Please, note that we are currently forced to abuse the pragma 'clang
1530 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1531 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1532 // provides *explicit vectorization hints* (LV can bypass legal checks and
1533 // assume that vectorization is legal). However, both hints are implemented
1534 // using the same metadata (llvm.loop.vectorize, processed by
1535 // LoopVectorizeHints). This will be fixed in the future when the native IR
1536 // representation for pragma 'omp simd' is introduced.
1537 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1538                                    OptimizationRemarkEmitter *ORE) {
1539   assert(!OuterLp->empty() && "This is not an outer loop");
1540   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1541
1542   // Only outer loops with an explicit vectorization hint are supported.
1543   // Unannotated outer loops are ignored.
1544   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1545     return false;
1546
1547   Function *Fn = OuterLp->getHeader()->getParent();
1548   if (!Hints.allowVectorization(Fn, OuterLp,
1549                                 true /*VectorizeOnlyWhenForced*/)) {
1550     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1551     return false;
1552   }
1553
1554   if (Hints.getInterleave() > 1) {
1555     // TODO: Interleave support is future work.
1556     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1557                          "outer loops.\n");
1558     Hints.emitRemarkWithHints();
1559     return false;
1560   }
1561
1562   return true;
1563 }
1564
1565 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1566                                   OptimizationRemarkEmitter *ORE,
1567                                   SmallVectorImpl<Loop *> &V) {
1568   // Collect inner loops and outer loops without irreducible control flow. For
1569   // now, only collect outer loops that have explicit vectorization hints. If we
1570   // are stress testing the VPlan H-CFG construction, we collect the outermost
1571   // loop of every loop nest.
1572   if (L.empty() || VPlanBuildStressTest ||
1573       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1574     LoopBlocksRPO RPOT(&L);
1575     RPOT.perform(LI);
1576     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1577       V.push_back(&L);
1578       // TODO: Collect inner loops inside marked outer loops in case
1579       // vectorization fails for the outer loop. Do not invoke
1580       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1581       // already known to be reducible. We can use an inherited attribute for
1582       // that.
1583       return;
1584     }
1585   }
1586   for (Loop *InnerL : L)
1587     collectSupportedLoops(*InnerL, LI, ORE, V);
1588 }
1589
1590 namespace {
1591
1592 /// The LoopVectorize Pass.
1593 struct LoopVectorize : public FunctionPass {
1594   /// Pass identification, replacement for typeid
1595   static char ID;
1596
1597   LoopVectorizePass Impl;
1598
1599   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1600                          bool VectorizeOnlyWhenForced = false)
1601       : FunctionPass(ID) {
1602     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1603     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1604     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1605   }
1606
1607   bool runOnFunction(Function &F) override {
1608     if (skipFunction(F))
1609       return false;
1610
1611     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1612     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1613     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1614     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1615     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1616     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1617     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1618     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1619     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1620     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1621     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1622     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1623     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1624
1625     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1626         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1627
1628     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1629                         GetLAA, *ORE, PSI);
1630   }
1631
1632   void getAnalysisUsage(AnalysisUsage &AU) const override {
1633     AU.addRequired<AssumptionCacheTracker>();
1634     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1635     AU.addRequired<DominatorTreeWrapperPass>();
1636     AU.addRequired<LoopInfoWrapperPass>();
1637     AU.addRequired<ScalarEvolutionWrapperPass>();
1638     AU.addRequired<TargetTransformInfoWrapperPass>();
1639     AU.addRequired<AAResultsWrapperPass>();
1640     AU.addRequired<LoopAccessLegacyAnalysis>();
1641     AU.addRequired<DemandedBitsWrapperPass>();
1642     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1643
1644     // We currently do not preserve loopinfo/dominator analyses with outer loop
1645     // vectorization. Until this is addressed, mark these analyses as preserved
1646     // only for non-VPlan-native path.
1647     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1648     if (!EnableVPlanNativePath) {
1649       AU.addPreserved<LoopInfoWrapperPass>();
1650       AU.addPreserved<DominatorTreeWrapperPass>();
1651     }
1652
1653     AU.addPreserved<BasicAAWrapperPass>();
1654     AU.addPreserved<GlobalsAAWrapperPass>();
1655     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1656   }
1657 };
1658
1659 } // end anonymous namespace
1660
1661 //===----------------------------------------------------------------------===//
1662 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1663 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1664 //===----------------------------------------------------------------------===//
1665
1666 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1667   // We need to place the broadcast of invariant variables outside the loop,
1668   // but only if it's proven safe to do so. Else, broadcast will be inside
1669   // vector loop body.
1670   Instruction *Instr = dyn_cast<Instruction>(V);
1671   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1672                      (!Instr ||
1673                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1674   // Place the code for broadcasting invariant variables in the new preheader.
1675   IRBuilder<>::InsertPointGuard Guard(Builder);
1676   if (SafeToHoist)
1677     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1678
1679   // Broadcast the scalar into all locations in the vector.
1680   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1681
1682   return Shuf;
1683 }
1684
1685 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1686     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1687   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1688          "Expected either an induction phi-node or a truncate of it!");
1689   Value *Start = II.getStartValue();
1690
1691   // Construct the initial value of the vector IV in the vector loop preheader
1692   auto CurrIP = Builder.saveIP();
1693   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1694   if (isa<TruncInst>(EntryVal)) {
1695     assert(Start->getType()->isIntegerTy() &&
1696            "Truncation requires an integer type");
1697     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1698     Step = Builder.CreateTrunc(Step, TruncType);
1699     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1700   }
1701   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1702   Value *SteppedStart =
1703       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1704
1705   // We create vector phi nodes for both integer and floating-point induction
1706   // variables. Here, we determine the kind of arithmetic we will perform.
1707   Instruction::BinaryOps AddOp;
1708   Instruction::BinaryOps MulOp;
1709   if (Step->getType()->isIntegerTy()) {
1710     AddOp = Instruction::Add;
1711     MulOp = Instruction::Mul;
1712   } else {
1713     AddOp = II.getInductionOpcode();
1714     MulOp = Instruction::FMul;
1715   }
1716
1717   // Multiply the vectorization factor by the step using integer or
1718   // floating-point arithmetic as appropriate.
1719   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1720   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1721
1722   // Create a vector splat to use in the induction update.
1723   //
1724   // FIXME: If the step is non-constant, we create the vector splat with
1725   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1726   //        handle a constant vector splat.
1727   Value *SplatVF = isa<Constant>(Mul)
1728                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1729                        : Builder.CreateVectorSplat(VF, Mul);
1730   Builder.restoreIP(CurrIP);
1731
1732   // We may need to add the step a number of times, depending on the unroll
1733   // factor. The last of those goes into the PHI.
1734   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1735                                     &*LoopVectorBody->getFirstInsertionPt());
1736   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1737   Instruction *LastInduction = VecInd;
1738   for (unsigned Part = 0; Part < UF; ++Part) {
1739     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1740
1741     if (isa<TruncInst>(EntryVal))
1742       addMetadata(LastInduction, EntryVal);
1743     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1744
1745     LastInduction = cast<Instruction>(addFastMathFlag(
1746         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1747     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1748   }
1749
1750   // Move the last step to the end of the latch block. This ensures consistent
1751   // placement of all induction updates.
1752   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1753   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1754   auto *ICmp = cast<Instruction>(Br->getCondition());
1755   LastInduction->moveBefore(ICmp);
1756   LastInduction->setName("vec.ind.next");
1757
1758   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1759   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1760 }
1761
1762 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1763   return Cost->isScalarAfterVectorization(I, VF) ||
1764          Cost->isProfitableToScalarize(I, VF);
1765 }
1766
1767 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1768   if (shouldScalarizeInstruction(IV))
1769     return true;
1770   auto isScalarInst = [&](User *U) -> bool {
1771     auto *I = cast<Instruction>(U);
1772     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1773   };
1774   return llvm::any_of(IV->users(), isScalarInst);
1775 }
1776
1777 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1778     const InductionDescriptor &ID, const Instruction *EntryVal,
1779     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1780   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1781          "Expected either an induction phi-node or a truncate of it!");
1782
1783   // This induction variable is not the phi from the original loop but the
1784   // newly-created IV based on the proof that casted Phi is equal to the
1785   // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
1786   // re-uses the same InductionDescriptor that original IV uses but we don't
1787   // have to do any recording in this case - that is done when original IV is
1788   // processed.
1789   if (isa<TruncInst>(EntryVal))
1790     return;
1791
1792   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1793   if (Casts.empty())
1794     return;
1795   // Only the first Cast instruction in the Casts vector is of interest.
1796   // The rest of the Casts (if exist) have no uses outside the
1797   // induction update chain itself.
1798   Instruction *CastInst = *Casts.begin();
1799   if (Lane < UINT_MAX)
1800     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1801   else
1802     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1803 }
1804
1805 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1806   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1807          "Primary induction variable must have an integer type");
1808
1809   auto II = Legal->getInductionVars()->find(IV);
1810   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1811
1812   auto ID = II->second;
1813   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1814
1815   // The scalar value to broadcast. This will be derived from the canonical
1816   // induction variable.
1817   Value *ScalarIV = nullptr;
1818
1819   // The value from the original loop to which we are mapping the new induction
1820   // variable.
1821   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1822
1823   // True if we have vectorized the induction variable.
1824   auto VectorizedIV = false;
1825
1826   // Determine if we want a scalar version of the induction variable. This is
1827   // true if the induction variable itself is not widened, or if it has at
1828   // least one user in the loop that is not widened.
1829   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1830
1831   // Generate code for the induction step. Note that induction steps are
1832   // required to be loop-invariant
1833   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1834          "Induction step should be loop invariant");
1835   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1836   Value *Step = nullptr;
1837   if (PSE.getSE()->isSCEVable(IV->getType())) {
1838     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1839     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1840                              LoopVectorPreHeader->getTerminator());
1841   } else {
1842     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1843   }
1844
1845   // Try to create a new independent vector induction variable. If we can't
1846   // create the phi node, we will splat the scalar induction variable in each
1847   // loop iteration.
1848   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1849     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1850     VectorizedIV = true;
1851   }
1852
1853   // If we haven't yet vectorized the induction variable, or if we will create
1854   // a scalar one, we need to define the scalar induction variable and step
1855   // values. If we were given a truncation type, truncate the canonical
1856   // induction variable and step. Otherwise, derive these values from the
1857   // induction descriptor.
1858   if (!VectorizedIV || NeedsScalarIV) {
1859     ScalarIV = Induction;
1860     if (IV != OldInduction) {
1861       ScalarIV = IV->getType()->isIntegerTy()
1862                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1863                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1864                                           IV->getType());
1865       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1866       ScalarIV->setName("offset.idx");
1867     }
1868     if (Trunc) {
1869       auto *TruncType = cast<IntegerType>(Trunc->getType());
1870       assert(Step->getType()->isIntegerTy() &&
1871              "Truncation requires an integer step");
1872       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1873       Step = Builder.CreateTrunc(Step, TruncType);
1874     }
1875   }
1876
1877   // If we haven't yet vectorized the induction variable, splat the scalar
1878   // induction variable, and build the necessary step vectors.
1879   // TODO: Don't do it unless the vectorized IV is really required.
1880   if (!VectorizedIV) {
1881     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1882     for (unsigned Part = 0; Part < UF; ++Part) {
1883       Value *EntryPart =
1884           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1885       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1886       if (Trunc)
1887         addMetadata(EntryPart, Trunc);
1888       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1889     }
1890   }
1891
1892   // If an induction variable is only used for counting loop iterations or
1893   // calculating addresses, it doesn't need to be widened. Create scalar steps
1894   // that can be used by instructions we will later scalarize. Note that the
1895   // addition of the scalar steps will not increase the number of instructions
1896   // in the loop in the common case prior to InstCombine. We will be trading
1897   // one vector extract for each scalar step.
1898   if (NeedsScalarIV)
1899     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1900 }
1901
1902 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1903                                           Instruction::BinaryOps BinOp) {
1904   // Create and check the types.
1905   assert(Val->getType()->isVectorTy() && "Must be a vector");
1906   int VLen = Val->getType()->getVectorNumElements();
1907
1908   Type *STy = Val->getType()->getScalarType();
1909   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1910          "Induction Step must be an integer or FP");
1911   assert(Step->getType() == STy && "Step has wrong type");
1912
1913   SmallVector<Constant *, 8> Indices;
1914
1915   if (STy->isIntegerTy()) {
1916     // Create a vector of consecutive numbers from zero to VF.
1917     for (int i = 0; i < VLen; ++i)
1918       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1919
1920     // Add the consecutive indices to the vector value.
1921     Constant *Cv = ConstantVector::get(Indices);
1922     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1923     Step = Builder.CreateVectorSplat(VLen, Step);
1924     assert(Step->getType() == Val->getType() && "Invalid step vec");
1925     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1926     // which can be found from the original scalar operations.
1927     Step = Builder.CreateMul(Cv, Step);
1928     return Builder.CreateAdd(Val, Step, "induction");
1929   }
1930
1931   // Floating point induction.
1932   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1933          "Binary Opcode should be specified for FP induction");
1934   // Create a vector of consecutive numbers from zero to VF.
1935   for (int i = 0; i < VLen; ++i)
1936     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1937
1938   // Add the consecutive indices to the vector value.
1939   Constant *Cv = ConstantVector::get(Indices);
1940
1941   Step = Builder.CreateVectorSplat(VLen, Step);
1942
1943   // Floating point operations had to be 'fast' to enable the induction.
1944   FastMathFlags Flags;
1945   Flags.setFast();
1946
1947   Value *MulOp = Builder.CreateFMul(Cv, Step);
1948   if (isa<Instruction>(MulOp))
1949     // Have to check, MulOp may be a constant
1950     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1951
1952   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1953   if (isa<Instruction>(BOp))
1954     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1955   return BOp;
1956 }
1957
1958 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1959                                            Instruction *EntryVal,
1960                                            const InductionDescriptor &ID) {
1961   // We shouldn't have to build scalar steps if we aren't vectorizing.
1962   assert(VF > 1 && "VF should be greater than one");
1963
1964   // Get the value type and ensure it and the step have the same integer type.
1965   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1966   assert(ScalarIVTy == Step->getType() &&
1967          "Val and Step should have the same type");
1968
1969   // We build scalar steps for both integer and floating-point induction
1970   // variables. Here, we determine the kind of arithmetic we will perform.
1971   Instruction::BinaryOps AddOp;
1972   Instruction::BinaryOps MulOp;
1973   if (ScalarIVTy->isIntegerTy()) {
1974     AddOp = Instruction::Add;
1975     MulOp = Instruction::Mul;
1976   } else {
1977     AddOp = ID.getInductionOpcode();
1978     MulOp = Instruction::FMul;
1979   }
1980
1981   // Determine the number of scalars we need to generate for each unroll
1982   // iteration. If EntryVal is uniform, we only need to generate the first
1983   // lane. Otherwise, we generate all VF values.
1984   unsigned Lanes =
1985       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1986                                                                          : VF;
1987   // Compute the scalar steps and save the results in VectorLoopValueMap.
1988   for (unsigned Part = 0; Part < UF; ++Part) {
1989     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1990       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1991       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1992       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1993       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1994       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1995     }
1996   }
1997 }
1998
1999 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2000   assert(V != Induction && "The new induction variable should not be used.");
2001   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2002   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2003
2004   // If we have a stride that is replaced by one, do it here. Defer this for
2005   // the VPlan-native path until we start running Legal checks in that path.
2006   if (!EnableVPlanNativePath && Legal->hasStride(V))
2007     V = ConstantInt::get(V->getType(), 1);
2008
2009   // If we have a vector mapped to this value, return it.
2010   if (VectorLoopValueMap.hasVectorValue(V, Part))
2011     return VectorLoopValueMap.getVectorValue(V, Part);
2012
2013   // If the value has not been vectorized, check if it has been scalarized
2014   // instead. If it has been scalarized, and we actually need the value in
2015   // vector form, we will construct the vector values on demand.
2016   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2017     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2018
2019     // If we've scalarized a value, that value should be an instruction.
2020     auto *I = cast<Instruction>(V);
2021
2022     // If we aren't vectorizing, we can just copy the scalar map values over to
2023     // the vector map.
2024     if (VF == 1) {
2025       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2026       return ScalarValue;
2027     }
2028
2029     // Get the last scalar instruction we generated for V and Part. If the value
2030     // is known to be uniform after vectorization, this corresponds to lane zero
2031     // of the Part unroll iteration. Otherwise, the last instruction is the one
2032     // we created for the last vector lane of the Part unroll iteration.
2033     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2034     auto *LastInst = cast<Instruction>(
2035         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2036
2037     // Set the insert point after the last scalarized instruction. This ensures
2038     // the insertelement sequence will directly follow the scalar definitions.
2039     auto OldIP = Builder.saveIP();
2040     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2041     Builder.SetInsertPoint(&*NewIP);
2042
2043     // However, if we are vectorizing, we need to construct the vector values.
2044     // If the value is known to be uniform after vectorization, we can just
2045     // broadcast the scalar value corresponding to lane zero for each unroll
2046     // iteration. Otherwise, we construct the vector values using insertelement
2047     // instructions. Since the resulting vectors are stored in
2048     // VectorLoopValueMap, we will only generate the insertelements once.
2049     Value *VectorValue = nullptr;
2050     if (Cost->isUniformAfterVectorization(I, VF)) {
2051       VectorValue = getBroadcastInstrs(ScalarValue);
2052       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2053     } else {
2054       // Initialize packing with insertelements to start from undef.
2055       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2056       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2057       for (unsigned Lane = 0; Lane < VF; ++Lane)
2058         packScalarIntoVectorValue(V, {Part, Lane});
2059       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2060     }
2061     Builder.restoreIP(OldIP);
2062     return VectorValue;
2063   }
2064
2065   // If this scalar is unknown, assume that it is a constant or that it is
2066   // loop invariant. Broadcast V and save the value for future uses.
2067   Value *B = getBroadcastInstrs(V);
2068   VectorLoopValueMap.setVectorValue(V, Part, B);
2069   return B;
2070 }
2071
2072 Value *
2073 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2074                                             const VPIteration &Instance) {
2075   // If the value is not an instruction contained in the loop, it should
2076   // already be scalar.
2077   if (OrigLoop->isLoopInvariant(V))
2078     return V;
2079
2080   assert(Instance.Lane > 0
2081              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2082              : true && "Uniform values only have lane zero");
2083
2084   // If the value from the original loop has not been vectorized, it is
2085   // represented by UF x VF scalar values in the new loop. Return the requested
2086   // scalar value.
2087   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2088     return VectorLoopValueMap.getScalarValue(V, Instance);
2089
2090   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2091   // for the given unroll part. If this entry is not a vector type (i.e., the
2092   // vectorization factor is one), there is no need to generate an
2093   // extractelement instruction.
2094   auto *U = getOrCreateVectorValue(V, Instance.Part);
2095   if (!U->getType()->isVectorTy()) {
2096     assert(VF == 1 && "Value not scalarized has non-vector type");
2097     return U;
2098   }
2099
2100   // Otherwise, the value from the original loop has been vectorized and is
2101   // represented by UF vector values. Extract and return the requested scalar
2102   // value from the appropriate vector lane.
2103   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2104 }
2105
2106 void InnerLoopVectorizer::packScalarIntoVectorValue(
2107     Value *V, const VPIteration &Instance) {
2108   assert(V != Induction && "The new induction variable should not be used.");
2109   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2110   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2111
2112   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2113   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2114   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2115                                             Builder.getInt32(Instance.Lane));
2116   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2117 }
2118
2119 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2120   assert(Vec->getType()->isVectorTy() && "Invalid type");
2121   SmallVector<Constant *, 8> ShuffleMask;
2122   for (unsigned i = 0; i < VF; ++i)
2123     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2124
2125   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2126                                      ConstantVector::get(ShuffleMask),
2127                                      "reverse");
2128 }
2129
2130 // Return whether we allow using masked interleave-groups (for dealing with
2131 // strided loads/stores that reside in predicated blocks, or for dealing
2132 // with gaps).
2133 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2134   // If an override option has been passed in for interleaved accesses, use it.
2135   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2136     return EnableMaskedInterleavedMemAccesses;
2137
2138   return TTI.enableMaskedInterleavedAccessVectorization();
2139 }
2140
2141 // Try to vectorize the interleave group that \p Instr belongs to.
2142 //
2143 // E.g. Translate following interleaved load group (factor = 3):
2144 //   for (i = 0; i < N; i+=3) {
2145 //     R = Pic[i];             // Member of index 0
2146 //     G = Pic[i+1];           // Member of index 1
2147 //     B = Pic[i+2];           // Member of index 2
2148 //     ... // do something to R, G, B
2149 //   }
2150 // To:
2151 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2152 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2153 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2154 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2155 //
2156 // Or translate following interleaved store group (factor = 3):
2157 //   for (i = 0; i < N; i+=3) {
2158 //     ... do something to R, G, B
2159 //     Pic[i]   = R;           // Member of index 0
2160 //     Pic[i+1] = G;           // Member of index 1
2161 //     Pic[i+2] = B;           // Member of index 2
2162 //   }
2163 // To:
2164 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2165 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2166 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2167 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2168 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2169 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2170                                                    VPTransformState &State,
2171                                                    VPValue *Addr,
2172                                                    VPValue *BlockInMask) {
2173   const InterleaveGroup<Instruction> *Group =
2174       Cost->getInterleavedAccessGroup(Instr);
2175   assert(Group && "Fail to get an interleaved access group.");
2176
2177   // Skip if current instruction is not the insert position.
2178   if (Instr != Group->getInsertPos())
2179     return;
2180
2181   const DataLayout &DL = Instr->getModule()->getDataLayout();
2182
2183   // Prepare for the vector type of the interleaved load/store.
2184   Type *ScalarTy = getMemInstValueType(Instr);
2185   unsigned InterleaveFactor = Group->getFactor();
2186   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2187
2188   // Prepare for the new pointers.
2189   SmallVector<Value *, 2> AddrParts;
2190   unsigned Index = Group->getIndex(Instr);
2191
2192   // TODO: extend the masked interleaved-group support to reversed access.
2193   assert((!BlockInMask || !Group->isReverse()) &&
2194          "Reversed masked interleave-group not supported.");
2195
2196   // If the group is reverse, adjust the index to refer to the last vector lane
2197   // instead of the first. We adjust the index from the first vector lane,
2198   // rather than directly getting the pointer for lane VF - 1, because the
2199   // pointer operand of the interleaved access is supposed to be uniform. For
2200   // uniform instructions, we're only required to generate a value for the
2201   // first vector lane in each unroll iteration.
2202   if (Group->isReverse())
2203     Index += (VF - 1) * Group->getFactor();
2204
2205   for (unsigned Part = 0; Part < UF; Part++) {
2206     Value *AddrPart = State.get(Addr, {Part, 0});
2207     setDebugLocFromInst(Builder, AddrPart);
2208
2209     // Notice current instruction could be any index. Need to adjust the address
2210     // to the member of index 0.
2211     //
2212     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2213     //       b = A[i];       // Member of index 0
2214     // Current pointer is pointed to A[i+1], adjust it to A[i].
2215     //
2216     // E.g.  A[i+1] = a;     // Member of index 1
2217     //       A[i]   = b;     // Member of index 0
2218     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2219     // Current pointer is pointed to A[i+2], adjust it to A[i].
2220
2221     bool InBounds = false;
2222     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2223       InBounds = gep->isInBounds();
2224     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2225     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2226
2227     // Cast to the vector pointer type.
2228     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2229     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2230     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2231   }
2232
2233   setDebugLocFromInst(Builder, Instr);
2234   Value *UndefVec = UndefValue::get(VecTy);
2235
2236   Value *MaskForGaps = nullptr;
2237   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2238     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2239     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2240   }
2241
2242   // Vectorize the interleaved load group.
2243   if (isa<LoadInst>(Instr)) {
2244     // For each unroll part, create a wide load for the group.
2245     SmallVector<Value *, 2> NewLoads;
2246     for (unsigned Part = 0; Part < UF; Part++) {
2247       Instruction *NewLoad;
2248       if (BlockInMask || MaskForGaps) {
2249         assert(useMaskedInterleavedAccesses(*TTI) &&
2250                "masked interleaved groups are not allowed.");
2251         Value *GroupMask = MaskForGaps;
2252         if (BlockInMask) {
2253           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2254           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2255           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2256           Value *ShuffledMask = Builder.CreateShuffleVector(
2257               BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2258           GroupMask = MaskForGaps
2259                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2260                                                 MaskForGaps)
2261                           : ShuffledMask;
2262         }
2263         NewLoad =
2264             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(),
2265                                      GroupMask, UndefVec, "wide.masked.vec");
2266       }
2267       else
2268         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2269                                             Group->getAlignment(), "wide.vec");
2270       Group->addMetadata(NewLoad);
2271       NewLoads.push_back(NewLoad);
2272     }
2273
2274     // For each member in the group, shuffle out the appropriate data from the
2275     // wide loads.
2276     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2277       Instruction *Member = Group->getMember(I);
2278
2279       // Skip the gaps in the group.
2280       if (!Member)
2281         continue;
2282
2283       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2284       for (unsigned Part = 0; Part < UF; Part++) {
2285         Value *StridedVec = Builder.CreateShuffleVector(
2286             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2287
2288         // If this member has different type, cast the result type.
2289         if (Member->getType() != ScalarTy) {
2290           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2291           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2292         }
2293
2294         if (Group->isReverse())
2295           StridedVec = reverseVector(StridedVec);
2296
2297         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2298       }
2299     }
2300     return;
2301   }
2302
2303   // The sub vector type for current instruction.
2304   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2305
2306   // Vectorize the interleaved store group.
2307   for (unsigned Part = 0; Part < UF; Part++) {
2308     // Collect the stored vector from each member.
2309     SmallVector<Value *, 4> StoredVecs;
2310     for (unsigned i = 0; i < InterleaveFactor; i++) {
2311       // Interleaved store group doesn't allow a gap, so each index has a member
2312       Instruction *Member = Group->getMember(i);
2313       assert(Member && "Fail to get a member from an interleaved store group");
2314
2315       Value *StoredVec = getOrCreateVectorValue(
2316           cast<StoreInst>(Member)->getValueOperand(), Part);
2317       if (Group->isReverse())
2318         StoredVec = reverseVector(StoredVec);
2319
2320       // If this member has different type, cast it to a unified type.
2321
2322       if (StoredVec->getType() != SubVT)
2323         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2324
2325       StoredVecs.push_back(StoredVec);
2326     }
2327
2328     // Concatenate all vectors into a wide vector.
2329     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2330
2331     // Interleave the elements in the wide vector.
2332     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2333     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2334                                               "interleaved.vec");
2335
2336     Instruction *NewStoreInstr;
2337     if (BlockInMask) {
2338       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2339       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2340       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2341       Value *ShuffledMask = Builder.CreateShuffleVector(
2342           BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2343       NewStoreInstr = Builder.CreateMaskedStore(
2344           IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask);
2345     }
2346     else
2347       NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part],
2348                                                  Group->getAlignment());
2349
2350     Group->addMetadata(NewStoreInstr);
2351   }
2352 }
2353
2354 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2355                                                      VPTransformState &State,
2356                                                      VPValue *Addr,
2357                                                      VPValue *BlockInMask) {
2358   // Attempt to issue a wide load.
2359   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2360   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2361
2362   assert((LI || SI) && "Invalid Load/Store instruction");
2363
2364   LoopVectorizationCostModel::InstWidening Decision =
2365       Cost->getWideningDecision(Instr, VF);
2366   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2367          "CM decision should be taken at this point");
2368   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2369     return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
2370
2371   Type *ScalarDataTy = getMemInstValueType(Instr);
2372   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2373   // An alignment of 0 means target abi alignment. We need to use the scalar's
2374   // target abi alignment in such a case.
2375   const DataLayout &DL = Instr->getModule()->getDataLayout();
2376   const Align Alignment =
2377       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2378
2379   // Determine if the pointer operand of the access is either consecutive or
2380   // reverse consecutive.
2381   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2382   bool ConsecutiveStride =
2383       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2384   bool CreateGatherScatter =
2385       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2386
2387   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2388   // gather/scatter. Otherwise Decision should have been to Scalarize.
2389   assert((ConsecutiveStride || CreateGatherScatter) &&
2390          "The instruction should be scalarized");
2391   (void)ConsecutiveStride;
2392
2393   VectorParts BlockInMaskParts(UF);
2394   bool isMaskRequired = BlockInMask;
2395   if (isMaskRequired)
2396     for (unsigned Part = 0; Part < UF; ++Part)
2397       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2398
2399   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2400     // Calculate the pointer for the specific unroll-part.
2401     GetElementPtrInst *PartPtr = nullptr;
2402
2403     bool InBounds = false;
2404     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2405       InBounds = gep->isInBounds();
2406
2407     if (Reverse) {
2408       // If the address is consecutive but reversed, then the
2409       // wide store needs to start at the last vector element.
2410       PartPtr = cast<GetElementPtrInst>(
2411           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2412       PartPtr->setIsInBounds(InBounds);
2413       PartPtr = cast<GetElementPtrInst>(
2414           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2415       PartPtr->setIsInBounds(InBounds);
2416       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2417         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2418     } else {
2419       PartPtr = cast<GetElementPtrInst>(
2420           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2421       PartPtr->setIsInBounds(InBounds);
2422     }
2423
2424     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2425     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2426   };
2427
2428   // Handle Stores:
2429   if (SI) {
2430     setDebugLocFromInst(Builder, SI);
2431
2432     for (unsigned Part = 0; Part < UF; ++Part) {
2433       Instruction *NewSI = nullptr;
2434       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2435       if (CreateGatherScatter) {
2436         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2437         Value *VectorGep = State.get(Addr, Part);
2438         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
2439                                             Alignment.value(), MaskPart);
2440       } else {
2441         if (Reverse) {
2442           // If we store to reverse consecutive memory locations, then we need
2443           // to reverse the order of elements in the stored value.
2444           StoredVal = reverseVector(StoredVal);
2445           // We don't want to update the value in the map as it might be used in
2446           // another expression. So don't call resetVectorValue(StoredVal).
2447         }
2448         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2449         if (isMaskRequired)
2450           NewSI = Builder.CreateMaskedStore(
2451               StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]);
2452         else
2453           NewSI =
2454               Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
2455       }
2456       addMetadata(NewSI, SI);
2457     }
2458     return;
2459   }
2460
2461   // Handle loads.
2462   assert(LI && "Must have a load instruction");
2463   setDebugLocFromInst(Builder, LI);
2464   for (unsigned Part = 0; Part < UF; ++Part) {
2465     Value *NewLI;
2466     if (CreateGatherScatter) {
2467       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2468       Value *VectorGep = State.get(Addr, Part);
2469       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
2470                                          nullptr, "wide.masked.gather");
2471       addMetadata(NewLI, LI);
2472     } else {
2473       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2474       if (isMaskRequired)
2475         NewLI = Builder.CreateMaskedLoad(
2476             VecPtr, Alignment.value(), BlockInMaskParts[Part],
2477             UndefValue::get(DataTy), "wide.masked.load");
2478       else
2479         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
2480                                           "wide.load");
2481
2482       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2483       addMetadata(NewLI, LI);
2484       if (Reverse)
2485         NewLI = reverseVector(NewLI);
2486     }
2487     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2488   }
2489 }
2490
2491 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2492                                                const VPIteration &Instance,
2493                                                bool IfPredicateInstr) {
2494   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2495
2496   setDebugLocFromInst(Builder, Instr);
2497
2498   // Does this instruction return a value ?
2499   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2500
2501   Instruction *Cloned = Instr->clone();
2502   if (!IsVoidRetTy)
2503     Cloned->setName(Instr->getName() + ".cloned");
2504
2505   // Replace the operands of the cloned instructions with their scalar
2506   // equivalents in the new loop.
2507   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2508     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2509     Cloned->setOperand(op, NewOp);
2510   }
2511   addNewMetadata(Cloned, Instr);
2512
2513   // Place the cloned scalar in the new loop.
2514   Builder.Insert(Cloned);
2515
2516   // Add the cloned scalar to the scalar map entry.
2517   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2518
2519   // If we just cloned a new assumption, add it the assumption cache.
2520   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2521     if (II->getIntrinsicID() == Intrinsic::assume)
2522       AC->registerAssumption(II);
2523
2524   // End if-block.
2525   if (IfPredicateInstr)
2526     PredicatedInstructions.push_back(Cloned);
2527 }
2528
2529 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2530                                                       Value *End, Value *Step,
2531                                                       Instruction *DL) {
2532   BasicBlock *Header = L->getHeader();
2533   BasicBlock *Latch = L->getLoopLatch();
2534   // As we're just creating this loop, it's possible no latch exists
2535   // yet. If so, use the header as this will be a single block loop.
2536   if (!Latch)
2537     Latch = Header;
2538
2539   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2540   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2541   setDebugLocFromInst(Builder, OldInst);
2542   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2543
2544   Builder.SetInsertPoint(Latch->getTerminator());
2545   setDebugLocFromInst(Builder, OldInst);
2546
2547   // Create i+1 and fill the PHINode.
2548   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2549   Induction->addIncoming(Start, L->getLoopPreheader());
2550   Induction->addIncoming(Next, Latch);
2551   // Create the compare.
2552   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2553   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2554
2555   // Now we have two terminators. Remove the old one from the block.
2556   Latch->getTerminator()->eraseFromParent();
2557
2558   return Induction;
2559 }
2560
2561 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2562   if (TripCount)
2563     return TripCount;
2564
2565   assert(L && "Create Trip Count for null loop.");
2566   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2567   // Find the loop boundaries.
2568   ScalarEvolution *SE = PSE.getSE();
2569   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2570   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2571          "Invalid loop count");
2572
2573   Type *IdxTy = Legal->getWidestInductionType();
2574   assert(IdxTy && "No type for induction");
2575
2576   // The exit count might have the type of i64 while the phi is i32. This can
2577   // happen if we have an induction variable that is sign extended before the
2578   // compare. The only way that we get a backedge taken count is that the
2579   // induction variable was signed and as such will not overflow. In such a case
2580   // truncation is legal.
2581   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2582       IdxTy->getPrimitiveSizeInBits())
2583     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2584   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2585
2586   // Get the total trip count from the count by adding 1.
2587   const SCEV *ExitCount = SE->getAddExpr(
2588       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2589
2590   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2591
2592   // Expand the trip count and place the new instructions in the preheader.
2593   // Notice that the pre-header does not change, only the loop body.
2594   SCEVExpander Exp(*SE, DL, "induction");
2595
2596   // Count holds the overall loop count (N).
2597   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2598                                 L->getLoopPreheader()->getTerminator());
2599
2600   if (TripCount->getType()->isPointerTy())
2601     TripCount =
2602         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2603                                     L->getLoopPreheader()->getTerminator());
2604
2605   return TripCount;
2606 }
2607
2608 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2609   if (VectorTripCount)
2610     return VectorTripCount;
2611
2612   Value *TC = getOrCreateTripCount(L);
2613   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2614
2615   Type *Ty = TC->getType();
2616   Constant *Step = ConstantInt::get(Ty, VF * UF);
2617
2618   // If the tail is to be folded by masking, round the number of iterations N
2619   // up to a multiple of Step instead of rounding down. This is done by first
2620   // adding Step-1 and then rounding down. Note that it's ok if this addition
2621   // overflows: the vector induction variable will eventually wrap to zero given
2622   // that it starts at zero and its Step is a power of two; the loop will then
2623   // exit, with the last early-exit vector comparison also producing all-true.
2624   if (Cost->foldTailByMasking()) {
2625     assert(isPowerOf2_32(VF * UF) &&
2626            "VF*UF must be a power of 2 when folding tail by masking");
2627     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2628   }
2629
2630   // Now we need to generate the expression for the part of the loop that the
2631   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2632   // iterations are not required for correctness, or N - Step, otherwise. Step
2633   // is equal to the vectorization factor (number of SIMD elements) times the
2634   // unroll factor (number of SIMD instructions).
2635   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2636
2637   // If there is a non-reversed interleaved group that may speculatively access
2638   // memory out-of-bounds, we need to ensure that there will be at least one
2639   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2640   // the trip count, we set the remainder to be equal to the step. If the step
2641   // does not evenly divide the trip count, no adjustment is necessary since
2642   // there will already be scalar iterations. Note that the minimum iterations
2643   // check ensures that N >= Step.
2644   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2645     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2646     R = Builder.CreateSelect(IsZero, Step, R);
2647   }
2648
2649   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2650
2651   return VectorTripCount;
2652 }
2653
2654 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2655                                                    const DataLayout &DL) {
2656   // Verify that V is a vector type with same number of elements as DstVTy.
2657   unsigned VF = DstVTy->getNumElements();
2658   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2659   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2660   Type *SrcElemTy = SrcVecTy->getElementType();
2661   Type *DstElemTy = DstVTy->getElementType();
2662   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2663          "Vector elements must have same size");
2664
2665   // Do a direct cast if element types are castable.
2666   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2667     return Builder.CreateBitOrPointerCast(V, DstVTy);
2668   }
2669   // V cannot be directly casted to desired vector type.
2670   // May happen when V is a floating point vector but DstVTy is a vector of
2671   // pointers or vice-versa. Handle this using a two-step bitcast using an
2672   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2673   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2674          "Only one type should be a pointer type");
2675   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2676          "Only one type should be a floating point type");
2677   Type *IntTy =
2678       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2679   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2680   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2681   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2682 }
2683
2684 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2685                                                          BasicBlock *Bypass) {
2686   Value *Count = getOrCreateTripCount(L);
2687   // Reuse existing vector loop preheader for TC checks.
2688   // Note that new preheader block is generated for vector loop.
2689   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2690   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2691
2692   // Generate code to check if the loop's trip count is less than VF * UF, or
2693   // equal to it in case a scalar epilogue is required; this implies that the
2694   // vector trip count is zero. This check also covers the case where adding one
2695   // to the backedge-taken count overflowed leading to an incorrect trip count
2696   // of zero. In this case we will also jump to the scalar loop.
2697   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2698                                           : ICmpInst::ICMP_ULT;
2699
2700   // If tail is to be folded, vector loop takes care of all iterations.
2701   Value *CheckMinIters = Builder.getFalse();
2702   if (!Cost->foldTailByMasking())
2703     CheckMinIters = Builder.CreateICmp(
2704         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2705         "min.iters.check");
2706
2707   // Create new preheader for vector loop.
2708   LoopVectorPreHeader =
2709       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2710                  "vector.ph");
2711
2712   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2713                                DT->getNode(Bypass)->getIDom()) &&
2714          "TC check is expected to dominate Bypass");
2715
2716   // Update dominator for Bypass & LoopExit.
2717   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2718   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2719
2720   ReplaceInstWithInst(
2721       TCCheckBlock->getTerminator(),
2722       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2723   LoopBypassBlocks.push_back(TCCheckBlock);
2724 }
2725
2726 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2727   // Reuse existing vector loop preheader for SCEV checks.
2728   // Note that new preheader block is generated for vector loop.
2729   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2730
2731   // Generate the code to check that the SCEV assumptions that we made.
2732   // We want the new basic block to start at the first instruction in a
2733   // sequence of instructions that form a check.
2734   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2735                    "scev.check");
2736   Value *SCEVCheck = Exp.expandCodeForPredicate(
2737       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2738
2739   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2740     if (C->isZero())
2741       return;
2742
2743   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2744          "Cannot SCEV check stride or overflow when optimizing for size");
2745
2746   SCEVCheckBlock->setName("vector.scevcheck");
2747   // Create new preheader for vector loop.
2748   LoopVectorPreHeader =
2749       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2750                  nullptr, "vector.ph");
2751
2752   // Update dominator only if this is first RT check.
2753   if (LoopBypassBlocks.empty()) {
2754     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2755     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2756   }
2757
2758   ReplaceInstWithInst(
2759       SCEVCheckBlock->getTerminator(),
2760       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2761   LoopBypassBlocks.push_back(SCEVCheckBlock);
2762   AddedSafetyChecks = true;
2763 }
2764
2765 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2766   // VPlan-native path does not do any analysis for runtime checks currently.
2767   if (EnableVPlanNativePath)
2768     return;
2769
2770   // Reuse existing vector loop preheader for runtime memory checks.
2771   // Note that new preheader block is generated for vector loop.
2772   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2773
2774   // Generate the code that checks in runtime if arrays overlap. We put the
2775   // checks into a separate block to make the more common case of few elements
2776   // faster.
2777   Instruction *FirstCheckInst;
2778   Instruction *MemRuntimeCheck;
2779   std::tie(FirstCheckInst, MemRuntimeCheck) =
2780       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2781   if (!MemRuntimeCheck)
2782     return;
2783
2784   if (MemCheckBlock->getParent()->hasOptSize()) {
2785     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2786            "Cannot emit memory checks when optimizing for size, unless forced "
2787            "to vectorize.");
2788     ORE->emit([&]() {
2789       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2790                                         L->getStartLoc(), L->getHeader())
2791              << "Code-size may be reduced by not forcing "
2792                 "vectorization, or by source-code modifications "
2793                 "eliminating the need for runtime checks "
2794                 "(e.g., adding 'restrict').";
2795     });
2796   }
2797
2798   MemCheckBlock->setName("vector.memcheck");
2799   // Create new preheader for vector loop.
2800   LoopVectorPreHeader =
2801       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2802                  "vector.ph");
2803
2804   // Update dominator only if this is first RT check.
2805   if (LoopBypassBlocks.empty()) {
2806     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2807     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2808   }
2809
2810   ReplaceInstWithInst(
2811       MemCheckBlock->getTerminator(),
2812       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2813   LoopBypassBlocks.push_back(MemCheckBlock);
2814   AddedSafetyChecks = true;
2815
2816   // We currently don't use LoopVersioning for the actual loop cloning but we
2817   // still use it to add the noalias metadata.
2818   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2819                                           PSE.getSE());
2820   LVer->prepareNoAliasMetadata();
2821 }
2822
2823 Value *InnerLoopVectorizer::emitTransformedIndex(
2824     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2825     const InductionDescriptor &ID) const {
2826
2827   SCEVExpander Exp(*SE, DL, "induction");
2828   auto Step = ID.getStep();
2829   auto StartValue = ID.getStartValue();
2830   assert(Index->getType() == Step->getType() &&
2831          "Index type does not match StepValue type");
2832
2833   // Note: the IR at this point is broken. We cannot use SE to create any new
2834   // SCEV and then expand it, hoping that SCEV's simplification will give us
2835   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
2836   // lead to various SCEV crashes. So all we can do is to use builder and rely
2837   // on InstCombine for future simplifications. Here we handle some trivial
2838   // cases only.
2839   auto CreateAdd = [&B](Value *X, Value *Y) {
2840     assert(X->getType() == Y->getType() && "Types don't match!");
2841     if (auto *CX = dyn_cast<ConstantInt>(X))
2842       if (CX->isZero())
2843         return Y;
2844     if (auto *CY = dyn_cast<ConstantInt>(Y))
2845       if (CY->isZero())
2846         return X;
2847     return B.CreateAdd(X, Y);
2848   };
2849
2850   auto CreateMul = [&B](Value *X, Value *Y) {
2851     assert(X->getType() == Y->getType() && "Types don't match!");
2852     if (auto *CX = dyn_cast<ConstantInt>(X))
2853       if (CX->isOne())
2854         return Y;
2855     if (auto *CY = dyn_cast<ConstantInt>(Y))
2856       if (CY->isOne())
2857         return X;
2858     return B.CreateMul(X, Y);
2859   };
2860
2861   switch (ID.getKind()) {
2862   case InductionDescriptor::IK_IntInduction: {
2863     assert(Index->getType() == StartValue->getType() &&
2864            "Index type does not match StartValue type");
2865     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2866       return B.CreateSub(StartValue, Index);
2867     auto *Offset = CreateMul(
2868         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2869     return CreateAdd(StartValue, Offset);
2870   }
2871   case InductionDescriptor::IK_PtrInduction: {
2872     assert(isa<SCEVConstant>(Step) &&
2873            "Expected constant step for pointer induction");
2874     return B.CreateGEP(
2875         StartValue->getType()->getPointerElementType(), StartValue,
2876         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2877                                            &*B.GetInsertPoint())));
2878   }
2879   case InductionDescriptor::IK_FpInduction: {
2880     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2881     auto InductionBinOp = ID.getInductionBinOp();
2882     assert(InductionBinOp &&
2883            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2884             InductionBinOp->getOpcode() == Instruction::FSub) &&
2885            "Original bin op should be defined for FP induction");
2886
2887     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2888
2889     // Floating point operations had to be 'fast' to enable the induction.
2890     FastMathFlags Flags;
2891     Flags.setFast();
2892
2893     Value *MulExp = B.CreateFMul(StepValue, Index);
2894     if (isa<Instruction>(MulExp))
2895       // We have to check, the MulExp may be a constant.
2896       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2897
2898     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2899                                "induction");
2900     if (isa<Instruction>(BOp))
2901       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2902
2903     return BOp;
2904   }
2905   case InductionDescriptor::IK_NoInduction:
2906     return nullptr;
2907   }
2908   llvm_unreachable("invalid enum");
2909 }
2910
/// Build the control-flow skeleton that will host the vectorized loop and
/// rewire the original loop so it runs as the scalar remainder.
///
/// In order, this function:
///  - splits the original preheader to create "middle.block", "scalar.ph" and
///    "vector.body", and registers a new Loop containing "vector.body";
///  - emits the minimum-iteration, SCEV and memory runtime checks, each of
///    which branches to the scalar preheader as its bypass target;
///  - creates the vector loop induction variable, stepping by VF * UF from 0
///    up to the vector trip count;
///  - gives every induction PHI of the scalar loop a "bc.resume.val" PHI in
///    the scalar preheader, merging the end value coming from the middle
///    block with the original start value coming from each bypass block;
///  - terminates the middle block with a branch to the exit block or to the
///    scalar preheader;
///  - positions Builder at the first insertion point of the vector body and
///    sets the loop ID of the new loop.
///
/// \returns LoopVectorPreHeader, the preheader of the new vector loop.
2911 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2912   /*
2913    In this function we generate a new loop. The new loop will contain
2914    the vectorized instructions while the old loop will continue to run the
2915    scalar remainder.
2916
2917        [ ] <-- loop iteration number check.
2918     /   |
2919    /    v
2920   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2921   |  /  |
2922   | /   v
2923   ||   [ ]     <-- vector pre header.
2924   |/    |
2925   |     v
2926   |    [  ] \
2927   |    [  ]_|   <-- vector loop.
2928   |     |
2929   |     v
2930   |   -[ ]   <--- middle-block.
2931   |  /  |
2932   | /   v
2933   -|- >[ ]     <--- new preheader.
2934    |    |
2935    |    v
2936    |   [ ] \
2937    |   [ ]_|   <-- old scalar loop to handle remainder.
2938     \   |
2939      \  v
2940       >[ ]     <-- exit block.
2941    ...
2942    */
2943
       // Remember the original loop ID; it is used at the end of this function
       // to derive the loop ID of the vector loop.
2944   MDNode *OrigLoopID = OrigLoop->getLoopID();
2945
2946   // Some loops have a single integer induction variable, while other loops
2947   // don't. One example is c++ iterators that often have multiple pointer
2948   // induction variables. In the code below we also support a case where we
2949   // don't have a single induction variable.
2950   //
2951   // We try to obtain an induction variable from the original loop as hard
2952   // as possible. However if we don't find one that:
2953   //   - is an integer
2954   //   - counts from zero, stepping by one
2955   //   - is the size of the widest induction variable type
2956   // then we create a new one.
2957   OldInduction = Legal->getPrimaryInduction();
2958   Type *IdxTy = Legal->getWidestInductionType();
2959
2960   // Split the single block loop into the two loop structure described above.
2961   LoopScalarBody = OrigLoop->getHeader();
2962   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2963   LoopExitBlock = OrigLoop->getExitBlock();
2964   assert(LoopExitBlock && "Must have an exit block");
2965   assert(LoopVectorPreHeader && "Invalid loop structure");
2966
       // Each split is made at the terminator of the block being split, in this
       // order: preheader -> "middle.block", "middle.block" -> "scalar.ph", and
       // finally preheader -> "vector.body".
2967   LoopMiddleBlock =
2968       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2969                  LI, nullptr, "middle.block");
2970   LoopScalarPreHeader =
2971       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2972                  nullptr, "scalar.ph");
2973   // We intentionally don't let SplitBlock to update LoopInfo since
2974   // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
2975   // LoopVectorBody is explicitly added to the correct place few lines later.
2976   LoopVectorBody =
2977       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2978                  nullptr, nullptr, "vector.body");
2979
2980   // Update dominator for loop exit.
2981   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2982
2983   // Create and register the new vector loop.
2984   Loop *Lp = LI->AllocateLoop();
2985   Loop *ParentLoop = OrigLoop->getParentLoop();
2986
2987   // Insert the new loop into the loop nest and register the new basic blocks
2988   // before calling any utilities such as SCEV that require valid LoopInfo.
2989   if (ParentLoop) {
2990     ParentLoop->addChildLoop(Lp);
2991   } else {
2992     LI->addTopLevelLoop(Lp);
2993   }
2994   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
2995
2996   // Find the loop boundaries.
2997   Value *Count = getOrCreateTripCount(Lp);
2998
2999   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3000
3001   // Now, compare the new count to zero. If it is zero skip the vector loop and
3002   // jump to the scalar loop. This check also covers the case where the
3003   // backedge-taken count is uint##_max: adding one to it will overflow leading
3004   // to an incorrect trip count of zero. In this (rare) case we will also jump
3005   // to the scalar loop.
3006   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3007
3008   // Generate the code to check any assumptions that we've made for SCEV
3009   // expressions.
3010   emitSCEVChecks(Lp, LoopScalarPreHeader);
3011
3012   // Generate the code that checks in runtime if arrays overlap. We put the
3013   // checks into a separate block to make the more common case of few elements
3014   // faster.
3015   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3016
3017   // Generate the induction variable.
3018   // The loop step is equal to the vectorization factor (num of SIMD elements)
3019   // times the unroll factor (num of SIMD instructions).
3020   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3021   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3022   Induction =
3023       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3024                               getDebugLocFromInstOrOperands(OldInduction));
3025
3026   // We are going to resume the execution of the scalar loop.
3027   // Go over all of the induction variables that we found and fix the
3028   // PHIs that are left in the scalar version of the loop.
3029   // The starting values of PHI nodes depend on the counter of the last
3030   // iteration in the vectorized loop.
3031   // If we come from a bypass edge then we need to start from the original
3032   // start value.
3033
3034   // This variable saves the new starting index for the scalar loop. It is used
3035   // to test if there are any tail iterations left once the vector loop has
3036   // completed.
       // NOTE(review): the comment above does not describe the declaration that
       // follows it; `List` is simply the set of inductions found by legality.
       // The comment looks stale - confirm and reword.
3037   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3038   for (auto &InductionEntry : *List) {
3039     PHINode *OrigPhi = InductionEntry.first;
3040     InductionDescriptor II = InductionEntry.second;
3041
3042     // Create phi nodes to merge from the  backedge-taken check block.
3043     PHINode *BCResumeVal =
3044         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3045                         LoopScalarPreHeader->getTerminator());
3046     // Copy original phi DL over to the new one.
3047     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
         // EndValue is a reference into IVEndValues, so the assignments below
         // also record the end value for this PHI in that map.
3048     Value *&EndValue = IVEndValues[OrigPhi];
3049     if (OrigPhi == OldInduction) {
3050       // We know what the end value is.
3051       EndValue = CountRoundDown;
3052     } else {
           // Any other induction: convert the vector trip count to the type of
           // the step (both sides treated as signed) and compute the value of
           // the induction at that index in the vector loop preheader.
3053       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3054       Type *StepType = II.getStep()->getType();
3055       Instruction::CastOps CastOp =
3056           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3057       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3058       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3059       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3060       EndValue->setName("ind.end");
3061     }
3062
3063     // The new PHI merges the original incoming value, in case of a bypass,
3064     // or the value at the end of the vectorized loop.
3065     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3066
3067     // Fix the scalar body counter (PHI node).
3068     // The old induction's phi node in the scalar body needs the truncated
3069     // value.
         // Every block in LoopBypassBlocks contributes the original start value.
3070     for (BasicBlock *BB : LoopBypassBlocks)
3071       BCResumeVal->addIncoming(II.getStartValue(), BB);
3072     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3073   }
3074
3075   // We need the OrigLoop (scalar loop part) latch terminator to help
3076   // produce correct debug info for the middle block BB instructions.
3077   // The legality check stage guarantees that the loop will have a single
3078   // latch.
3079   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3080          "Scalar loop latch terminator isn't a branch");
3081   BranchInst *ScalarLatchBr =
3082       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3083
3084   // Add a check in the middle block to see if we have completed
3085   // all of the iterations in the first vector loop.
3086   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3087   // If tail is to be folded, we know we don't need to run the remainder.
3088   Value *CmpN = Builder.getTrue();
3089   if (!Cost->foldTailByMasking()) {
3090     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3091                            CountRoundDown, "cmp.n",
3092                            LoopMiddleBlock->getTerminator());
3093
3094     // Here we use the same DebugLoc as the scalar loop latch branch instead
3095     // of the corresponding compare because they may have ended up with
3096     // different line numbers and we want to avoid awkward line stepping while
3097     // debugging. Eg. if the compare has got a line number inside the loop.
3098     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3099   }
3100
       // When CmpN is true control leaves through the exit block; otherwise it
       // continues into the scalar preheader.
3101   BranchInst *BrInst =
3102       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3103   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3104   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3105
3106   // Get ready to start creating new instructions into the vectorized body.
3107   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3108          "Inconsistent vector loop preheader");
3109   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3110
3111   Optional<MDNode *> VectorizedLoopID =
3112       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3113                                       LLVMLoopVectorizeFollowupVectorized});
3114   if (VectorizedLoopID.hasValue()) {
3115     Lp->setLoopID(VectorizedLoopID.getValue());
3116
3117     // Do not setAlreadyVectorized if loop attributes have been defined
3118     // explicitly.
3119     return LoopVectorPreHeader;
3120   }
3121
3122   // Keep all loop hints from the original loop on the vector loop (we'll
3123   // replace the vectorizer-specific hints below).
3124   if (MDNode *LID = OrigLoop->getLoopID())
3125     Lp->setLoopID(LID);
3126
3127   LoopVectorizeHints Hints(Lp, true, *ORE);
3128   Hints.setAlreadyVectorized();
3129
       // Note: the early return above (explicit follow-up loop ID) does not
       // reach this verification.
3130 #ifdef EXPENSIVE_CHECKS
3131   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3132   LI->verify(*DT);
3133 #endif
3134
3135   return LoopVectorPreHeader;
3136 }
3137
3138 // Fix up external users of the induction variable. At this point, we are
3139 // in LCSSA form, with all external PHIs that use the IV having one input value,
3140 // coming from the remainder loop. We need those PHIs to also have a correct
3141 // value for the IV when arriving directly from the middle block.
3142 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3143                                        const InductionDescriptor &II,
3144                                        Value *CountRoundDown, Value *EndValue,
3145                                        BasicBlock *MiddleBlock) {
3146   // There are two kinds of external IV usages - those that use the value
3147   // computed in the last iteration (the PHI) and those that use the penultimate
3148   // value (the value that feeds into the phi from the loop latch).
3149   // We allow both, but they, obviously, have different values.
3150
3151   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3152
3153   DenseMap<Value *, Value *> MissingVals;
3154
3155   // An external user of the last iteration's value should see the value that
3156   // the remainder loop uses to initialize its own IV.
3157   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3158   for (User *U : PostInc->users()) {
3159     Instruction *UI = cast<Instruction>(U);
3160     if (!OrigLoop->contains(UI)) {
3161       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3162       MissingVals[UI] = EndValue;
3163     }
3164   }
3165
3166   // An external user of the penultimate value need to see EndValue - Step.
3167   // The simplest way to get this is to recompute it from the constituent SCEVs,
3168   // that is Start + (Step * (CRD - 1)).
3169   for (User *U : OrigPhi->users()) {
3170     auto *UI = cast<Instruction>(U);
3171     if (!OrigLoop->contains(UI)) {
3172       const DataLayout &DL =
3173           OrigLoop->getHeader()->getModule()->getDataLayout();
3174       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3175
3176       IRBuilder<> B(MiddleBlock->getTerminator());
3177       Value *CountMinusOne = B.CreateSub(
3178           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3179       Value *CMO =
3180           !II.getStep()->getType()->isIntegerTy()
3181               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3182                              II.getStep()->getType())
3183               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3184       CMO->setName("cast.cmo");
3185       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3186       Escape->setName("ind.escape");
3187       MissingVals[UI] = Escape;
3188     }
3189   }
3190
3191   for (auto &I : MissingVals) {
3192     PHINode *PHI = cast<PHINode>(I.first);
3193     // One corner case we have to handle is two IVs "chasing" each-other,
3194     // that is %IV2 = phi [...], [ %IV1, %latch ]
3195     // In this case, if IV1 has an external use, we need to avoid adding both
3196     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3197     // don't already have an incoming value for the middle block.
3198     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3199       PHI->addIncoming(I.second, MiddleBlock);
3200   }
3201 }
3202
3203 namespace {
3204
3205 struct CSEDenseMapInfo {
3206   static bool canHandle(const Instruction *I) {
3207     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3208            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3209   }
3210
3211   static inline Instruction *getEmptyKey() {
3212     return DenseMapInfo<Instruction *>::getEmptyKey();
3213   }
3214
3215   static inline Instruction *getTombstoneKey() {
3216     return DenseMapInfo<Instruction *>::getTombstoneKey();
3217   }
3218
3219   static unsigned getHashValue(const Instruction *I) {
3220     assert(canHandle(I) && "Unknown instruction!");
3221     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3222                                                            I->value_op_end()));
3223   }
3224
3225   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3226     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3227         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3228       return LHS == RHS;
3229     return LHS->isIdenticalTo(RHS);
3230   }
3231 };
3232
3233 } // end anonymous namespace
3234
3235 ///Perform cse of induction variable instructions.
3236 static void cse(BasicBlock *BB) {
3237   // Perform simple cse.
3238   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3239   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3240     Instruction *In = &*I++;
3241
3242     if (!CSEDenseMapInfo::canHandle(In))
3243       continue;
3244
3245     // Check if we can replace this instruction with any of the
3246     // visited instructions.
3247     if (Instruction *V = CSEMap.lookup(In)) {
3248       In->replaceAllUsesWith(V);
3249       In->eraseFromParent();
3250       continue;
3251     }
3252
3253     CSEMap[In] = In;
3254   }
3255 }
3256
3257 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3258                                                        unsigned VF,
3259                                                        bool &NeedToScalarize) {
3260   Function *F = CI->getCalledFunction();
3261   StringRef FnName = CI->getCalledFunction()->getName();
3262   Type *ScalarRetTy = CI->getType();
3263   SmallVector<Type *, 4> Tys, ScalarTys;
3264   for (auto &ArgOp : CI->arg_operands())
3265     ScalarTys.push_back(ArgOp->getType());
3266
3267   // Estimate cost of scalarized vector call. The source operands are assumed
3268   // to be vectors, so we need to extract individual elements from there,
3269   // execute VF scalar calls, and then gather the result into the vector return
3270   // value.
3271   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3272   if (VF == 1)
3273     return ScalarCallCost;
3274
3275   // Compute corresponding vector type for return value and arguments.
3276   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3277   for (Type *ScalarTy : ScalarTys)
3278     Tys.push_back(ToVectorTy(ScalarTy, VF));
3279
3280   // Compute costs of unpacking argument values for the scalar calls and
3281   // packing the return values to a vector.
3282   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3283
3284   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
3285
3286   // If we can't emit a vector call for this function, then the currently found
3287   // cost is the cost we need to return.
3288   NeedToScalarize = true;
3289   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3290     return Cost;
3291
3292   // If the corresponding vector cost is cheaper, return its cost.
3293   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3294   if (VectorCallCost < Cost) {
3295     NeedToScalarize = false;
3296     return VectorCallCost;
3297   }
3298   return Cost;
3299 }
3300
3301 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3302                                                             unsigned VF) {
3303   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3304   assert(ID && "Expected intrinsic call!");
3305
3306   FastMathFlags FMF;
3307   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3308     FMF = FPMO->getFastMathFlags();
3309
3310   SmallVector<Value *, 4> Operands(CI->arg_operands());
3311   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3312 }
3313
3314 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3315   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3316   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3317   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3318 }
3319 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3320   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3321   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3322   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3323 }
3324
3325 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3326   // For every instruction `I` in MinBWs, truncate the operands, create a
3327   // truncated version of `I` and reextend its result. InstCombine runs
3328   // later and will remove any ext/trunc pairs.
3329   SmallPtrSet<Value *, 4> Erased;
3330   for (const auto &KV : Cost->getMinimalBitwidths()) {
3331     // If the value wasn't vectorized, we must maintain the original scalar
3332     // type. The absence of the value from VectorLoopValueMap indicates that it
3333     // wasn't vectorized.
3334     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3335       continue;
3336     for (unsigned Part = 0; Part < UF; ++Part) {
3337       Value *I = getOrCreateVectorValue(KV.first, Part);
3338       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3339           !isa<Instruction>(I))
3340         continue;
3341       Type *OriginalTy = I->getType();
3342       Type *ScalarTruncatedTy =
3343           IntegerType::get(OriginalTy->getContext(), KV.second);
3344       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3345                                           OriginalTy->getVectorNumElements());
3346       if (TruncatedTy == OriginalTy)
3347         continue;
3348
3349       IRBuilder<> B(cast<Instruction>(I));
3350       auto ShrinkOperand = [&](Value *V) -> Value * {
3351         if (auto *ZI = dyn_cast<ZExtInst>(V))
3352           if (ZI->getSrcTy() == TruncatedTy)
3353             return ZI->getOperand(0);
3354         return B.CreateZExtOrTrunc(V, TruncatedTy);
3355       };
3356
3357       // The actual instruction modification depends on the instruction type,
3358       // unfortunately.
3359       Value *NewI = nullptr;
3360       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3361         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3362                              ShrinkOperand(BO->getOperand(1)));
3363
3364         // Any wrapping introduced by shrinking this operation shouldn't be
3365         // considered undefined behavior. So, we can't unconditionally copy
3366         // arithmetic wrapping flags to NewI.
3367         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3368       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3369         NewI =
3370             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3371                          ShrinkOperand(CI->getOperand(1)));
3372       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3373         NewI = B.CreateSelect(SI->getCondition(),
3374                               ShrinkOperand(SI->getTrueValue()),
3375                               ShrinkOperand(SI->getFalseValue()));
3376       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3377         switch (CI->getOpcode()) {
3378         default:
3379           llvm_unreachable("Unhandled cast!");
3380         case Instruction::Trunc:
3381           NewI = ShrinkOperand(CI->getOperand(0));
3382           break;
3383         case Instruction::SExt:
3384           NewI = B.CreateSExtOrTrunc(
3385               CI->getOperand(0),
3386               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3387           break;
3388         case Instruction::ZExt:
3389           NewI = B.CreateZExtOrTrunc(
3390               CI->getOperand(0),
3391               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3392           break;
3393         }
3394       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3395         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3396         auto *O0 = B.CreateZExtOrTrunc(
3397             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3398         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3399         auto *O1 = B.CreateZExtOrTrunc(
3400             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3401
3402         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3403       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3404         // Don't do anything with the operands, just extend the result.
3405         continue;
3406       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3407         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3408         auto *O0 = B.CreateZExtOrTrunc(
3409             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3410         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3411         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3412       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3413         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3414         auto *O0 = B.CreateZExtOrTrunc(
3415             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3416         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3417       } else {
3418         // If we don't know what to do, be conservative and don't do anything.
3419         continue;
3420       }
3421
3422       // Lastly, extend the result.
3423       NewI->takeName(cast<Instruction>(I));
3424       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3425       I->replaceAllUsesWith(Res);
3426       cast<Instruction>(I)->eraseFromParent();
3427       Erased.insert(I);
3428       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3429     }
3430   }
3431
3432   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3433   for (const auto &KV : Cost->getMinimalBitwidths()) {
3434     // If the value wasn't vectorized, we must maintain the original scalar
3435     // type. The absence of the value from VectorLoopValueMap indicates that it
3436     // wasn't vectorized.
3437     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3438       continue;
3439     for (unsigned Part = 0; Part < UF; ++Part) {
3440       Value *I = getOrCreateVectorValue(KV.first, Part);
3441       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3442       if (Inst && Inst->use_empty()) {
3443         Value *NewI = Inst->getOperand(0);
3444         Inst->eraseFromParent();
3445         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3446       }
3447     }
3448   }
3449 }
3450
3451 void InnerLoopVectorizer::fixVectorizedLoop() {
3452   // Insert truncates and extends for any truncated instructions as hints to
3453   // InstCombine.
3454   if (VF > 1)
3455     truncateToMinimalBitwidths();
3456
3457   // Fix widened non-induction PHIs by setting up the PHI operands.
3458   if (OrigPHIsToFix.size()) {
3459     assert(EnableVPlanNativePath &&
3460            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3461     fixNonInductionPHIs();
3462   }
3463
3464   // At this point every instruction in the original loop is widened to a
3465   // vector form. Now we need to fix the recurrences in the loop. These PHI
3466   // nodes are currently empty because we did not want to introduce cycles.
3467   // This is the second stage of vectorizing recurrences.
3468   fixCrossIterationPHIs();
3469
3470   // Forget the original basic block.
3471   PSE.getSE()->forgetLoop(OrigLoop);
3472
3473   // Fix-up external users of the induction variables.
3474   for (auto &Entry : *Legal->getInductionVars())
3475     fixupIVUsers(Entry.first, Entry.second,
3476                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3477                  IVEndValues[Entry.first], LoopMiddleBlock);
3478
3479   fixLCSSAPHIs();
3480   for (Instruction *PI : PredicatedInstructions)
3481     sinkScalarOperands(&*PI);
3482
3483   // Remove redundant induction instructions.
3484   cse(LoopVectorBody);
3485 }
3486
3487 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3488   // In order to support recurrences we need to be able to vectorize Phi nodes.
3489   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3490   // stage #2: We now need to fix the recurrences by adding incoming edges to
3491   // the currently empty PHI nodes. At this point every instruction in the
3492   // original loop is widened to a vector form so we can use them to construct
3493   // the incoming edges.
3494   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3495     // Handle first-order recurrences and reductions that need to be fixed.
3496     if (Legal->isFirstOrderRecurrence(&Phi))
3497       fixFirstOrderRecurrence(&Phi);
3498     else if (Legal->isReductionVariable(&Phi))
3499       fixReduction(&Phi);
3500   }
3501 }
3502
3503 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3504   // This is the second phase of vectorizing first-order recurrences. An
3505   // overview of the transformation is described below. Suppose we have the
3506   // following loop.
3507   //
3508   //   for (int i = 0; i < n; ++i)
3509   //     b[i] = a[i] - a[i - 1];
3510   //
3511   // There is a first-order recurrence on "a". For this loop, the shorthand
3512   // scalar IR looks like:
3513   //
3514   //   scalar.ph:
3515   //     s_init = a[-1]
3516   //     br scalar.body
3517   //
3518   //   scalar.body:
3519   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3520   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3521   //     s2 = a[i]
3522   //     b[i] = s2 - s1
3523   //     br cond, scalar.body, ...
3524   //
3525   // In this example, s1 is a recurrence because it's value depends on the
3526   // previous iteration. In the first phase of vectorization, we created a
3527   // temporary value for s1. We now complete the vectorization and produce the
3528   // shorthand vector IR shown below (for VF = 4, UF = 1).
3529   //
3530   //   vector.ph:
3531   //     v_init = vector(..., ..., ..., a[-1])
3532   //     br vector.body
3533   //
3534   //   vector.body
3535   //     i = phi [0, vector.ph], [i+4, vector.body]
3536   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3537   //     v2 = a[i, i+1, i+2, i+3];
3538   //     v3 = vector(v1(3), v2(0, 1, 2))
3539   //     b[i, i+1, i+2, i+3] = v2 - v3
3540   //     br cond, vector.body, middle.block
3541   //
3542   //   middle.block:
3543   //     x = v2(3)
3544   //     br scalar.ph
3545   //
3546   //   scalar.ph:
3547   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3548   //     br scalar.body
3549   //
3550   // After execution completes the vector loop, we extract the next value of
3551   // the recurrence (x) to use as the initial value in the scalar loop.
3552
3553   // Get the original loop preheader and single loop latch.
3554   auto *Preheader = OrigLoop->getLoopPreheader();
3555   auto *Latch = OrigLoop->getLoopLatch();
3556
3557   // Get the initial and previous values of the scalar recurrence.
3558   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3559   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3560
3561   // Create a vector from the initial value.
3562   auto *VectorInit = ScalarInit;
3563   if (VF > 1) {
3564     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3565     VectorInit = Builder.CreateInsertElement(
3566         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3567         Builder.getInt32(VF - 1), "vector.recur.init");
3568   }
3569
3570   // We constructed a temporary phi node in the first phase of vectorization.
3571   // This phi node will eventually be deleted.
3572   Builder.SetInsertPoint(
3573       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3574
3575   // Create a phi node for the new recurrence. The current value will either be
3576   // the initial value inserted into a vector or loop-varying vector value.
3577   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3578   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3579
3580   // Get the vectorized previous value of the last part UF - 1. It appears last
3581   // among all unrolled iterations, due to the order of their construction.
3582   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3583
3584   // Find and set the insertion point after the previous value if it is an
3585   // instruction.
3586   BasicBlock::iterator InsertPt;
3587   // Note that the previous value may have been constant-folded so it is not
3588   // guaranteed to be an instruction in the vector loop.
3589   // FIXME: Loop invariant values do not form recurrences. We should deal with
3590   //        them earlier.
3591   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3592     InsertPt = LoopVectorBody->getFirstInsertionPt();
3593   else {
3594     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3595     if (isa<PHINode>(PreviousLastPart))
3596       // If the previous value is a phi node, we should insert after all the phi
3597       // nodes in the block containing the PHI to avoid breaking basic block
3598       // verification. Note that the basic block may be different to
3599       // LoopVectorBody, in case we predicate the loop.
3600       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3601     else
3602       InsertPt = ++PreviousInst->getIterator();
3603   }
3604   Builder.SetInsertPoint(&*InsertPt);
3605
3606   // We will construct a vector for the recurrence by combining the values for
3607   // the current and previous iterations. This is the required shuffle mask.
3608   SmallVector<Constant *, 8> ShuffleMask(VF);
3609   ShuffleMask[0] = Builder.getInt32(VF - 1);
3610   for (unsigned I = 1; I < VF; ++I)
3611     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
3612
3613   // The vector from which to take the initial value for the current iteration
3614   // (actual or unrolled). Initially, this is the vector phi node.
3615   Value *Incoming = VecPhi;
3616
3617   // Shuffle the current and previous vector and update the vector parts.
3618   for (unsigned Part = 0; Part < UF; ++Part) {
3619     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3620     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3621     auto *Shuffle =
3622         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3623                                              ConstantVector::get(ShuffleMask))
3624                : Incoming;
3625     PhiPart->replaceAllUsesWith(Shuffle);
3626     cast<Instruction>(PhiPart)->eraseFromParent();
3627     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3628     Incoming = PreviousPart;
3629   }
3630
3631   // Fix the latch value of the new recurrence in the vector loop.
3632   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3633
3634   // Extract the last vector element in the middle block. This will be the
3635   // initial value for the recurrence when jumping to the scalar loop.
3636   auto *ExtractForScalar = Incoming;
3637   if (VF > 1) {
3638     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3639     ExtractForScalar = Builder.CreateExtractElement(
3640         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3641   }
3642   // Extract the second last element in the middle block if the
3643   // Phi is used outside the loop. We need to extract the phi itself
3644   // and not the last element (the phi update in the current iteration). This
3645   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3646   // when the scalar loop is not run at all.
3647   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3648   if (VF > 1)
3649     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3650         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3651   // When loop is unrolled without vectorizing, initialize
3652   // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
3653   // `Incoming`. This is analogous to the vectorized case above: extracting the
3654   // second last element when VF > 1.
3655   else if (UF > 1)
3656     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3657
3658   // Fix the initial value of the original recurrence in the scalar loop.
3659   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3660   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3661   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3662     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3663     Start->addIncoming(Incoming, BB);
3664   }
3665
3666   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3667   Phi->setName("scalar.recur");
3668
3669   // Finally, fix users of the recurrence outside the loop. The users will need
3670   // either the last value of the scalar recurrence or the last value of the
3671   // vector recurrence we extracted in the middle block. Since the loop is in
3672   // LCSSA form, we just need to find all the phi nodes for the original scalar
3673   // recurrence in the exit block, and then add an edge for the middle block.
3674   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3675     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3676       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3677     }
3678   }
3679 }
3680
3681 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3682   Constant *Zero = Builder.getInt32(0);
3683
3684   // Get it's reduction variable descriptor.
3685   assert(Legal->isReductionVariable(Phi) &&
3686          "Unable to find the reduction variable");
3687   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3688
3689   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3690   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3691   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3692   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3693     RdxDesc.getMinMaxRecurrenceKind();
3694   setDebugLocFromInst(Builder, ReductionStartValue);
3695
3696   // We need to generate a reduction vector from the incoming scalar.
3697   // To do so, we need to generate the 'identity' vector and override
3698   // one of the elements with the incoming scalar reduction. We need
3699   // to do it in the vector-loop preheader.
3700   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3701
3702   // This is the vector-clone of the value that leaves the loop.
3703   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3704
3705   // Find the reduction identity variable. Zero for addition, or, xor,
3706   // one for multiplication, -1 for And.
3707   Value *Identity;
3708   Value *VectorStart;
3709   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3710       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3711     // MinMax reduction have the start value as their identify.
3712     if (VF == 1) {
3713       VectorStart = Identity = ReductionStartValue;
3714     } else {
3715       VectorStart = Identity =
3716         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3717     }
3718   } else {
3719     // Handle other reduction kinds:
3720     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3721         RK, VecTy->getScalarType());
3722     if (VF == 1) {
3723       Identity = Iden;
3724       // This vector is the Identity vector where the first element is the
3725       // incoming scalar reduction.
3726       VectorStart = ReductionStartValue;
3727     } else {
3728       Identity = ConstantVector::getSplat(VF, Iden);
3729
3730       // This vector is the Identity vector where the first element is the
3731       // incoming scalar reduction.
3732       VectorStart =
3733         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3734     }
3735   }
3736
3737   // Wrap flags are in general invalid after vectorization, clear them.
3738   clearReductionWrapFlags(RdxDesc);
3739
3740   // Fix the vector-loop phi.
3741
3742   // Reductions do not have to start at zero. They can start with
3743   // any loop invariant values.
3744   BasicBlock *Latch = OrigLoop->getLoopLatch();
3745   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3746
3747   for (unsigned Part = 0; Part < UF; ++Part) {
3748     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3749     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3750     // Make sure to add the reduction start value only to the
3751     // first unroll part.
3752     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3753     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3754     cast<PHINode>(VecRdxPhi)
3755       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3756   }
3757
3758   // Before each round, move the insertion point right between
3759   // the PHIs and the values we are going to write.
3760   // This allows us to write both PHINodes and the extractelement
3761   // instructions.
3762   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3763
3764   setDebugLocFromInst(Builder, LoopExitInst);
3765
3766   // If tail is folded by masking, the vector value to leave the loop should be
3767   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3768   // instead of the former.
3769   if (Cost->foldTailByMasking()) {
3770     for (unsigned Part = 0; Part < UF; ++Part) {
3771       Value *VecLoopExitInst =
3772           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3773       Value *Sel = nullptr;
3774       for (User *U : VecLoopExitInst->users()) {
3775         if (isa<SelectInst>(U)) {
3776           assert(!Sel && "Reduction exit feeding two selects");
3777           Sel = U;
3778         } else
3779           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3780       }
3781       assert(Sel && "Reduction exit feeds no select");
3782       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3783     }
3784   }
3785
3786   // If the vector reduction can be performed in a smaller type, we truncate
3787   // then extend the loop exit value to enable InstCombine to evaluate the
3788   // entire expression in the smaller type.
3789   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3790     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3791     Builder.SetInsertPoint(
3792         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3793     VectorParts RdxParts(UF);
3794     for (unsigned Part = 0; Part < UF; ++Part) {
3795       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3796       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3797       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3798                                         : Builder.CreateZExt(Trunc, VecTy);
3799       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3800            UI != RdxParts[Part]->user_end();)
3801         if (*UI != Trunc) {
3802           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3803           RdxParts[Part] = Extnd;
3804         } else {
3805           ++UI;
3806         }
3807     }
3808     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3809     for (unsigned Part = 0; Part < UF; ++Part) {
3810       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3811       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3812     }
3813   }
3814
3815   // Reduce all of the unrolled parts into a single vector.
3816   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3817   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3818
3819   // The middle block terminator has already been assigned a DebugLoc here (the
3820   // OrigLoop's single latch terminator). We want the whole middle block to
3821   // appear to execute on this line because: (a) it is all compiler generated,
3822   // (b) these instructions are always executed after evaluating the latch
3823   // conditional branch, and (c) other passes may add new predecessors which
3824   // terminate on this line. This is the easiest way to ensure we don't
3825   // accidentally cause an extra step back into the loop while debugging.
3826   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3827   for (unsigned Part = 1; Part < UF; ++Part) {
3828     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3829     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3830       // Floating point operations had to be 'fast' to enable the reduction.
3831       ReducedPartRdx = addFastMathFlag(
3832           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3833                               ReducedPartRdx, "bin.rdx"),
3834           RdxDesc.getFastMathFlags());
3835     else
3836       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3837                                       RdxPart);
3838   }
3839
3840   if (VF > 1) {
3841     bool NoNaN = Legal->hasFunNoNaNAttr();
3842     ReducedPartRdx =
3843         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3844     // If the reduction can be performed in a smaller type, we need to extend
3845     // the reduction to the wider type before we branch to the original loop.
3846     if (Phi->getType() != RdxDesc.getRecurrenceType())
3847       ReducedPartRdx =
3848         RdxDesc.isSigned()
3849         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3850         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3851   }
3852
3853   // Create a phi node that merges control-flow from the backedge-taken check
3854   // block and the middle block.
3855   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3856                                         LoopScalarPreHeader->getTerminator());
3857   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3858     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3859   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3860
3861   // Now, we need to fix the users of the reduction variable
3862   // inside and outside of the scalar remainder loop.
3863   // We know that the loop is in LCSSA form. We need to update the
3864   // PHI nodes in the exit blocks.
3865   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3866     // All PHINodes need to have a single entry edge, or two if
3867     // we already fixed them.
3868     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3869
3870     // We found a reduction value exit-PHI. Update it with the
3871     // incoming bypass edge.
3872     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3873       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3874   } // end of the LCSSA phi scan.
3875
3876     // Fix the scalar loop reduction variable with the incoming reduction sum
3877     // from the vector body and from the backedge value.
3878   int IncomingEdgeBlockIdx =
3879     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3880   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3881   // Pick the other block.
3882   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3883   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3884   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3885 }
3886
3887 void InnerLoopVectorizer::clearReductionWrapFlags(
3888     RecurrenceDescriptor &RdxDesc) {
3889   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3890   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3891       RK != RecurrenceDescriptor::RK_IntegerMult)
3892     return;
3893
3894   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3895   assert(LoopExitInstr && "null loop exit instruction");
3896   SmallVector<Instruction *, 8> Worklist;
3897   SmallPtrSet<Instruction *, 8> Visited;
3898   Worklist.push_back(LoopExitInstr);
3899   Visited.insert(LoopExitInstr);
3900
3901   while (!Worklist.empty()) {
3902     Instruction *Cur = Worklist.pop_back_val();
3903     if (isa<OverflowingBinaryOperator>(Cur))
3904       for (unsigned Part = 0; Part < UF; ++Part) {
3905         Value *V = getOrCreateVectorValue(Cur, Part);
3906         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3907       }
3908
3909     for (User *U : Cur->users()) {
3910       Instruction *UI = cast<Instruction>(U);
3911       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3912           Visited.insert(UI).second)
3913         Worklist.push_back(UI);
3914     }
3915   }
3916 }
3917
3918 void InnerLoopVectorizer::fixLCSSAPHIs() {
3919   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3920     if (LCSSAPhi.getNumIncomingValues() == 1) {
3921       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3922       // Non-instruction incoming values will have only one value.
3923       unsigned LastLane = 0;
3924       if (isa<Instruction>(IncomingValue))
3925           LastLane = Cost->isUniformAfterVectorization(
3926                          cast<Instruction>(IncomingValue), VF)
3927                          ? 0
3928                          : VF - 1;
3929       // Can be a loop invariant incoming value or the last scalar value to be
3930       // extracted from the vectorized loop.
3931       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3932       Value *lastIncomingValue =
3933           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3934       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3935     }
3936   }
3937 }
3938
3939 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3940   // The basic block and loop containing the predicated instruction.
3941   auto *PredBB = PredInst->getParent();
3942   auto *VectorLoop = LI->getLoopFor(PredBB);
3943
3944   // Initialize a worklist with the operands of the predicated instruction.
3945   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3946
3947   // Holds instructions that we need to analyze again. An instruction may be
3948   // reanalyzed if we don't yet know if we can sink it or not.
3949   SmallVector<Instruction *, 8> InstsToReanalyze;
3950
3951   // Returns true if a given use occurs in the predicated block. Phi nodes use
3952   // their operands in their corresponding predecessor blocks.
3953   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3954     auto *I = cast<Instruction>(U.getUser());
3955     BasicBlock *BB = I->getParent();
3956     if (auto *Phi = dyn_cast<PHINode>(I))
3957       BB = Phi->getIncomingBlock(
3958           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3959     return BB == PredBB;
3960   };
3961
3962   // Iteratively sink the scalarized operands of the predicated instruction
3963   // into the block we created for it. When an instruction is sunk, it's
3964   // operands are then added to the worklist. The algorithm ends after one pass
3965   // through the worklist doesn't sink a single instruction.
3966   bool Changed;
3967   do {
3968     // Add the instructions that need to be reanalyzed to the worklist, and
3969     // reset the changed indicator.
3970     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3971     InstsToReanalyze.clear();
3972     Changed = false;
3973
3974     while (!Worklist.empty()) {
3975       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3976
3977       // We can't sink an instruction if it is a phi node, is already in the
3978       // predicated block, is not in the loop, or may have side effects.
3979       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3980           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3981         continue;
3982
3983       // It's legal to sink the instruction if all its uses occur in the
3984       // predicated block. Otherwise, there's nothing to do yet, and we may
3985       // need to reanalyze the instruction.
3986       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3987         InstsToReanalyze.push_back(I);
3988         continue;
3989       }
3990
3991       // Move the instruction to the beginning of the predicated block, and add
3992       // it's operands to the worklist.
3993       I->moveBefore(&*PredBB->getFirstInsertionPt());
3994       Worklist.insert(I->op_begin(), I->op_end());
3995
3996       // The sinking may have enabled other instructions to be sunk, so we will
3997       // need to iterate.
3998       Changed = true;
3999     }
4000   } while (Changed);
4001 }
4002
4003 void InnerLoopVectorizer::fixNonInductionPHIs() {
4004   for (PHINode *OrigPhi : OrigPHIsToFix) {
4005     PHINode *NewPhi =
4006         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4007     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4008
4009     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4010         predecessors(OrigPhi->getParent()));
4011     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4012         predecessors(NewPhi->getParent()));
4013     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4014            "Scalar and Vector BB should have the same number of predecessors");
4015
4016     // The insertion point in Builder may be invalidated by the time we get
4017     // here. Force the Builder insertion point to something valid so that we do
4018     // not run into issues during insertion point restore in
4019     // getOrCreateVectorValue calls below.
4020     Builder.SetInsertPoint(NewPhi);
4021
4022     // The predecessor order is preserved and we can rely on mapping between
4023     // scalar and vector block predecessors.
4024     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4025       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4026
4027       // When looking up the new scalar/vector values to fix up, use incoming
4028       // values from original phi.
4029       Value *ScIncV =
4030           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4031
4032       // Scalar incoming value may need a broadcast
4033       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4034       NewPhi->addIncoming(NewIncV, NewPredBB);
4035     }
4036   }
4037 }
4038
4039 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4040                                    unsigned VF, bool IsPtrLoopInvariant,
4041                                    SmallBitVector &IsIndexLoopInvariant) {
4042   // Construct a vector GEP by widening the operands of the scalar GEP as
4043   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4044   // results in a vector of pointers when at least one operand of the GEP
4045   // is vector-typed. Thus, to keep the representation compact, we only use
4046   // vector-typed operands for loop-varying values.
4047
4048   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4049     // If we are vectorizing, but the GEP has only loop-invariant operands,
4050     // the GEP we build (by only using vector-typed operands for
4051     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4052     // produce a vector of pointers, we need to either arbitrarily pick an
4053     // operand to broadcast, or broadcast a clone of the original GEP.
4054     // Here, we broadcast a clone of the original.
4055     //
4056     // TODO: If at some point we decide to scalarize instructions having
4057     //       loop-invariant operands, this special case will no longer be
4058     //       required. We would add the scalarization decision to
4059     //       collectLoopScalars() and teach getVectorValue() to broadcast
4060     //       the lane-zero scalar value.
4061     auto *Clone = Builder.Insert(GEP->clone());
4062     for (unsigned Part = 0; Part < UF; ++Part) {
4063       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4064       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4065       addMetadata(EntryPart, GEP);
4066     }
4067   } else {
4068     // If the GEP has at least one loop-varying operand, we are sure to
4069     // produce a vector of pointers. But if we are only unrolling, we want
4070     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4071     // produce with the code below will be scalar (if VF == 1) or vector
4072     // (otherwise). Note that for the unroll-only case, we still maintain
4073     // values in the vector mapping with initVector, as we do for other
4074     // instructions.
4075     for (unsigned Part = 0; Part < UF; ++Part) {
4076       // The pointer operand of the new GEP. If it's loop-invariant, we
4077       // won't broadcast it.
4078       auto *Ptr = IsPtrLoopInvariant
4079                       ? GEP->getPointerOperand()
4080                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4081
4082       // Collect all the indices for the new GEP. If any index is
4083       // loop-invariant, we won't broadcast it.
4084       SmallVector<Value *, 4> Indices;
4085       for (auto Index : enumerate(GEP->indices())) {
4086         Value *User = Index.value().get();
4087         if (IsIndexLoopInvariant[Index.index()])
4088           Indices.push_back(User);
4089         else
4090           Indices.push_back(getOrCreateVectorValue(User, Part));
4091       }
4092
4093       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4094       // but it should be a vector, otherwise.
4095       auto *NewGEP =
4096           GEP->isInBounds()
4097               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4098                                           Indices)
4099               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4100       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4101              "NewGEP is not a pointer vector");
4102       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4103       addMetadata(NewGEP, GEP);
4104     }
4105   }
4106 }
4107
4108 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4109                                               unsigned VF) {
4110   PHINode *P = cast<PHINode>(PN);
4111   if (EnableVPlanNativePath) {
4112     // Currently we enter here in the VPlan-native path for non-induction
4113     // PHIs where all control flow is uniform. We simply widen these PHIs.
4114     // Create a vector phi with no operands - the vector phi operands will be
4115     // set at the end of vector code generation.
4116     Type *VecTy =
4117         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4118     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4119     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4120     OrigPHIsToFix.push_back(P);
4121
4122     return;
4123   }
4124
4125   assert(PN->getParent() == OrigLoop->getHeader() &&
4126          "Non-header phis should have been handled elsewhere");
4127
4128   // In order to support recurrences we need to be able to vectorize Phi nodes.
4129   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4130   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4131   // this value when we vectorize all of the instructions that use the PHI.
4132   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4133     for (unsigned Part = 0; Part < UF; ++Part) {
4134       // This is phase one of vectorizing PHIs.
4135       Type *VecTy =
4136           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4137       Value *EntryPart = PHINode::Create(
4138           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4139       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4140     }
4141     return;
4142   }
4143
4144   setDebugLocFromInst(Builder, P);
4145
4146   // This PHINode must be an induction variable.
4147   // Make sure that we know about it.
4148   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4149
4150   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4151   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4152
4153   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4154   // which can be found from the original scalar operations.
4155   switch (II.getKind()) {
4156   case InductionDescriptor::IK_NoInduction:
4157     llvm_unreachable("Unknown induction");
4158   case InductionDescriptor::IK_IntInduction:
4159   case InductionDescriptor::IK_FpInduction:
4160     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4161   case InductionDescriptor::IK_PtrInduction: {
4162     // Handle the pointer induction variable case.
4163     assert(P->getType()->isPointerTy() && "Unexpected type.");
4164     // This is the normalized GEP that starts counting at zero.
4165     Value *PtrInd = Induction;
4166     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4167     // Determine the number of scalars we need to generate for each unroll
4168     // iteration. If the instruction is uniform, we only need to generate the
4169     // first lane. Otherwise, we generate all VF values.
4170     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4171     // These are the scalar results. Notice that we don't generate vector GEPs
4172     // because scalar GEPs result in better code.
4173     for (unsigned Part = 0; Part < UF; ++Part) {
4174       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4175         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4176         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4177         Value *SclrGep =
4178             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4179         SclrGep->setName("next.gep");
4180         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4181       }
4182     }
4183     return;
4184   }
4185   }
4186 }
4187
4188 /// A helper function for checking whether an integer division-related
4189 /// instruction may divide by zero (in which case it must be predicated if
4190 /// executed conditionally in the scalar code).
4191 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4192 /// Non-zero divisors that are non compile-time constants will not be
4193 /// converted into multiplication, so we will still end up scalarizing
4194 /// the division, but can do so w/o predication.
4195 static bool mayDivideByZero(Instruction &I) {
4196   assert((I.getOpcode() == Instruction::UDiv ||
4197           I.getOpcode() == Instruction::SDiv ||
4198           I.getOpcode() == Instruction::URem ||
4199           I.getOpcode() == Instruction::SRem) &&
4200          "Unexpected instruction");
4201   Value *Divisor = I.getOperand(1);
4202   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4203   return !CInt || CInt->isZero();
4204 }
4205
4206 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4207   switch (I.getOpcode()) {
4208   case Instruction::Br:
4209   case Instruction::PHI:
4210   case Instruction::GetElementPtr:
4211     llvm_unreachable("This instruction is handled by a different recipe.");
4212   case Instruction::UDiv:
4213   case Instruction::SDiv:
4214   case Instruction::SRem:
4215   case Instruction::URem:
4216   case Instruction::Add:
4217   case Instruction::FAdd:
4218   case Instruction::Sub:
4219   case Instruction::FSub:
4220   case Instruction::FNeg:
4221   case Instruction::Mul:
4222   case Instruction::FMul:
4223   case Instruction::FDiv:
4224   case Instruction::FRem:
4225   case Instruction::Shl:
4226   case Instruction::LShr:
4227   case Instruction::AShr:
4228   case Instruction::And:
4229   case Instruction::Or:
4230   case Instruction::Xor: {
4231     // Just widen unops and binops.
4232     setDebugLocFromInst(Builder, &I);
4233
4234     for (unsigned Part = 0; Part < UF; ++Part) {
4235       SmallVector<Value *, 2> Ops;
4236       for (Value *Op : I.operands())
4237         Ops.push_back(getOrCreateVectorValue(Op, Part));
4238
4239       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4240
4241       if (auto *VecOp = dyn_cast<Instruction>(V))
4242         VecOp->copyIRFlags(&I);
4243
4244       // Use this vector value for all users of the original instruction.
4245       VectorLoopValueMap.setVectorValue(&I, Part, V);
4246       addMetadata(V, &I);
4247     }
4248
4249     break;
4250   }
4251   case Instruction::Select: {
4252     // Widen selects.
4253     // If the selector is loop invariant we can create a select
4254     // instruction with a scalar condition. Otherwise, use vector-select.
4255     auto *SE = PSE.getSE();
4256     bool InvariantCond =
4257         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4258     setDebugLocFromInst(Builder, &I);
4259
4260     // The condition can be loop invariant  but still defined inside the
4261     // loop. This means that we can't just use the original 'cond' value.
4262     // We have to take the 'vectorized' value and pick the first lane.
4263     // Instcombine will make this a no-op.
4264
4265     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4266
4267     for (unsigned Part = 0; Part < UF; ++Part) {
4268       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4269       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4270       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4271       Value *Sel =
4272           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4273       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4274       addMetadata(Sel, &I);
4275     }
4276
4277     break;
4278   }
4279
4280   case Instruction::ICmp:
4281   case Instruction::FCmp: {
4282     // Widen compares. Generate vector compares.
4283     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4284     auto *Cmp = cast<CmpInst>(&I);
4285     setDebugLocFromInst(Builder, Cmp);
4286     for (unsigned Part = 0; Part < UF; ++Part) {
4287       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4288       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4289       Value *C = nullptr;
4290       if (FCmp) {
4291         // Propagate fast math flags.
4292         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4293         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4294         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4295       } else {
4296         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4297       }
4298       VectorLoopValueMap.setVectorValue(&I, Part, C);
4299       addMetadata(C, &I);
4300     }
4301
4302     break;
4303   }
4304
4305   case Instruction::ZExt:
4306   case Instruction::SExt:
4307   case Instruction::FPToUI:
4308   case Instruction::FPToSI:
4309   case Instruction::FPExt:
4310   case Instruction::PtrToInt:
4311   case Instruction::IntToPtr:
4312   case Instruction::SIToFP:
4313   case Instruction::UIToFP:
4314   case Instruction::Trunc:
4315   case Instruction::FPTrunc:
4316   case Instruction::BitCast: {
4317     auto *CI = cast<CastInst>(&I);
4318     setDebugLocFromInst(Builder, CI);
4319
4320     /// Vectorize casts.
4321     Type *DestTy =
4322         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4323
4324     for (unsigned Part = 0; Part < UF; ++Part) {
4325       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4326       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4327       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4328       addMetadata(Cast, &I);
4329     }
4330     break;
4331   }
4332
4333   case Instruction::Call: {
4334     // Ignore dbg intrinsics.
4335     if (isa<DbgInfoIntrinsic>(I))
4336       break;
4337     setDebugLocFromInst(Builder, &I);
4338
4339     Module *M = I.getParent()->getParent()->getParent();
4340     auto *CI = cast<CallInst>(&I);
4341
4342     StringRef FnName = CI->getCalledFunction()->getName();
4343     Function *F = CI->getCalledFunction();
4344     Type *RetTy = ToVectorTy(CI->getType(), VF);
4345     SmallVector<Type *, 4> Tys;
4346     for (Value *ArgOperand : CI->arg_operands())
4347       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4348
4349     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4350
4351     // The flag shows whether we use Intrinsic or a usual Call for vectorized
4352     // version of the instruction.
4353     // Is it beneficial to perform intrinsic call compared to lib call?
4354     bool NeedToScalarize;
4355     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4356     bool UseVectorIntrinsic =
4357         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4358     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4359            "Instruction should be scalarized elsewhere.");
4360
4361     for (unsigned Part = 0; Part < UF; ++Part) {
4362       SmallVector<Value *, 4> Args;
4363       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4364         Value *Arg = CI->getArgOperand(i);
4365         // Some intrinsics have a scalar argument - don't replace it with a
4366         // vector.
4367         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4368           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4369         Args.push_back(Arg);
4370       }
4371
4372       Function *VectorF;
4373       if (UseVectorIntrinsic) {
4374         // Use vector version of the intrinsic.
4375         Type *TysForDecl[] = {CI->getType()};
4376         if (VF > 1)
4377           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4378         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4379       } else {
4380         // Use vector version of the library call.
4381         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4382         assert(!VFnName.empty() && "Vector function name is empty.");
4383         VectorF = M->getFunction(VFnName);
4384         if (!VectorF) {
4385           // Generate a declaration
4386           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4387           VectorF =
4388               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4389           VectorF->copyAttributesFrom(F);
4390         }
4391       }
4392       assert(VectorF && "Can't create vector function.");
4393
4394       SmallVector<OperandBundleDef, 1> OpBundles;
4395       CI->getOperandBundlesAsDefs(OpBundles);
4396       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4397
4398       if (isa<FPMathOperator>(V))
4399         V->copyFastMathFlags(CI);
4400
4401       VectorLoopValueMap.setVectorValue(&I, Part, V);
4402       addMetadata(V, &I);
4403     }
4404
4405     break;
4406   }
4407
4408   default:
4409     // This instruction is not vectorized by simple widening.
4410     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4411     llvm_unreachable("Unhandled instruction!");
4412   } // end of switch.
4413 }
4414
4415 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4416   // We should not collect Scalars more than once per VF. Right now, this
4417   // function is called from collectUniformsAndScalars(), which already does
4418   // this check. Collecting Scalars for VF=1 does not make any sense.
4419   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4420          "This function should not be visited twice for the same VF");
4421
4422   SmallSetVector<Instruction *, 8> Worklist;
4423
4424   // These sets are used to seed the analysis with pointers used by memory
4425   // accesses that will remain scalar.
4426   SmallSetVector<Instruction *, 8> ScalarPtrs;
4427   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4428
4429   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4430   // The pointer operands of loads and stores will be scalar as long as the
4431   // memory access is not a gather or scatter operation. The value operand of a
4432   // store will remain scalar if the store is scalarized.
4433   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4434     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4435     assert(WideningDecision != CM_Unknown &&
4436            "Widening decision should be ready at this moment");
4437     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4438       if (Ptr == Store->getValueOperand())
4439         return WideningDecision == CM_Scalarize;
4440     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4441            "Ptr is neither a value or pointer operand");
4442     return WideningDecision != CM_GatherScatter;
4443   };
4444
4445   // A helper that returns true if the given value is a bitcast or
4446   // getelementptr instruction contained in the loop.
4447   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4448     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4449             isa<GetElementPtrInst>(V)) &&
4450            !TheLoop->isLoopInvariant(V);
4451   };
4452
4453   // A helper that evaluates a memory access's use of a pointer. If the use
4454   // will be a scalar use, and the pointer is only used by memory accesses, we
4455   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4456   // PossibleNonScalarPtrs.
4457   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4458     // We only care about bitcast and getelementptr instructions contained in
4459     // the loop.
4460     if (!isLoopVaryingBitCastOrGEP(Ptr))
4461       return;
4462
4463     // If the pointer has already been identified as scalar (e.g., if it was
4464     // also identified as uniform), there's nothing to do.
4465     auto *I = cast<Instruction>(Ptr);
4466     if (Worklist.count(I))
4467       return;
4468
4469     // If the use of the pointer will be a scalar use, and all users of the
4470     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4471     // place the pointer in PossibleNonScalarPtrs.
4472     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4473           return isa<LoadInst>(U) || isa<StoreInst>(U);
4474         }))
4475       ScalarPtrs.insert(I);
4476     else
4477       PossibleNonScalarPtrs.insert(I);
4478   };
4479
4480   // We seed the scalars analysis with three classes of instructions: (1)
4481   // instructions marked uniform-after-vectorization, (2) bitcast and
4482   // getelementptr instructions used by memory accesses requiring a scalar use,
4483   // and (3) pointer induction variables and their update instructions (we
4484   // currently only scalarize these).
4485   //
4486   // (1) Add to the worklist all instructions that have been identified as
4487   // uniform-after-vectorization.
4488   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4489
4490   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4491   // memory accesses requiring a scalar use. The pointer operands of loads and
4492   // stores will be scalar as long as the memory accesses is not a gather or
4493   // scatter operation. The value operand of a store will remain scalar if the
4494   // store is scalarized.
4495   for (auto *BB : TheLoop->blocks())
4496     for (auto &I : *BB) {
4497       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4498         evaluatePtrUse(Load, Load->getPointerOperand());
4499       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4500         evaluatePtrUse(Store, Store->getPointerOperand());
4501         evaluatePtrUse(Store, Store->getValueOperand());
4502       }
4503     }
4504   for (auto *I : ScalarPtrs)
4505     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4506       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4507       Worklist.insert(I);
4508     }
4509
4510   // (3) Add to the worklist all pointer induction variables and their update
4511   // instructions.
4512   //
4513   // TODO: Once we are able to vectorize pointer induction variables we should
4514   //       no longer insert them into the worklist here.
4515   auto *Latch = TheLoop->getLoopLatch();
4516   for (auto &Induction : *Legal->getInductionVars()) {
4517     auto *Ind = Induction.first;
4518     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4519     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4520       continue;
4521     Worklist.insert(Ind);
4522     Worklist.insert(IndUpdate);
4523     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4524     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4525                       << "\n");
4526   }
4527
4528   // Insert the forced scalars.
4529   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4530   // induction variable when the PHI user is scalarized.
4531   auto ForcedScalar = ForcedScalars.find(VF);
4532   if (ForcedScalar != ForcedScalars.end())
4533     for (auto *I : ForcedScalar->second)
4534       Worklist.insert(I);
4535
4536   // Expand the worklist by looking through any bitcasts and getelementptr
4537   // instructions we've already identified as scalar. This is similar to the
4538   // expansion step in collectLoopUniforms(); however, here we're only
4539   // expanding to include additional bitcasts and getelementptr instructions.
4540   unsigned Idx = 0;
4541   while (Idx != Worklist.size()) {
4542     Instruction *Dst = Worklist[Idx++];
4543     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4544       continue;
4545     auto *Src = cast<Instruction>(Dst->getOperand(0));
4546     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4547           auto *J = cast<Instruction>(U);
4548           return !TheLoop->contains(J) || Worklist.count(J) ||
4549                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4550                   isScalarUse(J, Src));
4551         })) {
4552       Worklist.insert(Src);
4553       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4554     }
4555   }
4556
4557   // An induction variable will remain scalar if all users of the induction
4558   // variable and induction variable update remain scalar.
4559   for (auto &Induction : *Legal->getInductionVars()) {
4560     auto *Ind = Induction.first;
4561     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4562
4563     // We already considered pointer induction variables, so there's no reason
4564     // to look at their users again.
4565     //
4566     // TODO: Once we are able to vectorize pointer induction variables we
4567     //       should no longer skip over them here.
4568     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4569       continue;
4570
4571     // Determine if all users of the induction variable are scalar after
4572     // vectorization.
4573     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4574       auto *I = cast<Instruction>(U);
4575       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4576     });
4577     if (!ScalarInd)
4578       continue;
4579
4580     // Determine if all users of the induction variable update instruction are
4581     // scalar after vectorization.
4582     auto ScalarIndUpdate =
4583         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4584           auto *I = cast<Instruction>(U);
4585           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4586         });
4587     if (!ScalarIndUpdate)
4588       continue;
4589
4590     // The induction variable and its update instruction will remain scalar.
4591     Worklist.insert(Ind);
4592     Worklist.insert(IndUpdate);
4593     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4594     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4595                       << "\n");
4596   }
4597
4598   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4599 }
4600
4601 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4602   if (!blockNeedsPredication(I->getParent()))
4603     return false;
4604   switch(I->getOpcode()) {
4605   default:
4606     break;
4607   case Instruction::Load:
4608   case Instruction::Store: {
4609     if (!Legal->isMaskRequired(I))
4610       return false;
4611     auto *Ptr = getLoadStorePointerOperand(I);
4612     auto *Ty = getMemInstValueType(I);
4613     // We have already decided how to vectorize this instruction, get that
4614     // result.
4615     if (VF > 1) {
4616       InstWidening WideningDecision = getWideningDecision(I, VF);
4617       assert(WideningDecision != CM_Unknown &&
4618              "Widening decision should be ready at this moment");
4619       return WideningDecision == CM_Scalarize;
4620     }
4621     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4622     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4623                                 isLegalMaskedGather(Ty, Alignment))
4624                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4625                                 isLegalMaskedScatter(Ty, Alignment));
4626   }
4627   case Instruction::UDiv:
4628   case Instruction::SDiv:
4629   case Instruction::SRem:
4630   case Instruction::URem:
4631     return mayDivideByZero(*I);
4632   }
4633   return false;
4634 }
4635
4636 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4637                                                                unsigned VF) {
4638   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4639   assert(getWideningDecision(I, VF) == CM_Unknown &&
4640          "Decision should not be set yet.");
4641   auto *Group = getInterleavedAccessGroup(I);
4642   assert(Group && "Must have a group.");
4643
4644   // If the instruction's allocated size doesn't equal it's type size, it
4645   // requires padding and will be scalarized.
4646   auto &DL = I->getModule()->getDataLayout();
4647   auto *ScalarTy = getMemInstValueType(I);
4648   if (hasIrregularType(ScalarTy, DL, VF))
4649     return false;
4650
4651   // Check if masking is required.
4652   // A Group may need masking for one of two reasons: it resides in a block that
4653   // needs predication, or it was decided to use masking to deal with gaps.
4654   bool PredicatedAccessRequiresMasking =
4655       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4656   bool AccessWithGapsRequiresMasking =
4657       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4658   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4659     return true;
4660
4661   // If masked interleaving is required, we expect that the user/target had
4662   // enabled it, because otherwise it either wouldn't have been created or
4663   // it should have been invalidated by the CostModel.
4664   assert(useMaskedInterleavedAccesses(TTI) &&
4665          "Masked interleave-groups for predicated accesses are not enabled.");
4666
4667   auto *Ty = getMemInstValueType(I);
4668   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4669   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4670                           : TTI.isLegalMaskedStore(Ty, Alignment);
4671 }
4672
4673 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4674                                                                unsigned VF) {
4675   // Get and ensure we have a valid memory instruction.
4676   LoadInst *LI = dyn_cast<LoadInst>(I);
4677   StoreInst *SI = dyn_cast<StoreInst>(I);
4678   assert((LI || SI) && "Invalid memory instruction");
4679
4680   auto *Ptr = getLoadStorePointerOperand(I);
4681
4682   // In order to be widened, the pointer should be consecutive, first of all.
4683   if (!Legal->isConsecutivePtr(Ptr))
4684     return false;
4685
4686   // If the instruction is a store located in a predicated block, it will be
4687   // scalarized.
4688   if (isScalarWithPredication(I))
4689     return false;
4690
4691   // If the instruction's allocated size doesn't equal it's type size, it
4692   // requires padding and will be scalarized.
4693   auto &DL = I->getModule()->getDataLayout();
4694   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4695   if (hasIrregularType(ScalarTy, DL, VF))
4696     return false;
4697
4698   return true;
4699 }
4700
4701 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4702   // We should not collect Uniforms more than once per VF. Right now,
4703   // this function is called from collectUniformsAndScalars(), which
4704   // already does this check. Collecting Uniforms for VF=1 does not make any
4705   // sense.
4706
4707   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4708          "This function should not be visited twice for the same VF");
4709
4710   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4711   // not analyze again.  Uniforms.count(VF) will return 1.
4712   Uniforms[VF].clear();
4713
4714   // We now know that the loop is vectorizable!
4715   // Collect instructions inside the loop that will remain uniform after
4716   // vectorization.
4717
4718   // Global values, params and instructions outside of current loop are out of
4719   // scope.
4720   auto isOutOfScope = [&](Value *V) -> bool {
4721     Instruction *I = dyn_cast<Instruction>(V);
4722     return (!I || !TheLoop->contains(I));
4723   };
4724
4725   SetVector<Instruction *> Worklist;
4726   BasicBlock *Latch = TheLoop->getLoopLatch();
4727
4728   // Instructions that are scalar with predication must not be considered
4729   // uniform after vectorization, because that would create an erroneous
4730   // replicating region where only a single instance out of VF should be formed.
4731   // TODO: optimize such seldom cases if found important, see PR40816.
4732   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4733     if (isScalarWithPredication(I, VF)) {
4734       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4735                         << *I << "\n");
4736       return;
4737     }
4738     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4739     Worklist.insert(I);
4740   };
4741
4742   // Start with the conditional branch. If the branch condition is an
4743   // instruction contained in the loop that is only used by the branch, it is
4744   // uniform.
4745   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4746   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4747     addToWorklistIfAllowed(Cmp);
4748
4749   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4750   // are pointers that are treated like consecutive pointers during
4751   // vectorization. The pointer operands of interleaved accesses are an
4752   // example.
4753   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4754
4755   // Holds pointer operands of instructions that are possibly non-uniform.
4756   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4757
4758   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4759     InstWidening WideningDecision = getWideningDecision(I, VF);
4760     assert(WideningDecision != CM_Unknown &&
4761            "Widening decision should be ready at this moment");
4762
4763     return (WideningDecision == CM_Widen ||
4764             WideningDecision == CM_Widen_Reverse ||
4765             WideningDecision == CM_Interleave);
4766   };
4767   // Iterate over the instructions in the loop, and collect all
4768   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4769   // that a consecutive-like pointer operand will be scalarized, we collect it
4770   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4771   // getelementptr instruction can be used by both vectorized and scalarized
4772   // memory instructions. For example, if a loop loads and stores from the same
4773   // location, but the store is conditional, the store will be scalarized, and
4774   // the getelementptr won't remain uniform.
4775   for (auto *BB : TheLoop->blocks())
4776     for (auto &I : *BB) {
4777       // If there's no pointer operand, there's nothing to do.
4778       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4779       if (!Ptr)
4780         continue;
4781
4782       // True if all users of Ptr are memory accesses that have Ptr as their
4783       // pointer operand.
4784       auto UsersAreMemAccesses =
4785           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4786             return getLoadStorePointerOperand(U) == Ptr;
4787           });
4788
4789       // Ensure the memory instruction will not be scalarized or used by
4790       // gather/scatter, making its pointer operand non-uniform. If the pointer
4791       // operand is used by any instruction other than a memory access, we
4792       // conservatively assume the pointer operand may be non-uniform.
4793       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4794         PossibleNonUniformPtrs.insert(Ptr);
4795
4796       // If the memory instruction will be vectorized and its pointer operand
4797       // is consecutive-like, or interleaving - the pointer operand should
4798       // remain uniform.
4799       else
4800         ConsecutiveLikePtrs.insert(Ptr);
4801     }
4802
4803   // Add to the Worklist all consecutive and consecutive-like pointers that
4804   // aren't also identified as possibly non-uniform.
4805   for (auto *V : ConsecutiveLikePtrs)
4806     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4807       addToWorklistIfAllowed(V);
4808
4809   // Expand Worklist in topological order: whenever a new instruction
4810   // is added , its users should be already inside Worklist.  It ensures
4811   // a uniform instruction will only be used by uniform instructions.
4812   unsigned idx = 0;
4813   while (idx != Worklist.size()) {
4814     Instruction *I = Worklist[idx++];
4815
4816     for (auto OV : I->operand_values()) {
4817       // isOutOfScope operands cannot be uniform instructions.
4818       if (isOutOfScope(OV))
4819         continue;
4820       // First order recurrence Phi's should typically be considered
4821       // non-uniform.
4822       auto *OP = dyn_cast<PHINode>(OV);
4823       if (OP && Legal->isFirstOrderRecurrence(OP))
4824         continue;
4825       // If all the users of the operand are uniform, then add the
4826       // operand into the uniform worklist.
4827       auto *OI = cast<Instruction>(OV);
4828       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4829             auto *J = cast<Instruction>(U);
4830             return Worklist.count(J) ||
4831                    (OI == getLoadStorePointerOperand(J) &&
4832                     isUniformDecision(J, VF));
4833           }))
4834         addToWorklistIfAllowed(OI);
4835     }
4836   }
4837
4838   // Returns true if Ptr is the pointer operand of a memory access instruction
4839   // I, and I is known to not require scalarization.
4840   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4841     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4842   };
4843
4844   // For an instruction to be added into Worklist above, all its users inside
4845   // the loop should also be in Worklist. However, this condition cannot be
4846   // true for phi nodes that form a cyclic dependence. We must process phi
4847   // nodes separately. An induction variable will remain uniform if all users
4848   // of the induction variable and induction variable update remain uniform.
4849   // The code below handles both pointer and non-pointer induction variables.
4850   for (auto &Induction : *Legal->getInductionVars()) {
4851     auto *Ind = Induction.first;
4852     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4853
4854     // Determine if all users of the induction variable are uniform after
4855     // vectorization.
4856     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4857       auto *I = cast<Instruction>(U);
4858       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4859              isVectorizedMemAccessUse(I, Ind);
4860     });
4861     if (!UniformInd)
4862       continue;
4863
4864     // Determine if all users of the induction variable update instruction are
4865     // uniform after vectorization.
4866     auto UniformIndUpdate =
4867         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4868           auto *I = cast<Instruction>(U);
4869           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4870                  isVectorizedMemAccessUse(I, IndUpdate);
4871         });
4872     if (!UniformIndUpdate)
4873       continue;
4874
4875     // The induction variable and its update instruction will remain uniform.
4876     addToWorklistIfAllowed(Ind);
4877     addToWorklistIfAllowed(IndUpdate);
4878   }
4879
4880   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4881 }
4882
4883 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4884   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4885
4886   if (Legal->getRuntimePointerChecking()->Need) {
4887     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4888         "runtime pointer checks needed. Enable vectorization of this "
4889         "loop with '#pragma clang loop vectorize(enable)' when "
4890         "compiling with -Os/-Oz",
4891         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4892     return true;
4893   }
4894
4895   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4896     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4897         "runtime SCEV checks needed. Enable vectorization of this "
4898         "loop with '#pragma clang loop vectorize(enable)' when "
4899         "compiling with -Os/-Oz",
4900         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4901     return true;
4902   }
4903
4904   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4905   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4906     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4907         "runtime stride == 1 checks needed. Enable vectorization of "
4908         "this loop with '#pragma clang loop vectorize(enable)' when "
4909         "compiling with -Os/-Oz",
4910         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4911     return true;
4912   }
4913
4914   return false;
4915 }
4916
4917 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4918   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4919     // TODO: It may by useful to do since it's still likely to be dynamically
4920     // uniform if the target can skip.
4921     reportVectorizationFailure(
4922         "Not inserting runtime ptr check for divergent target",
4923         "runtime pointer checks needed. Not enabled for divergent target",
4924         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4925     return None;
4926   }
4927
4928   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4929   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4930   if (TC == 1) {
4931     reportVectorizationFailure("Single iteration (non) loop",
4932         "loop trip count is one, irrelevant for vectorization",
4933         "SingleIterationLoop", ORE, TheLoop);
4934     return None;
4935   }
4936
4937   switch (ScalarEpilogueStatus) {
4938   case CM_ScalarEpilogueAllowed:
4939     return computeFeasibleMaxVF(TC);
4940   case CM_ScalarEpilogueNotNeededUsePredicate:
4941     LLVM_DEBUG(
4942         dbgs() << "LV: vector predicate hint/switch found.\n"
4943                << "LV: Not allowing scalar epilogue, creating predicated "
4944                << "vector loop.\n");
4945     break;
4946   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4947     // fallthrough as a special case of OptForSize
4948   case CM_ScalarEpilogueNotAllowedOptSize:
4949     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4950       LLVM_DEBUG(
4951           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4952     else
4953       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4954                         << "count.\n");
4955
4956     // Bail if runtime checks are required, which are not good when optimising
4957     // for size.
4958     if (runtimeChecksRequired())
4959       return None;
4960     break;
4961   }
4962
4963   // Now try the tail folding
4964
4965   // Invalidate interleave groups that require an epilogue if we can't mask
4966   // the interleave-group.
4967   if (!useMaskedInterleavedAccesses(TTI))
4968     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4969
4970   unsigned MaxVF = computeFeasibleMaxVF(TC);
4971   if (TC > 0 && TC % MaxVF == 0) {
4972     // Accept MaxVF if we do not have a tail.
4973     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4974     return MaxVF;
4975   }
4976
4977   // If we don't know the precise trip count, or if the trip count that we
4978   // found modulo the vectorization factor is not zero, try to fold the tail
4979   // by masking.
4980   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4981   if (Legal->prepareToFoldTailByMasking()) {
4982     FoldTailByMasking = true;
4983     return MaxVF;
4984   }
4985
4986   if (TC == 0) {
4987     reportVectorizationFailure(
4988         "Unable to calculate the loop count due to complex control flow",
4989         "unable to calculate the loop count due to complex control flow",
4990         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4991     return None;
4992   }
4993
4994   reportVectorizationFailure(
4995       "Cannot optimize for size and vectorize at the same time.",
4996       "cannot optimize for size and vectorize at the same time. "
4997       "Enable vectorization of this loop with '#pragma clang loop "
4998       "vectorize(enable)' when compiling with -Os/-Oz",
4999       "NoTailLoopWithOptForSize", ORE, TheLoop);
5000   return None;
5001 }
5002
5003 unsigned
5004 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5005   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5006   unsigned SmallestType, WidestType;
5007   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5008   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5009
5010   // Get the maximum safe dependence distance in bits computed by LAA.
5011   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5012   // the memory accesses that is most restrictive (involved in the smallest
5013   // dependence distance).
5014   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5015
5016   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5017
5018   unsigned MaxVectorSize = WidestRegister / WidestType;
5019
5020   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5021                     << " / " << WidestType << " bits.\n");
5022   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5023                     << WidestRegister << " bits.\n");
5024
5025   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5026                                  " into one vector!");
5027   if (MaxVectorSize == 0) {
5028     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5029     MaxVectorSize = 1;
5030     return MaxVectorSize;
5031   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5032              isPowerOf2_32(ConstTripCount)) {
5033     // We need to clamp the VF to be the ConstTripCount. There is no point in
5034     // choosing a higher viable VF as done in the loop below.
5035     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5036                       << ConstTripCount << "\n");
5037     MaxVectorSize = ConstTripCount;
5038     return MaxVectorSize;
5039   }
5040
5041   unsigned MaxVF = MaxVectorSize;
5042   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5043       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5044     // Collect all viable vectorization factors larger than the default MaxVF
5045     // (i.e. MaxVectorSize).
5046     SmallVector<unsigned, 8> VFs;
5047     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5048     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5049       VFs.push_back(VS);
5050
5051     // For each VF calculate its register usage.
5052     auto RUs = calculateRegisterUsage(VFs);
5053
5054     // Select the largest VF which doesn't require more registers than existing
5055     // ones.
5056     for (int i = RUs.size() - 1; i >= 0; --i) {
5057       bool Selected = true;
5058       for (auto& pair : RUs[i].MaxLocalUsers) {
5059         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5060         if (pair.second > TargetNumRegisters)
5061           Selected = false;
5062       }
5063       if (Selected) {
5064         MaxVF = VFs[i];
5065         break;
5066       }
5067     }
5068     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5069       if (MaxVF < MinVF) {
5070         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5071                           << ") with target's minimum: " << MinVF << '\n');
5072         MaxVF = MinVF;
5073       }
5074     }
5075   }
5076   return MaxVF;
5077 }
5078
5079 VectorizationFactor
5080 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5081   float Cost = expectedCost(1).first;
5082   const float ScalarCost = Cost;
5083   unsigned Width = 1;
5084   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5085
5086   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5087   if (ForceVectorization && MaxVF > 1) {
5088     // Ignore scalar width, because the user explicitly wants vectorization.
5089     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5090     // evaluation.
5091     Cost = std::numeric_limits<float>::max();
5092   }
5093
5094   for (unsigned i = 2; i <= MaxVF; i *= 2) {
5095     // Notice that the vector loop needs to be executed less times, so
5096     // we need to divide the cost of the vector loops by the width of
5097     // the vector elements.
5098     VectorizationCostTy C = expectedCost(i);
5099     float VectorCost = C.first / (float)i;
5100     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5101                       << " costs: " << (int)VectorCost << ".\n");
5102     if (!C.second && !ForceVectorization) {
5103       LLVM_DEBUG(
5104           dbgs() << "LV: Not considering vector loop of width " << i
5105                  << " because it will not generate any vector instructions.\n");
5106       continue;
5107     }
5108     if (VectorCost < Cost) {
5109       Cost = VectorCost;
5110       Width = i;
5111     }
5112   }
5113
5114   if (!EnableCondStoresVectorization && NumPredStores) {
5115     reportVectorizationFailure("There are conditional stores.",
5116         "store that is conditionally executed prevents vectorization",
5117         "ConditionalStore", ORE, TheLoop);
5118     Width = 1;
5119     Cost = ScalarCost;
5120   }
5121
5122   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5123              << "LV: Vectorization seems to be not beneficial, "
5124              << "but was forced by a user.\n");
5125   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5126   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5127   return Factor;
5128 }
5129
5130 std::pair<unsigned, unsigned>
5131 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5132   unsigned MinWidth = -1U;
5133   unsigned MaxWidth = 8;
5134   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5135
5136   // For each block.
5137   for (BasicBlock *BB : TheLoop->blocks()) {
5138     // For each instruction in the loop.
5139     for (Instruction &I : BB->instructionsWithoutDebug()) {
5140       Type *T = I.getType();
5141
5142       // Skip ignored values.
5143       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5144         continue;
5145
5146       // Only examine Loads, Stores and PHINodes.
5147       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5148         continue;
5149
5150       // Examine PHI nodes that are reduction variables. Update the type to
5151       // account for the recurrence type.
5152       if (auto *PN = dyn_cast<PHINode>(&I)) {
5153         if (!Legal->isReductionVariable(PN))
5154           continue;
5155         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5156         T = RdxDesc.getRecurrenceType();
5157       }
5158
5159       // Examine the stored values.
5160       if (auto *ST = dyn_cast<StoreInst>(&I))
5161         T = ST->getValueOperand()->getType();
5162
5163       // Ignore loaded pointer types and stored pointer types that are not
5164       // vectorizable.
5165       //
5166       // FIXME: The check here attempts to predict whether a load or store will
5167       //        be vectorized. We only know this for certain after a VF has
5168       //        been selected. Here, we assume that if an access can be
5169       //        vectorized, it will be. We should also look at extending this
5170       //        optimization to non-pointer types.
5171       //
5172       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5173           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5174         continue;
5175
5176       MinWidth = std::min(MinWidth,
5177                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5178       MaxWidth = std::max(MaxWidth,
5179                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5180     }
5181   }
5182
5183   return {MinWidth, MaxWidth};
5184 }
5185
5186 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5187                                                            unsigned LoopCost) {
5188   // -- The interleave heuristics --
5189   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5190   // There are many micro-architectural considerations that we can't predict
5191   // at this level. For example, frontend pressure (on decode or fetch) due to
5192   // code size, or the number and capabilities of the execution ports.
5193   //
5194   // We use the following heuristics to select the interleave count:
5195   // 1. If the code has reductions, then we interleave to break the cross
5196   // iteration dependency.
5197   // 2. If the loop is really small, then we interleave to reduce the loop
5198   // overhead.
5199   // 3. We don't interleave if we think that we will spill registers to memory
5200   // due to the increased register pressure.
5201
5202   if (!isScalarEpilogueAllowed())
5203     return 1;
5204
5205   // We used the distance for the interleave count.
5206   if (Legal->getMaxSafeDepDistBytes() != -1U)
5207     return 1;
5208
5209   // Do not interleave loops with a relatively small known or estimated trip
5210   // count.
5211   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5212   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5213     return 1;
5214
5215   RegisterUsage R = calculateRegisterUsage({VF})[0];
5216   // We divide by these constants so assume that we have at least one
5217   // instruction that uses at least one register.
5218   for (auto& pair : R.MaxLocalUsers) {
5219     pair.second = std::max(pair.second, 1U);
5220   }
5221
5222   // We calculate the interleave count using the following formula.
5223   // Subtract the number of loop invariants from the number of available
5224   // registers. These registers are used by all of the interleaved instances.
5225   // Next, divide the remaining registers by the number of registers that is
5226   // required by the loop, in order to estimate how many parallel instances
5227   // fit without causing spills. All of this is rounded down if necessary to be
5228   // a power of two. We want power of two interleave count to simplify any
5229   // addressing operations or alignment considerations.
5230   // We also want power of two interleave counts to ensure that the induction
5231   // variable of the vector loop wraps to zero, when tail is folded by masking;
5232   // this currently happens when OptForSize, in which case IC is set to 1 above.
5233   unsigned IC = UINT_MAX;
5234
5235   for (auto& pair : R.MaxLocalUsers) {
5236     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5237     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5238                       << " registers of "
5239                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5240     if (VF == 1) {
5241       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5242         TargetNumRegisters = ForceTargetNumScalarRegs;
5243     } else {
5244       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5245         TargetNumRegisters = ForceTargetNumVectorRegs;
5246     }
5247     unsigned MaxLocalUsers = pair.second;
5248     unsigned LoopInvariantRegs = 0;
5249     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5250       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5251
5252     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5253     // Don't count the induction variable as interleaved.
5254     if (EnableIndVarRegisterHeur) {
5255       TmpIC =
5256           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5257                         std::max(1U, (MaxLocalUsers - 1)));
5258     }
5259
5260     IC = std::min(IC, TmpIC);
5261   }
5262
5263   // Clamp the interleave ranges to reasonable counts.
5264   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5265
5266   // Check if the user has overridden the max.
5267   if (VF == 1) {
5268     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5269       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5270   } else {
5271     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5272       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5273   }
5274
5275   // If trip count is known or estimated compile time constant, limit the
5276   // interleave count to be less than the trip count divided by VF.
5277   if (BestKnownTC) {
5278     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5279   }
5280
5281   // If we did not calculate the cost for VF (because the user selected the VF)
5282   // then we calculate the cost of VF here.
5283   if (LoopCost == 0)
5284     LoopCost = expectedCost(VF).first;
5285
5286   assert(LoopCost && "Non-zero loop cost expected");
5287
5288   // Clamp the calculated IC to be between the 1 and the max interleave count
5289   // that the target and trip count allows.
5290   if (IC > MaxInterleaveCount)
5291     IC = MaxInterleaveCount;
5292   else if (IC < 1)
5293     IC = 1;
5294
5295   // Interleave if we vectorized this loop and there is a reduction that could
5296   // benefit from interleaving.
5297   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5298     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5299     return IC;
5300   }
5301
5302   // Note that if we've already vectorized the loop we will have done the
5303   // runtime check and so interleaving won't require further checks.
5304   bool InterleavingRequiresRuntimePointerCheck =
5305       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5306
5307   // We want to interleave small loops in order to reduce the loop overhead and
5308   // potentially expose ILP opportunities.
5309   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5310   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5311     // We assume that the cost overhead is 1 and we use the cost model
5312     // to estimate the cost of the loop and interleave until the cost of the
5313     // loop overhead is about 5% of the cost of the loop.
5314     unsigned SmallIC =
5315         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5316
5317     // Interleave until store/load ports (estimated by max interleave count) are
5318     // saturated.
5319     unsigned NumStores = Legal->getNumStores();
5320     unsigned NumLoads = Legal->getNumLoads();
5321     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5322     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5323
5324     // If we have a scalar reduction (vector reductions are already dealt with
5325     // by this point), we can increase the critical path length if the loop
5326     // we're interleaving is inside another loop. Limit, by default to 2, so the
5327     // critical path only gets increased by one reduction operation.
5328     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5329       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5330       SmallIC = std::min(SmallIC, F);
5331       StoresIC = std::min(StoresIC, F);
5332       LoadsIC = std::min(LoadsIC, F);
5333     }
5334
5335     if (EnableLoadStoreRuntimeInterleave &&
5336         std::max(StoresIC, LoadsIC) > SmallIC) {
5337       LLVM_DEBUG(
5338           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5339       return std::max(StoresIC, LoadsIC);
5340     }
5341
5342     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5343     return SmallIC;
5344   }
5345
5346   // Interleave if this is a large loop (small loops are already dealt with by
5347   // this point) that could benefit from interleaving.
5348   bool HasReductions = !Legal->getReductionVars()->empty();
5349   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5350     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5351     return IC;
5352   }
5353
5354   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5355   return 1;
5356 }
5357
5358 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5359 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5360   // This function calculates the register usage by measuring the highest number
5361   // of values that are alive at a single location. Obviously, this is a very
5362   // rough estimation. We scan the loop in a topological order in order and
5363   // assign a number to each instruction. We use RPO to ensure that defs are
5364   // met before their users. We assume that each instruction that has in-loop
5365   // users starts an interval. We record every time that an in-loop value is
5366   // used, so we have a list of the first and last occurrences of each
5367   // instruction. Next, we transpose this data structure into a multi map that
5368   // holds the list of intervals that *end* at a specific location. This multi
5369   // map allows us to perform a linear search. We scan the instructions linearly
5370   // and record each time that a new interval starts, by placing it in a set.
5371   // If we find this value in the multi-map then we remove it from the set.
5372   // The max register usage is the maximum size of the set.
5373   // We also search for instructions that are defined outside the loop, but are
5374   // used inside the loop. We need this number separately from the max-interval
5375   // usage number because when we unroll, loop-invariant values do not take
5376   // more register.
5377   LoopBlocksDFS DFS(TheLoop);
5378   DFS.perform(LI);
5379
5380   RegisterUsage RU;
5381
5382   // Each 'key' in the map opens a new interval. The values
5383   // of the map are the index of the 'last seen' usage of the
5384   // instruction that is the key.
5385   using IntervalMap = DenseMap<Instruction *, unsigned>;
5386
5387   // Maps instruction to its index.
5388   SmallVector<Instruction *, 64> IdxToInstr;
5389   // Marks the end of each interval.
5390   IntervalMap EndPoint;
5391   // Saves the list of instruction indices that are used in the loop.
5392   SmallPtrSet<Instruction *, 8> Ends;
5393   // Saves the list of values that are used in the loop but are
5394   // defined outside the loop, such as arguments and constants.
5395   SmallPtrSet<Value *, 8> LoopInvariants;
5396
5397   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5398     for (Instruction &I : BB->instructionsWithoutDebug()) {
5399       IdxToInstr.push_back(&I);
5400
5401       // Save the end location of each USE.
5402       for (Value *U : I.operands()) {
5403         auto *Instr = dyn_cast<Instruction>(U);
5404
5405         // Ignore non-instruction values such as arguments, constants, etc.
5406         if (!Instr)
5407           continue;
5408
5409         // If this instruction is outside the loop then record it and continue.
5410         if (!TheLoop->contains(Instr)) {
5411           LoopInvariants.insert(Instr);
5412           continue;
5413         }
5414
5415         // Overwrite previous end points.
5416         EndPoint[Instr] = IdxToInstr.size();
5417         Ends.insert(Instr);
5418       }
5419     }
5420   }
5421
5422   // Saves the list of intervals that end with the index in 'key'.
5423   using InstrList = SmallVector<Instruction *, 2>;
5424   DenseMap<unsigned, InstrList> TransposeEnds;
5425
5426   // Transpose the EndPoints to a list of values that end at each index.
5427   for (auto &Interval : EndPoint)
5428     TransposeEnds[Interval.second].push_back(Interval.first);
5429
5430   SmallPtrSet<Instruction *, 8> OpenIntervals;
5431
5432   // Get the size of the widest register.
5433   unsigned MaxSafeDepDist = -1U;
5434   if (Legal->getMaxSafeDepDistBytes() != -1U)
5435     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5436   unsigned WidestRegister =
5437       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5438   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5439
5440   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5441   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5442
5443   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5444
5445   // A lambda that gets the register usage for the given type and VF.
5446   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5447     if (Ty->isTokenTy())
5448       return 0U;
5449     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5450     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5451   };
5452
5453   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5454     Instruction *I = IdxToInstr[i];
5455
5456     // Remove all of the instructions that end at this location.
5457     InstrList &List = TransposeEnds[i];
5458     for (Instruction *ToRemove : List)
5459       OpenIntervals.erase(ToRemove);
5460
5461     // Ignore instructions that are never used within the loop.
5462     if (Ends.find(I) == Ends.end())
5463       continue;
5464
5465     // Skip ignored values.
5466     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5467       continue;
5468
5469     // For each VF find the maximum usage of registers.
5470     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5471       // Count the number of live intervals.
5472       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5473
5474       if (VFs[j] == 1) {
5475         for (auto Inst : OpenIntervals) {
5476           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5477           if (RegUsage.find(ClassID) == RegUsage.end())
5478             RegUsage[ClassID] = 1;
5479           else
5480             RegUsage[ClassID] += 1;
5481         }
5482       } else {
5483         collectUniformsAndScalars(VFs[j]);
5484         for (auto Inst : OpenIntervals) {
5485           // Skip ignored values for VF > 1.
5486           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5487             continue;
5488           if (isScalarAfterVectorization(Inst, VFs[j])) {
5489             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5490             if (RegUsage.find(ClassID) == RegUsage.end())
5491               RegUsage[ClassID] = 1;
5492             else
5493               RegUsage[ClassID] += 1;
5494           } else {
5495             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5496             if (RegUsage.find(ClassID) == RegUsage.end())
5497               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5498             else
5499               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5500           }
5501         }
5502       }
5503
5504       for (auto& pair : RegUsage) {
5505         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5506           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5507         else
5508           MaxUsages[j][pair.first] = pair.second;
5509       }
5510     }
5511
5512     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5513                       << OpenIntervals.size() << '\n');
5514
5515     // Add the current instruction to the list of open intervals.
5516     OpenIntervals.insert(I);
5517   }
5518
5519   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5520     SmallMapVector<unsigned, unsigned, 4> Invariant;
5521
5522     for (auto Inst : LoopInvariants) {
5523       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5524       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5525       if (Invariant.find(ClassID) == Invariant.end())
5526         Invariant[ClassID] = Usage;
5527       else
5528         Invariant[ClassID] += Usage;
5529     }
5530
5531     LLVM_DEBUG({
5532       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5533       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5534              << " item\n";
5535       for (const auto &pair : MaxUsages[i]) {
5536         dbgs() << "LV(REG): RegisterClass: "
5537                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5538                << " registers\n";
5539       }
5540       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5541              << " item\n";
5542       for (const auto &pair : Invariant) {
5543         dbgs() << "LV(REG): RegisterClass: "
5544                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5545                << " registers\n";
5546       }
5547     });
5548
5549     RU.LoopInvariantRegs = Invariant;
5550     RU.MaxLocalUsers = MaxUsages[i];
5551     RUs[i] = RU;
5552   }
5553
5554   return RUs;
5555 }
5556
5557 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5558   // TODO: Cost model for emulated masked load/store is completely
5559   // broken. This hack guides the cost model to use an artificially
5560   // high enough value to practically disable vectorization with such
5561   // operations, except where previously deployed legality hack allowed
5562   // using very low cost values. This is to avoid regressions coming simply
5563   // from moving "masked load/store" check from legality to cost model.
5564   // Masked Load/Gather emulation was previously never allowed.
5565   // Limited number of Masked Store/Scatter emulation was allowed.
5566   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5567   return isa<LoadInst>(I) ||
5568          (isa<StoreInst>(I) &&
5569           NumPredStores > NumberOfStoresToPredicate);
5570 }
5571
5572 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5573   // If we aren't vectorizing the loop, or if we've already collected the
5574   // instructions to scalarize, there's nothing to do. Collection may already
5575   // have occurred if we have a user-selected VF and are now computing the
5576   // expected cost for interleaving.
5577   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5578     return;
5579
5580   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5581   // not profitable to scalarize any instructions, the presence of VF in the
5582   // map will indicate that we've analyzed it already.
5583   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5584
5585   // Find all the instructions that are scalar with predication in the loop and
5586   // determine if it would be better to not if-convert the blocks they are in.
5587   // If so, we also record the instructions to scalarize.
5588   for (BasicBlock *BB : TheLoop->blocks()) {
5589     if (!blockNeedsPredication(BB))
5590       continue;
5591     for (Instruction &I : *BB)
5592       if (isScalarWithPredication(&I)) {
5593         ScalarCostsTy ScalarCosts;
5594         // Do not apply discount logic if hacked cost is needed
5595         // for emulated masked memrefs.
5596         if (!useEmulatedMaskMemRefHack(&I) &&
5597             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5598           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5599         // Remember that BB will remain after vectorization.
5600         PredicatedBBsAfterVectorization.insert(BB);
5601       }
5602   }
5603 }
5604
5605 int LoopVectorizationCostModel::computePredInstDiscount(
5606     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5607     unsigned VF) {
5608   assert(!isUniformAfterVectorization(PredInst, VF) &&
5609          "Instruction marked uniform-after-vectorization will be predicated");
5610
5611   // Initialize the discount to zero, meaning that the scalar version and the
5612   // vector version cost the same.
5613   int Discount = 0;
5614
5615   // Holds instructions to analyze. The instructions we visit are mapped in
5616   // ScalarCosts. Those instructions are the ones that would be scalarized if
5617   // we find that the scalar version costs less.
5618   SmallVector<Instruction *, 8> Worklist;
5619
5620   // Returns true if the given instruction can be scalarized.
5621   auto canBeScalarized = [&](Instruction *I) -> bool {
5622     // We only attempt to scalarize instructions forming a single-use chain
5623     // from the original predicated block that would otherwise be vectorized.
5624     // Although not strictly necessary, we give up on instructions we know will
5625     // already be scalar to avoid traversing chains that are unlikely to be
5626     // beneficial.
5627     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5628         isScalarAfterVectorization(I, VF))
5629       return false;
5630
5631     // If the instruction is scalar with predication, it will be analyzed
5632     // separately. We ignore it within the context of PredInst.
5633     if (isScalarWithPredication(I))
5634       return false;
5635
5636     // If any of the instruction's operands are uniform after vectorization,
5637     // the instruction cannot be scalarized. This prevents, for example, a
5638     // masked load from being scalarized.
5639     //
5640     // We assume we will only emit a value for lane zero of an instruction
5641     // marked uniform after vectorization, rather than VF identical values.
5642     // Thus, if we scalarize an instruction that uses a uniform, we would
5643     // create uses of values corresponding to the lanes we aren't emitting code
5644     // for. This behavior can be changed by allowing getScalarValue to clone
5645     // the lane zero values for uniforms rather than asserting.
5646     for (Use &U : I->operands())
5647       if (auto *J = dyn_cast<Instruction>(U.get()))
5648         if (isUniformAfterVectorization(J, VF))
5649           return false;
5650
5651     // Otherwise, we can scalarize the instruction.
5652     return true;
5653   };
5654
5655   // Compute the expected cost discount from scalarizing the entire expression
5656   // feeding the predicated instruction. We currently only consider expressions
5657   // that are single-use instruction chains.
5658   Worklist.push_back(PredInst);
5659   while (!Worklist.empty()) {
5660     Instruction *I = Worklist.pop_back_val();
5661
5662     // If we've already analyzed the instruction, there's nothing to do.
5663     if (ScalarCosts.find(I) != ScalarCosts.end())
5664       continue;
5665
5666     // Compute the cost of the vector instruction. Note that this cost already
5667     // includes the scalarization overhead of the predicated instruction.
5668     unsigned VectorCost = getInstructionCost(I, VF).first;
5669
5670     // Compute the cost of the scalarized instruction. This cost is the cost of
5671     // the instruction as if it wasn't if-converted and instead remained in the
5672     // predicated block. We will scale this cost by block probability after
5673     // computing the scalarization overhead.
5674     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5675
5676     // Compute the scalarization overhead of needed insertelement instructions
5677     // and phi nodes.
5678     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5679       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5680                                                  true, false);
5681       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5682     }
5683
5684     // Compute the scalarization overhead of needed extractelement
5685     // instructions. For each of the instruction's operands, if the operand can
5686     // be scalarized, add it to the worklist; otherwise, account for the
5687     // overhead.
5688     for (Use &U : I->operands())
5689       if (auto *J = dyn_cast<Instruction>(U.get())) {
5690         assert(VectorType::isValidElementType(J->getType()) &&
5691                "Instruction has non-scalar type");
5692         if (canBeScalarized(J))
5693           Worklist.push_back(J);
5694         else if (needsExtract(J, VF))
5695           ScalarCost += TTI.getScalarizationOverhead(
5696                               ToVectorTy(J->getType(),VF), false, true);
5697       }
5698
5699     // Scale the total scalar cost by block probability.
5700     ScalarCost /= getReciprocalPredBlockProb();
5701
5702     // Compute the discount. A non-negative discount means the vector version
5703     // of the instruction costs more, and scalarizing would be beneficial.
5704     Discount += VectorCost - ScalarCost;
5705     ScalarCosts[I] = ScalarCost;
5706   }
5707
5708   return Discount;
5709 }
5710
5711 LoopVectorizationCostModel::VectorizationCostTy
5712 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5713   VectorizationCostTy Cost;
5714
5715   // For each block.
5716   for (BasicBlock *BB : TheLoop->blocks()) {
5717     VectorizationCostTy BlockCost;
5718
5719     // For each instruction in the old loop.
5720     for (Instruction &I : BB->instructionsWithoutDebug()) {
5721       // Skip ignored values.
5722       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5723           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5724         continue;
5725
5726       VectorizationCostTy C = getInstructionCost(&I, VF);
5727
5728       // Check if we should override the cost.
5729       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5730         C.first = ForceTargetInstructionCost;
5731
5732       BlockCost.first += C.first;
5733       BlockCost.second |= C.second;
5734       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5735                         << " for VF " << VF << " For instruction: " << I
5736                         << '\n');
5737     }
5738
5739     // If we are vectorizing a predicated block, it will have been
5740     // if-converted. This means that the block's instructions (aside from
5741     // stores and instructions that may divide by zero) will now be
5742     // unconditionally executed. For the scalar case, we may not always execute
5743     // the predicated block. Thus, scale the block's cost by the probability of
5744     // executing it.
5745     if (VF == 1 && blockNeedsPredication(BB))
5746       BlockCost.first /= getReciprocalPredBlockProb();
5747
5748     Cost.first += BlockCost.first;
5749     Cost.second |= BlockCost.second;
5750   }
5751
5752   return Cost;
5753 }
5754
5755 /// Gets Address Access SCEV after verifying that the access pattern
5756 /// is loop invariant except the induction variable dependence.
5757 ///
5758 /// This SCEV can be sent to the Target in order to estimate the address
5759 /// calculation cost.
5760 static const SCEV *getAddressAccessSCEV(
5761               Value *Ptr,
5762               LoopVectorizationLegality *Legal,
5763               PredicatedScalarEvolution &PSE,
5764               const Loop *TheLoop) {
5765
5766   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5767   if (!Gep)
5768     return nullptr;
5769
5770   // We are looking for a gep with all loop invariant indices except for one
5771   // which should be an induction variable.
5772   auto SE = PSE.getSE();
5773   unsigned NumOperands = Gep->getNumOperands();
5774   for (unsigned i = 1; i < NumOperands; ++i) {
5775     Value *Opd = Gep->getOperand(i);
5776     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5777         !Legal->isInductionVariable(Opd))
5778       return nullptr;
5779   }
5780
5781   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5782   return PSE.getSCEV(Ptr);
5783 }
5784
5785 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5786   return Legal->hasStride(I->getOperand(0)) ||
5787          Legal->hasStride(I->getOperand(1));
5788 }
5789
5790 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5791                                                                  unsigned VF) {
5792   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5793   Type *ValTy = getMemInstValueType(I);
5794   auto SE = PSE.getSE();
5795
5796   unsigned AS = getLoadStoreAddressSpace(I);
5797   Value *Ptr = getLoadStorePointerOperand(I);
5798   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5799
5800   // Figure out whether the access is strided and get the stride value
5801   // if it's known in compile time
5802   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5803
5804   // Get the cost of the scalar memory instruction and address computation.
5805   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5806
5807   // Don't pass *I here, since it is scalar but will actually be part of a
5808   // vectorized loop where the user of it is a vectorized instruction.
5809   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5810   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5811                                    Alignment, AS);
5812
5813   // Get the overhead of the extractelement and insertelement instructions
5814   // we might create due to scalarization.
5815   Cost += getScalarizationOverhead(I, VF);
5816
5817   // If we have a predicated store, it may not be executed for each vector
5818   // lane. Scale the cost by the probability of executing the predicated
5819   // block.
5820   if (isPredicatedInst(I)) {
5821     Cost /= getReciprocalPredBlockProb();
5822
5823     if (useEmulatedMaskMemRefHack(I))
5824       // Artificially setting to a high enough value to practically disable
5825       // vectorization with such operations.
5826       Cost = 3000000;
5827   }
5828
5829   return Cost;
5830 }
5831
5832 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5833                                                              unsigned VF) {
5834   Type *ValTy = getMemInstValueType(I);
5835   Type *VectorTy = ToVectorTy(ValTy, VF);
5836   Value *Ptr = getLoadStorePointerOperand(I);
5837   unsigned AS = getLoadStoreAddressSpace(I);
5838   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5839
5840   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5841          "Stride should be 1 or -1 for consecutive memory access");
5842   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5843   unsigned Cost = 0;
5844   if (Legal->isMaskRequired(I))
5845     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5846                                       Alignment ? Alignment->value() : 0, AS);
5847   else
5848     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5849
5850   bool Reverse = ConsecutiveStride < 0;
5851   if (Reverse)
5852     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5853   return Cost;
5854 }
5855
5856 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5857                                                          unsigned VF) {
5858   Type *ValTy = getMemInstValueType(I);
5859   Type *VectorTy = ToVectorTy(ValTy, VF);
5860   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5861   unsigned AS = getLoadStoreAddressSpace(I);
5862   if (isa<LoadInst>(I)) {
5863     return TTI.getAddressComputationCost(ValTy) +
5864            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5865            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5866   }
5867   StoreInst *SI = cast<StoreInst>(I);
5868
5869   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5870   return TTI.getAddressComputationCost(ValTy) +
5871          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5872          (isLoopInvariantStoreValue
5873               ? 0
5874               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5875                                        VF - 1));
5876 }
5877
5878 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5879                                                           unsigned VF) {
5880   Type *ValTy = getMemInstValueType(I);
5881   Type *VectorTy = ToVectorTy(ValTy, VF);
5882   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5883   Value *Ptr = getLoadStorePointerOperand(I);
5884
5885   return TTI.getAddressComputationCost(VectorTy) +
5886          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5887                                     Legal->isMaskRequired(I),
5888                                     Alignment ? Alignment->value() : 0);
5889 }
5890
5891 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5892                                                             unsigned VF) {
5893   Type *ValTy = getMemInstValueType(I);
5894   Type *VectorTy = ToVectorTy(ValTy, VF);
5895   unsigned AS = getLoadStoreAddressSpace(I);
5896
5897   auto Group = getInterleavedAccessGroup(I);
5898   assert(Group && "Fail to get an interleaved access group.");
5899
5900   unsigned InterleaveFactor = Group->getFactor();
5901   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
5902
5903   // Holds the indices of existing members in an interleaved load group.
5904   // An interleaved store group doesn't need this as it doesn't allow gaps.
5905   SmallVector<unsigned, 4> Indices;
5906   if (isa<LoadInst>(I)) {
5907     for (unsigned i = 0; i < InterleaveFactor; i++)
5908       if (Group->getMember(i))
5909         Indices.push_back(i);
5910   }
5911
5912   // Calculate the cost of the whole interleaved group.
5913   bool UseMaskForGaps =
5914       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5915   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5916       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5917       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5918
5919   if (Group->isReverse()) {
5920     // TODO: Add support for reversed masked interleaved access.
5921     assert(!Legal->isMaskRequired(I) &&
5922            "Reverse masked interleaved access not supported.");
5923     Cost += Group->getNumMembers() *
5924             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5925   }
5926   return Cost;
5927 }
5928
5929 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5930                                                               unsigned VF) {
5931   // Calculate scalar cost only. Vectorization cost should be ready at this
5932   // moment.
5933   if (VF == 1) {
5934     Type *ValTy = getMemInstValueType(I);
5935     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5936     unsigned AS = getLoadStoreAddressSpace(I);
5937
5938     return TTI.getAddressComputationCost(ValTy) +
5939            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5940   }
5941   return getWideningCost(I, VF);
5942 }
5943
5944 LoopVectorizationCostModel::VectorizationCostTy
5945 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5946   // If we know that this instruction will remain uniform, check the cost of
5947   // the scalar version.
5948   if (isUniformAfterVectorization(I, VF))
5949     VF = 1;
5950
5951   if (VF > 1 && isProfitableToScalarize(I, VF))
5952     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5953
5954   // Forced scalars do not have any scalarization overhead.
5955   auto ForcedScalar = ForcedScalars.find(VF);
5956   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5957     auto InstSet = ForcedScalar->second;
5958     if (InstSet.find(I) != InstSet.end())
5959       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5960   }
5961
5962   Type *VectorTy;
5963   unsigned C = getInstructionCost(I, VF, VectorTy);
5964
5965   bool TypeNotScalarized =
5966       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5967   return VectorizationCostTy(C, TypeNotScalarized);
5968 }
5969
5970 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5971                                                               unsigned VF) {
5972
5973   if (VF == 1)
5974     return 0;
5975
5976   unsigned Cost = 0;
5977   Type *RetTy = ToVectorTy(I->getType(), VF);
5978   if (!RetTy->isVoidTy() &&
5979       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5980     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5981
5982   // Some targets keep addresses scalar.
5983   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5984     return Cost;
5985
5986   // Some targets support efficient element stores.
5987   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5988     return Cost;
5989
5990   // Collect operands to consider.
5991   CallInst *CI = dyn_cast<CallInst>(I);
5992   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5993
5994   // Skip operands that do not require extraction/scalarization and do not incur
5995   // any overhead.
5996   return Cost + TTI.getOperandsScalarizationOverhead(
5997                     filterExtractingOperands(Ops, VF), VF);
5998 }
5999
6000 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6001   if (VF == 1)
6002     return;
6003   NumPredStores = 0;
6004   for (BasicBlock *BB : TheLoop->blocks()) {
6005     // For each instruction in the old loop.
6006     for (Instruction &I : *BB) {
6007       Value *Ptr =  getLoadStorePointerOperand(&I);
6008       if (!Ptr)
6009         continue;
6010
6011       // TODO: We should generate better code and update the cost model for
6012       // predicated uniform stores. Today they are treated as any other
6013       // predicated store (see added test cases in
6014       // invariant-store-vectorization.ll).
6015       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6016         NumPredStores++;
6017
6018       if (Legal->isUniform(Ptr) &&
6019           // Conditional loads and stores should be scalarized and predicated.
6020           // isScalarWithPredication cannot be used here since masked
6021           // gather/scatters are not considered scalar with predication.
6022           !Legal->blockNeedsPredication(I.getParent())) {
6023         // TODO: Avoid replicating loads and stores instead of
6024         // relying on instcombine to remove them.
6025         // Load: Scalar load + broadcast
6026         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6027         unsigned Cost = getUniformMemOpCost(&I, VF);
6028         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6029         continue;
6030       }
6031
6032       // We assume that widening is the best solution when possible.
6033       if (memoryInstructionCanBeWidened(&I, VF)) {
6034         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6035         int ConsecutiveStride =
6036                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6037         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6038                "Expected consecutive stride.");
6039         InstWidening Decision =
6040             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6041         setWideningDecision(&I, VF, Decision, Cost);
6042         continue;
6043       }
6044
6045       // Choose between Interleaving, Gather/Scatter or Scalarization.
6046       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6047       unsigned NumAccesses = 1;
6048       if (isAccessInterleaved(&I)) {
6049         auto Group = getInterleavedAccessGroup(&I);
6050         assert(Group && "Fail to get an interleaved access group.");
6051
6052         // Make one decision for the whole group.
6053         if (getWideningDecision(&I, VF) != CM_Unknown)
6054           continue;
6055
6056         NumAccesses = Group->getNumMembers();
6057         if (interleavedAccessCanBeWidened(&I, VF))
6058           InterleaveCost = getInterleaveGroupCost(&I, VF);
6059       }
6060
6061       unsigned GatherScatterCost =
6062           isLegalGatherOrScatter(&I)
6063               ? getGatherScatterCost(&I, VF) * NumAccesses
6064               : std::numeric_limits<unsigned>::max();
6065
6066       unsigned ScalarizationCost =
6067           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6068
6069       // Choose better solution for the current VF,
6070       // write down this decision and use it during vectorization.
6071       unsigned Cost;
6072       InstWidening Decision;
6073       if (InterleaveCost <= GatherScatterCost &&
6074           InterleaveCost < ScalarizationCost) {
6075         Decision = CM_Interleave;
6076         Cost = InterleaveCost;
6077       } else if (GatherScatterCost < ScalarizationCost) {
6078         Decision = CM_GatherScatter;
6079         Cost = GatherScatterCost;
6080       } else {
6081         Decision = CM_Scalarize;
6082         Cost = ScalarizationCost;
6083       }
6084       // If the instructions belongs to an interleave group, the whole group
6085       // receives the same decision. The whole group receives the cost, but
6086       // the cost will actually be assigned to one instruction.
6087       if (auto Group = getInterleavedAccessGroup(&I))
6088         setWideningDecision(Group, VF, Decision, Cost);
6089       else
6090         setWideningDecision(&I, VF, Decision, Cost);
6091     }
6092   }
6093
6094   // Make sure that any load of address and any other address computation
6095   // remains scalar unless there is gather/scatter support. This avoids
6096   // inevitable extracts into address registers, and also has the benefit of
6097   // activating LSR more, since that pass can't optimize vectorized
6098   // addresses.
6099   if (TTI.prefersVectorizedAddressing())
6100     return;
6101
6102   // Start with all scalar pointer uses.
6103   SmallPtrSet<Instruction *, 8> AddrDefs;
6104   for (BasicBlock *BB : TheLoop->blocks())
6105     for (Instruction &I : *BB) {
6106       Instruction *PtrDef =
6107         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6108       if (PtrDef && TheLoop->contains(PtrDef) &&
6109           getWideningDecision(&I, VF) != CM_GatherScatter)
6110         AddrDefs.insert(PtrDef);
6111     }
6112
6113   // Add all instructions used to generate the addresses.
6114   SmallVector<Instruction *, 4> Worklist;
6115   for (auto *I : AddrDefs)
6116     Worklist.push_back(I);
6117   while (!Worklist.empty()) {
6118     Instruction *I = Worklist.pop_back_val();
6119     for (auto &Op : I->operands())
6120       if (auto *InstOp = dyn_cast<Instruction>(Op))
6121         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6122             AddrDefs.insert(InstOp).second)
6123           Worklist.push_back(InstOp);
6124   }
6125
6126   for (auto *I : AddrDefs) {
6127     if (isa<LoadInst>(I)) {
6128       // Setting the desired widening decision should ideally be handled in
6129       // by cost functions, but since this involves the task of finding out
6130       // if the loaded register is involved in an address computation, it is
6131       // instead changed here when we know this is the case.
6132       InstWidening Decision = getWideningDecision(I, VF);
6133       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6134         // Scalarize a widened load of address.
6135         setWideningDecision(I, VF, CM_Scalarize,
6136                             (VF * getMemoryInstructionCost(I, 1)));
6137       else if (auto Group = getInterleavedAccessGroup(I)) {
6138         // Scalarize an interleave group of address loads.
6139         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6140           if (Instruction *Member = Group->getMember(I))
6141             setWideningDecision(Member, VF, CM_Scalarize,
6142                                 (VF * getMemoryInstructionCost(Member, 1)));
6143         }
6144       }
6145     } else
6146       // Make sure I gets scalarized and a cost estimate without
6147       // scalarization overhead.
6148       ForcedScalars[VF].insert(I);
6149   }
6150 }
6151
6152 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6153                                                         unsigned VF,
6154                                                         Type *&VectorTy) {
6155   Type *RetTy = I->getType();
6156   if (canTruncateToMinimalBitwidth(I, VF))
6157     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6158   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6159   auto SE = PSE.getSE();
6160
6161   // TODO: We need to estimate the cost of intrinsic calls.
6162   switch (I->getOpcode()) {
6163   case Instruction::GetElementPtr:
6164     // We mark this instruction as zero-cost because the cost of GEPs in
6165     // vectorized code depends on whether the corresponding memory instruction
6166     // is scalarized or not. Therefore, we handle GEPs with the memory
6167     // instruction cost.
6168     return 0;
6169   case Instruction::Br: {
6170     // In cases of scalarized and predicated instructions, there will be VF
6171     // predicated blocks in the vectorized loop. Each branch around these
6172     // blocks requires also an extract of its vector compare i1 element.
6173     bool ScalarPredicatedBB = false;
6174     BranchInst *BI = cast<BranchInst>(I);
6175     if (VF > 1 && BI->isConditional() &&
6176         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6177              PredicatedBBsAfterVectorization.end() ||
6178          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6179              PredicatedBBsAfterVectorization.end()))
6180       ScalarPredicatedBB = true;
6181
6182     if (ScalarPredicatedBB) {
6183       // Return cost for branches around scalarized and predicated blocks.
6184       Type *Vec_i1Ty =
6185           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6186       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6187               (TTI.getCFInstrCost(Instruction::Br) * VF));
6188     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6189       // The back-edge branch will remain, as will all scalar branches.
6190       return TTI.getCFInstrCost(Instruction::Br);
6191     else
6192       // This branch will be eliminated by if-conversion.
6193       return 0;
6194     // Note: We currently assume zero cost for an unconditional branch inside
6195     // a predicated block since it will become a fall-through, although we
6196     // may decide in the future to call TTI for all branches.
6197   }
6198   case Instruction::PHI: {
6199     auto *Phi = cast<PHINode>(I);
6200
6201     // First-order recurrences are replaced by vector shuffles inside the loop.
6202     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6203     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6204       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6205                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6206
6207     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6208     // converted into select instructions. We require N - 1 selects per phi
6209     // node, where N is the number of incoming values.
6210     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6211       return (Phi->getNumIncomingValues() - 1) *
6212              TTI.getCmpSelInstrCost(
6213                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6214                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6215
6216     return TTI.getCFInstrCost(Instruction::PHI);
6217   }
6218   case Instruction::UDiv:
6219   case Instruction::SDiv:
6220   case Instruction::URem:
6221   case Instruction::SRem:
6222     // If we have a predicated instruction, it may not be executed for each
6223     // vector lane. Get the scalarization cost and scale this amount by the
6224     // probability of executing the predicated block. If the instruction is not
6225     // predicated, we fall through to the next case.
6226     if (VF > 1 && isScalarWithPredication(I)) {
6227       unsigned Cost = 0;
6228
6229       // These instructions have a non-void type, so account for the phi nodes
6230       // that we will create. This cost is likely to be zero. The phi node
6231       // cost, if any, should be scaled by the block probability because it
6232       // models a copy at the end of each predicated block.
6233       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6234
6235       // The cost of the non-predicated instruction.
6236       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6237
6238       // The cost of insertelement and extractelement instructions needed for
6239       // scalarization.
6240       Cost += getScalarizationOverhead(I, VF);
6241
6242       // Scale the cost by the probability of executing the predicated blocks.
6243       // This assumes the predicated block for each vector lane is equally
6244       // likely.
6245       return Cost / getReciprocalPredBlockProb();
6246     }
6247     LLVM_FALLTHROUGH;
6248   case Instruction::Add:
6249   case Instruction::FAdd:
6250   case Instruction::Sub:
6251   case Instruction::FSub:
6252   case Instruction::Mul:
6253   case Instruction::FMul:
6254   case Instruction::FDiv:
6255   case Instruction::FRem:
6256   case Instruction::Shl:
6257   case Instruction::LShr:
6258   case Instruction::AShr:
6259   case Instruction::And:
6260   case Instruction::Or:
6261   case Instruction::Xor: {
6262     // Since we will replace the stride by 1 the multiplication should go away.
6263     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6264       return 0;
6265     // Certain instructions can be cheaper to vectorize if they have a constant
6266     // second vector operand. One example of this are shifts on x86.
6267     Value *Op2 = I->getOperand(1);
6268     TargetTransformInfo::OperandValueProperties Op2VP;
6269     TargetTransformInfo::OperandValueKind Op2VK =
6270         TTI.getOperandInfo(Op2, Op2VP);
6271     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6272       Op2VK = TargetTransformInfo::OK_UniformValue;
6273
6274     SmallVector<const Value *, 4> Operands(I->operand_values());
6275     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6276     return N * TTI.getArithmeticInstrCost(
6277                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6278                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6279   }
6280   case Instruction::FNeg: {
6281     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6282     return N * TTI.getArithmeticInstrCost(
6283                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6284                    TargetTransformInfo::OK_AnyValue,
6285                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6286                    I->getOperand(0), I);
6287   }
6288   case Instruction::Select: {
6289     SelectInst *SI = cast<SelectInst>(I);
6290     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6291     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6292     Type *CondTy = SI->getCondition()->getType();
6293     if (!ScalarCond)
6294       CondTy = VectorType::get(CondTy, VF);
6295
6296     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6297   }
6298   case Instruction::ICmp:
6299   case Instruction::FCmp: {
6300     Type *ValTy = I->getOperand(0)->getType();
6301     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6302     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6303       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6304     VectorTy = ToVectorTy(ValTy, VF);
6305     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6306   }
6307   case Instruction::Store:
6308   case Instruction::Load: {
6309     unsigned Width = VF;
6310     if (Width > 1) {
6311       InstWidening Decision = getWideningDecision(I, Width);
6312       assert(Decision != CM_Unknown &&
6313              "CM decision should be taken at this point");
6314       if (Decision == CM_Scalarize)
6315         Width = 1;
6316     }
6317     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6318     return getMemoryInstructionCost(I, VF);
6319   }
6320   case Instruction::ZExt:
6321   case Instruction::SExt:
6322   case Instruction::FPToUI:
6323   case Instruction::FPToSI:
6324   case Instruction::FPExt:
6325   case Instruction::PtrToInt:
6326   case Instruction::IntToPtr:
6327   case Instruction::SIToFP:
6328   case Instruction::UIToFP:
6329   case Instruction::Trunc:
6330   case Instruction::FPTrunc:
6331   case Instruction::BitCast: {
6332     // We optimize the truncation of induction variables having constant
6333     // integer steps. The cost of these truncations is the same as the scalar
6334     // operation.
6335     if (isOptimizableIVTruncate(I, VF)) {
6336       auto *Trunc = cast<TruncInst>(I);
6337       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6338                                   Trunc->getSrcTy(), Trunc);
6339     }
6340
6341     Type *SrcScalarTy = I->getOperand(0)->getType();
6342     Type *SrcVecTy =
6343         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6344     if (canTruncateToMinimalBitwidth(I, VF)) {
6345       // This cast is going to be shrunk. This may remove the cast or it might
6346       // turn it into slightly different cast. For example, if MinBW == 16,
6347       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6348       //
6349       // Calculate the modified src and dest types.
6350       Type *MinVecTy = VectorTy;
6351       if (I->getOpcode() == Instruction::Trunc) {
6352         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6353         VectorTy =
6354             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6355       } else if (I->getOpcode() == Instruction::ZExt ||
6356                  I->getOpcode() == Instruction::SExt) {
6357         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6358         VectorTy =
6359             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6360       }
6361     }
6362
6363     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6364     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6365   }
6366   case Instruction::Call: {
6367     bool NeedToScalarize;
6368     CallInst *CI = cast<CallInst>(I);
6369     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6370     if (getVectorIntrinsicIDForCall(CI, TLI))
6371       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6372     return CallCost;
6373   }
6374   default:
6375     // The cost of executing VF copies of the scalar instruction. This opcode
6376     // is unknown. Assume that it is the same as 'mul'.
6377     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6378            getScalarizationOverhead(I, VF);
6379   } // end of switch.
6380 }
6381
// Definition of the static ID member of the LoopVectorize pass.
char LoopVectorize::ID = 0;

// Human-readable pass name handed to the registration macros below.
static const char lv_name[] = "Loop Vectorization";

// Register the LoopVectorize pass under LV_NAME / lv_name and declare the
// passes it depends on. Every dependency must be listed between the
// INITIALIZE_PASS_BEGIN and INITIALIZE_PASS_END markers.
// NOTE(review): the two trailing `false` arguments are presumably the
// "CFG only" and "is analysis" flags of the macro -- confirm against
// PassSupport.h.
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6401
6402 namespace llvm {
6403
6404 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6405
6406 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6407                               bool VectorizeOnlyWhenForced) {
6408   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6409 }
6410
6411 } // end namespace llvm
6412
6413 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6414   // Check if the pointer operand of a load or store instruction is
6415   // consecutive.
6416   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6417     return Legal->isConsecutivePtr(Ptr);
6418   return false;
6419 }
6420
6421 void LoopVectorizationCostModel::collectValuesToIgnore() {
6422   // Ignore ephemeral values.
6423   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6424
6425   // Ignore type-promoting instructions we identified during reduction
6426   // detection.
6427   for (auto &Reduction : *Legal->getReductionVars()) {
6428     RecurrenceDescriptor &RedDes = Reduction.second;
6429     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6430     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6431   }
6432   // Ignore type-casting instructions we identified during induction
6433   // detection.
6434   for (auto &Induction : *Legal->getInductionVars()) {
6435     InductionDescriptor &IndDes = Induction.second;
6436     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6437     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6438   }
6439 }
6440
6441 // TODO: we could return a pair of values that specify the max VF and
6442 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6443 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6444 // doesn't have a cost model that can choose which plan to execute if
6445 // more than one is generated.
6446 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6447                                  LoopVectorizationCostModel &CM) {
6448   unsigned WidestType;
6449   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6450   return WidestVectorRegBits / WidestType;
6451 }
6452
6453 VectorizationFactor
6454 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6455   unsigned VF = UserVF;
6456   // Outer loop handling: They may require CFG and instruction level
6457   // transformations before even evaluating whether vectorization is profitable.
6458   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6459   // the vectorization pipeline.
6460   if (!OrigLoop->empty()) {
6461     // If the user doesn't provide a vectorization factor, determine a
6462     // reasonable one.
6463     if (!UserVF) {
6464       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6465       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6466
6467       // Make sure we have a VF > 1 for stress testing.
6468       if (VPlanBuildStressTest && VF < 2) {
6469         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6470                           << "overriding computed VF.\n");
6471         VF = 4;
6472       }
6473     }
6474     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6475     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6476     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6477                       << " to build VPlans.\n");
6478     buildVPlans(VF, VF);
6479
6480     // For VPlan build stress testing, we bail out after VPlan construction.
6481     if (VPlanBuildStressTest)
6482       return VectorizationFactor::Disabled();
6483
6484     return {VF, 0};
6485   }
6486
6487   LLVM_DEBUG(
6488       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6489                 "VPlan-native path.\n");
6490   return VectorizationFactor::Disabled();
6491 }
6492
6493 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6494   assert(OrigLoop->empty() && "Inner loop expected.");
6495   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6496   if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
6497     return None;
6498
6499   // Invalidate interleave groups if all blocks of loop will be predicated.
6500   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6501       !useMaskedInterleavedAccesses(*TTI)) {
6502     LLVM_DEBUG(
6503         dbgs()
6504         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6505            "which requires masked-interleaved support.\n");
6506     CM.InterleaveInfo.reset();
6507   }
6508
6509   if (UserVF) {
6510     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6511     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6512     // Collect the instructions (and their associated costs) that will be more
6513     // profitable to scalarize.
6514     CM.selectUserVectorizationFactor(UserVF);
6515     buildVPlansWithVPRecipes(UserVF, UserVF);
6516     LLVM_DEBUG(printPlans(dbgs()));
6517     return {{UserVF, 0}};
6518   }
6519
6520   unsigned MaxVF = MaybeMaxVF.getValue();
6521   assert(MaxVF != 0 && "MaxVF is zero.");
6522
6523   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6524     // Collect Uniform and Scalar instructions after vectorization with VF.
6525     CM.collectUniformsAndScalars(VF);
6526
6527     // Collect the instructions (and their associated costs) that will be more
6528     // profitable to scalarize.
6529     if (VF > 1)
6530       CM.collectInstsToScalarize(VF);
6531   }
6532
6533   buildVPlansWithVPRecipes(1, MaxVF);
6534   LLVM_DEBUG(printPlans(dbgs()));
6535   if (MaxVF == 1)
6536     return VectorizationFactor::Disabled();
6537
6538   // Select the optimal vectorization factor.
6539   return CM.selectVectorizationFactor(MaxVF);
6540 }
6541
6542 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6543   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6544                     << '\n');
6545   BestVF = VF;
6546   BestUF = UF;
6547
6548   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6549     return !Plan->hasVF(VF);
6550   });
6551   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6552 }
6553
6554 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6555                                            DominatorTree *DT) {
6556   // Perform the actual loop transformation.
6557
6558   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6559   VPCallbackILV CallbackILV(ILV);
6560
6561   VPTransformState State{BestVF, BestUF,      LI,
6562                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6563                          &ILV,   CallbackILV};
6564   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6565   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6566
6567   //===------------------------------------------------===//
6568   //
6569   // Notice: any optimization or new instruction that go
6570   // into the code below should also be implemented in
6571   // the cost-model.
6572   //
6573   //===------------------------------------------------===//
6574
6575   // 2. Copy and widen instructions from the old loop into the new loop.
6576   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6577   VPlans.front()->execute(&State);
6578
6579   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6580   //    predication, updating analyses.
6581   ILV.fixVectorizedLoop();
6582 }
6583
6584 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6585     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6586   BasicBlock *Latch = OrigLoop->getLoopLatch();
6587
6588   // We create new control-flow for the vectorized loop, so the original
6589   // condition will be dead after vectorization if it's only used by the
6590   // branch.
6591   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6592   if (Cmp && Cmp->hasOneUse())
6593     DeadInstructions.insert(Cmp);
6594
6595   // We create new "steps" for induction variable updates to which the original
6596   // induction variables map. An original update instruction will be dead if
6597   // all its users except the induction variable are dead.
6598   for (auto &Induction : *Legal->getInductionVars()) {
6599     PHINode *Ind = Induction.first;
6600     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6601     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6602           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6603                                  DeadInstructions.end();
6604         }))
6605       DeadInstructions.insert(IndUpdate);
6606
6607     // We record as "Dead" also the type-casting instructions we had identified
6608     // during induction analysis. We don't need any handling for them in the
6609     // vectorized loop because we have proven that, under a proper runtime
6610     // test guarding the vectorized loop, the value of the phi, and the casted
6611     // value of the phi, are the same. The last instruction in this casting chain
6612     // will get its scalar/vector/widened def from the scalar/vector/widened def
6613     // of the respective phi node. Any other casts in the induction def-use chain
6614     // have no other uses outside the phi update chain, and will be ignored.
6615     InductionDescriptor &IndDes = Induction.second;
6616     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6617     DeadInstructions.insert(Casts.begin(), Casts.end());
6618   }
6619 }
6620
6621 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6622
6623 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6624
6625 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6626                                         Instruction::BinaryOps BinOp) {
6627   // When unrolling and the VF is 1, we only need to add a simple scalar.
6628   Type *Ty = Val->getType();
6629   assert(!Ty->isVectorTy() && "Val must be a scalar");
6630
6631   if (Ty->isFloatingPointTy()) {
6632     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6633
6634     // Floating point operations had to be 'fast' to enable the unrolling.
6635     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6636     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6637   }
6638   Constant *C = ConstantInt::get(Ty, StartIdx);
6639   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6640 }
6641
6642 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6643   SmallVector<Metadata *, 4> MDs;
6644   // Reserve first location for self reference to the LoopID metadata node.
6645   MDs.push_back(nullptr);
6646   bool IsUnrollMetadata = false;
6647   MDNode *LoopID = L->getLoopID();
6648   if (LoopID) {
6649     // First find existing loop unrolling disable metadata.
6650     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6651       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6652       if (MD) {
6653         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6654         IsUnrollMetadata =
6655             S && S->getString().startswith("llvm.loop.unroll.disable");
6656       }
6657       MDs.push_back(LoopID->getOperand(i));
6658     }
6659   }
6660
6661   if (!IsUnrollMetadata) {
6662     // Add runtime unroll disable metadata.
6663     LLVMContext &Context = L->getHeader()->getContext();
6664     SmallVector<Metadata *, 1> DisableOperands;
6665     DisableOperands.push_back(
6666         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6667     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6668     MDs.push_back(DisableNode);
6669     MDNode *NewLoopID = MDNode::get(Context, MDs);
6670     // Set operand 0 to refer to the loop id itself.
6671     NewLoopID->replaceOperandWith(0, NewLoopID);
6672     L->setLoopID(NewLoopID);
6673   }
6674 }
6675
6676 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6677     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6678   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6679   bool PredicateAtRangeStart = Predicate(Range.Start);
6680
6681   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6682     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6683       Range.End = TmpVF;
6684       break;
6685     }
6686
6687   return PredicateAtRangeStart;
6688 }
6689
6690 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6691 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6692 /// of VF's starting at a given VF and extending it as much as possible. Each
6693 /// vectorization decision can potentially shorten this sub-range during
6694 /// buildVPlan().
6695 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6696   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6697     VFRange SubRange = {VF, MaxVF + 1};
6698     VPlans.push_back(buildVPlan(SubRange));
6699     VF = SubRange.End;
6700   }
6701 }
6702
6703 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6704                                          VPlanPtr &Plan) {
6705   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6706
6707   // Look for cached value.
6708   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6709   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6710   if (ECEntryIt != EdgeMaskCache.end())
6711     return ECEntryIt->second;
6712
6713   VPValue *SrcMask = createBlockInMask(Src, Plan);
6714
6715   // The terminator has to be a branch inst!
6716   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6717   assert(BI && "Unexpected terminator found");
6718
6719   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6720     return EdgeMaskCache[Edge] = SrcMask;
6721
6722   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6723   assert(EdgeMask && "No Edge Mask found for condition");
6724
6725   if (BI->getSuccessor(0) != Dst)
6726     EdgeMask = Builder.createNot(EdgeMask);
6727
6728   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6729     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6730
6731   return EdgeMaskCache[Edge] = EdgeMask;
6732 }
6733
6734 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6735   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6736
6737   // Look for cached value.
6738   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6739   if (BCEntryIt != BlockMaskCache.end())
6740     return BCEntryIt->second;
6741
6742   // All-one mask is modelled as no-mask following the convention for masked
6743   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6744   VPValue *BlockMask = nullptr;
6745
6746   if (OrigLoop->getHeader() == BB) {
6747     if (!CM.blockNeedsPredication(BB))
6748       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6749
6750     // Introduce the early-exit compare IV <= BTC to form header block mask.
6751     // This is used instead of IV < TC because TC may wrap, unlike BTC.
6752     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6753     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6754     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6755     return BlockMaskCache[BB] = BlockMask;
6756   }
6757
6758   // This is the block mask. We OR all incoming edges.
6759   for (auto *Predecessor : predecessors(BB)) {
6760     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6761     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6762       return BlockMaskCache[BB] = EdgeMask;
6763
6764     if (!BlockMask) { // BlockMask has its initialized nullptr value.
6765       BlockMask = EdgeMask;
6766       continue;
6767     }
6768
6769     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6770   }
6771
6772   return BlockMaskCache[BB] = BlockMask;
6773 }
6774
6775 VPWidenMemoryInstructionRecipe *
6776 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6777                                   VPlanPtr &Plan) {
6778   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6779     return nullptr;
6780
6781   auto willWiden = [&](unsigned VF) -> bool {
6782     if (VF == 1)
6783       return false;
6784     LoopVectorizationCostModel::InstWidening Decision =
6785         CM.getWideningDecision(I, VF);
6786     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6787            "CM decision should be taken at this point.");
6788     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6789       return true;
6790     if (CM.isScalarAfterVectorization(I, VF) ||
6791         CM.isProfitableToScalarize(I, VF))
6792       return false;
6793     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6794   };
6795
6796   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6797     return nullptr;
6798
6799   VPValue *Mask = nullptr;
6800   if (Legal->isMaskRequired(I))
6801     Mask = createBlockInMask(I->getParent(), Plan);
6802
6803   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6804   return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
6805 }
6806
6807 VPWidenIntOrFpInductionRecipe *
6808 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6809   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6810     // Check if this is an integer or fp induction. If so, build the recipe that
6811     // produces its scalar and vector values.
6812     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6813     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6814         II.getKind() == InductionDescriptor::IK_FpInduction)
6815       return new VPWidenIntOrFpInductionRecipe(Phi);
6816
6817     return nullptr;
6818   }
6819
6820   // Optimize the special case where the source is a constant integer
6821   // induction variable. Notice that we can only optimize the 'trunc' case
6822   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6823   // (c) other casts depend on pointer size.
6824
6825   // Determine whether \p K is a truncation based on an induction variable that
6826   // can be optimized.
6827   auto isOptimizableIVTruncate =
6828       [&](Instruction *K) -> std::function<bool(unsigned)> {
6829     return
6830         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6831   };
6832
6833   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6834                                isOptimizableIVTruncate(I), Range))
6835     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6836                                              cast<TruncInst>(I));
6837   return nullptr;
6838 }
6839
6840 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6841   PHINode *Phi = dyn_cast<PHINode>(I);
6842   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6843     return nullptr;
6844
6845   // We know that all PHIs in non-header blocks are converted into selects, so
6846   // we don't have to worry about the insertion order and we can just use the
6847   // builder. At this point we generate the predication tree. There may be
6848   // duplications since this is a simple recursive scan, but future
6849   // optimizations will clean it up.
6850
6851   SmallVector<VPValue *, 2> Masks;
6852   unsigned NumIncoming = Phi->getNumIncomingValues();
6853   for (unsigned In = 0; In < NumIncoming; In++) {
6854     VPValue *EdgeMask =
6855       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6856     assert((EdgeMask || NumIncoming == 1) &&
6857            "Multiple predecessors with one having a full mask");
6858     if (EdgeMask)
6859       Masks.push_back(EdgeMask);
6860   }
6861   return new VPBlendRecipe(Phi, Masks);
6862 }
6863
6864 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6865                                  VFRange &Range) {
6866
6867   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6868       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6869
6870   if (IsPredicated)
6871     return false;
6872
6873   auto IsVectorizableOpcode = [](unsigned Opcode) {
6874     switch (Opcode) {
6875     case Instruction::Add:
6876     case Instruction::And:
6877     case Instruction::AShr:
6878     case Instruction::BitCast:
6879     case Instruction::Br:
6880     case Instruction::Call:
6881     case Instruction::FAdd:
6882     case Instruction::FCmp:
6883     case Instruction::FDiv:
6884     case Instruction::FMul:
6885     case Instruction::FNeg:
6886     case Instruction::FPExt:
6887     case Instruction::FPToSI:
6888     case Instruction::FPToUI:
6889     case Instruction::FPTrunc:
6890     case Instruction::FRem:
6891     case Instruction::FSub:
6892     case Instruction::ICmp:
6893     case Instruction::IntToPtr:
6894     case Instruction::Load:
6895     case Instruction::LShr:
6896     case Instruction::Mul:
6897     case Instruction::Or:
6898     case Instruction::PHI:
6899     case Instruction::PtrToInt:
6900     case Instruction::SDiv:
6901     case Instruction::Select:
6902     case Instruction::SExt:
6903     case Instruction::Shl:
6904     case Instruction::SIToFP:
6905     case Instruction::SRem:
6906     case Instruction::Store:
6907     case Instruction::Sub:
6908     case Instruction::Trunc:
6909     case Instruction::UDiv:
6910     case Instruction::UIToFP:
6911     case Instruction::URem:
6912     case Instruction::Xor:
6913     case Instruction::ZExt:
6914       return true;
6915     }
6916     return false;
6917   };
6918
6919   if (!IsVectorizableOpcode(I->getOpcode()))
6920     return false;
6921
6922   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6923     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6924     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6925                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6926       return false;
6927   }
6928
6929   auto willWiden = [&](unsigned VF) -> bool {
6930     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6931                              CM.isProfitableToScalarize(I, VF)))
6932       return false;
6933     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6934       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6935       // The following case may be scalarized depending on the VF.
6936       // The flag shows whether we use Intrinsic or a usual Call for vectorized
6937       // version of the instruction.
6938       // Is it beneficial to perform intrinsic call compared to lib call?
6939       bool NeedToScalarize;
6940       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6941       bool UseVectorIntrinsic =
6942           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6943       return UseVectorIntrinsic || !NeedToScalarize;
6944     }
6945     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6946       assert(CM.getWideningDecision(I, VF) ==
6947                  LoopVectorizationCostModel::CM_Scalarize &&
6948              "Memory widening decisions should have been taken care by now");
6949       return false;
6950     }
6951     return true;
6952   };
6953
6954   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6955     return false;
6956   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6957   // to avoid having to split recipes later.
6958   bool IsSingleton = Ingredient2Recipe.count(I);
6959
6960   // Success: widen this instruction.
6961
6962   // Use the default widening recipe. We optimize the common case where
6963   // consecutive instructions can be represented by a single recipe.
6964   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6965       LastExtensibleRecipe->appendInstruction(I))
6966     return true;
6967
6968   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6969   if (!IsSingleton)
6970     LastExtensibleRecipe = WidenRecipe;
6971   setRecipe(I, WidenRecipe);
6972   VPBB->appendRecipe(WidenRecipe);
6973   return true;
6974 }
6975
6976 VPBasicBlock *VPRecipeBuilder::handleReplication(
6977     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6978     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6979     VPlanPtr &Plan) {
6980   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6981       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6982       Range);
6983
6984   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6985       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6986
6987   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6988   setRecipe(I, Recipe);
6989
6990   // Find if I uses a predicated instruction. If so, it will use its scalar
6991   // value. Avoid hoisting the insert-element which packs the scalar value into
6992   // a vector value, as that happens iff all users use the vector value.
6993   for (auto &Op : I->operands())
6994     if (auto *PredInst = dyn_cast<Instruction>(Op))
6995       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6996         PredInst2Recipe[PredInst]->setAlsoPack(false);
6997
6998   // Finalize the recipe for Instr, first if it is not predicated.
6999   if (!IsPredicated) {
7000     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7001     VPBB->appendRecipe(Recipe);
7002     return VPBB;
7003   }
7004   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7005   assert(VPBB->getSuccessors().empty() &&
7006          "VPBB has successors when handling predicated replication.");
7007   // Record predicated instructions for above packing optimizations.
7008   PredInst2Recipe[I] = Recipe;
7009   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7010   VPBlockUtils::insertBlockAfter(Region, VPBB);
7011   auto *RegSucc = new VPBasicBlock();
7012   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7013   return RegSucc;
7014 }
7015
7016 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7017                                                       VPRecipeBase *PredRecipe,
7018                                                       VPlanPtr &Plan) {
7019   // Instructions marked for predication are replicated and placed under an
7020   // if-then construct to prevent side-effects.
7021
7022   // Generate recipes to compute the block mask for this region.
7023   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7024
7025   // Build the triangular if-then region.
7026   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7027   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7028   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7029   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7030   auto *PHIRecipe =
7031       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7032   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7033   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7034   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7035
7036   // Note: first set Entry as region entry and then connect successors starting
7037   // from it in order, to propagate the "parent" of each VPBasicBlock.
7038   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7039   VPBlockUtils::connectBlocks(Pred, Exit);
7040
7041   return Region;
7042 }
7043
7044 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7045                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7046   VPRecipeBase *Recipe = nullptr;
7047
7048   // First, check for specific widening recipes that deal with memory
7049   // operations, inductions and Phi nodes.
7050   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7051       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7052       (Recipe = tryToBlend(Instr, Plan)) ||
7053       (isa<PHINode>(Instr) &&
7054        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7055     setRecipe(Instr, Recipe);
7056     VPBB->appendRecipe(Recipe);
7057     return true;
7058   }
7059
7060   // Handle GEP widening.
7061   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7062     auto Scalarize = [&](unsigned VF) {
7063       return CM.isScalarWithPredication(Instr, VF) ||
7064              CM.isScalarAfterVectorization(Instr, VF) ||
7065              CM.isProfitableToScalarize(Instr, VF);
7066     };
7067     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7068       return false;
7069     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7070     setRecipe(Instr, Recipe);
7071     VPBB->appendRecipe(Recipe);
7072     return true;
7073   }
7074
7075   // Check if Instr is to be widened by a general VPWidenRecipe, after
7076   // having first checked for specific widening recipes.
7077   if (tryToWiden(Instr, VPBB, Range))
7078     return true;
7079
7080   return false;
7081 }
7082
7083 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7084                                                         unsigned MaxVF) {
7085   assert(OrigLoop->empty() && "Inner loop expected.");
7086
7087   // Collect conditions feeding internal conditional branches; they need to be
7088   // represented in VPlan for it to model masking.
7089   SmallPtrSet<Value *, 1> NeedDef;
7090
7091   auto *Latch = OrigLoop->getLoopLatch();
7092   for (BasicBlock *BB : OrigLoop->blocks()) {
7093     if (BB == Latch)
7094       continue;
7095     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7096     if (Branch && Branch->isConditional())
7097       NeedDef.insert(Branch->getCondition());
7098   }
7099
7100   // If the tail is to be folded by masking, the primary induction variable
7101   // needs to be represented in VPlan for it to model early-exit masking.
7102   // Also, both the Phi and the live-out instruction of each reduction are
7103   // required in order to introduce a select between them in VPlan.
7104   if (CM.foldTailByMasking()) {
7105     NeedDef.insert(Legal->getPrimaryInduction());
7106     for (auto &Reduction : *Legal->getReductionVars()) {
7107       NeedDef.insert(Reduction.first);
7108       NeedDef.insert(Reduction.second.getLoopExitInstr());
7109     }
7110   }
7111
7112   // Collect instructions from the original loop that will become trivially dead
7113   // in the vectorized loop. We don't need to vectorize these instructions. For
7114   // example, original induction update instructions can become dead because we
7115   // separately emit induction "steps" when generating code for the new loop.
7116   // Similarly, we create a new latch condition when setting up the structure
7117   // of the new loop, so the old one can become dead.
7118   SmallPtrSet<Instruction *, 4> DeadInstructions;
7119   collectTriviallyDeadInstructions(DeadInstructions);
7120
7121   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7122   // Dead instructions do not need sinking. Remove them from SinkAfter.
7123   for (Instruction *I : DeadInstructions)
7124     SinkAfter.erase(I);
7125
7126   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7127     VFRange SubRange = {VF, MaxVF + 1};
7128     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7129                                              DeadInstructions, SinkAfter));
7130     VF = SubRange.End;
7131   }
7132 }
7133
7134 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7135     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7136     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7137     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7138
7139   // Hold a mapping from predicated instructions to their recipes, in order to
7140   // fix their AlsoPack behavior if a user is determined to replicate and use a
7141   // scalar instead of vector value.
7142   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7143
7144   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7145
7146   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7147
7148   // ---------------------------------------------------------------------------
7149   // Pre-construction: record ingredients whose recipes we'll need to further
7150   // process after constructing the initial VPlan.
7151   // ---------------------------------------------------------------------------
7152
7153   // Mark instructions we'll need to sink later and their targets as
7154   // ingredients whose recipe we'll need to record.
7155   for (auto &Entry : SinkAfter) {
7156     RecipeBuilder.recordRecipeOf(Entry.first);
7157     RecipeBuilder.recordRecipeOf(Entry.second);
7158   }
7159
7160   // For each interleave group which is relevant for this (possibly trimmed)
7161   // Range, add it to the set of groups to be later applied to the VPlan and add
7162   // placeholders for its members' Recipes which we'll be replacing with a
7163   // single VPInterleaveRecipe.
7164   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7165     auto applyIG = [IG, this](unsigned VF) -> bool {
7166       return (VF >= 2 && // Query is illegal for VF == 1
7167               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7168                   LoopVectorizationCostModel::CM_Interleave);
7169     };
7170     if (!getDecisionAndClampRange(applyIG, Range))
7171       continue;
7172     InterleaveGroups.insert(IG);
7173     for (unsigned i = 0; i < IG->getFactor(); i++)
7174       if (Instruction *Member = IG->getMember(i))
7175         RecipeBuilder.recordRecipeOf(Member);
7176   };
7177
7178   // ---------------------------------------------------------------------------
7179   // Build initial VPlan: Scan the body of the loop in a topological order to
7180   // visit each basic block after having visited its predecessor basic blocks.
7181   // ---------------------------------------------------------------------------
7182
7183   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7184   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7185   auto Plan = std::make_unique<VPlan>(VPBB);
7186
7187   // Represent values that will have defs inside VPlan.
7188   for (Value *V : NeedDef)
7189     Plan->addVPValue(V);
7190
7191   // Scan the body of the loop in a topological order to visit each basic block
7192   // after having visited its predecessor basic blocks.
7193   LoopBlocksDFS DFS(OrigLoop);
7194   DFS.perform(LI);
7195
7196   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7197     // Relevant instructions from basic block BB will be grouped into VPRecipe
7198     // ingredients and fill a new VPBasicBlock.
7199     unsigned VPBBsForBB = 0;
7200     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7201     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7202     VPBB = FirstVPBBForBB;
7203     Builder.setInsertPoint(VPBB);
7204
7205     // Introduce each ingredient into VPlan.
7206     for (Instruction &I : BB->instructionsWithoutDebug()) {
7207       Instruction *Instr = &I;
7208
7209       // First filter out irrelevant instructions, to ensure no recipes are
7210       // built for them.
7211       if (isa<BranchInst>(Instr) ||
7212           DeadInstructions.find(Instr) != DeadInstructions.end())
7213         continue;
7214
7215       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7216         continue;
7217
7218       // Otherwise, if all widening options failed, Instruction is to be
7219       // replicated. This may create a successor for VPBB.
7220       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7221           Instr, Range, VPBB, PredInst2Recipe, Plan);
7222       if (NextVPBB != VPBB) {
7223         VPBB = NextVPBB;
7224         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7225                                     : "");
7226       }
7227     }
7228   }
7229
7230   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7231   // may also be empty, such as the last one VPBB, reflecting original
7232   // basic-blocks with no recipes.
7233   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7234   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7235   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7236   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7237   delete PreEntry;
7238
7239   // ---------------------------------------------------------------------------
7240   // Transform initial VPlan: Apply previously taken decisions, in order, to
7241   // bring the VPlan to its final state.
7242   // ---------------------------------------------------------------------------
7243
7244   // Apply Sink-After legal constraints.
7245   for (auto &Entry : SinkAfter) {
7246     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7247     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7248     Sink->moveAfter(Target);
7249   }
7250
7251   // Interleave memory: for each Interleave Group we marked earlier as relevant
7252   // for this VPlan, replace the Recipes widening its memory instructions with a
7253   // single VPInterleaveRecipe at its insertion point.
7254   for (auto IG : InterleaveGroups) {
7255     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7256         RecipeBuilder.getRecipe(IG->getInsertPos()));
7257     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7258         ->insertBefore(Recipe);
7259
7260     for (unsigned i = 0; i < IG->getFactor(); ++i)
7261       if (Instruction *Member = IG->getMember(i)) {
7262         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7263       }
7264   }
7265
7266   // Finally, if tail is folded by masking, introduce selects between the phi
7267   // and the live-out instruction of each reduction, at the end of the latch.
7268   if (CM.foldTailByMasking()) {
7269     Builder.setInsertPoint(VPBB);
7270     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7271     for (auto &Reduction : *Legal->getReductionVars()) {
7272       VPValue *Phi = Plan->getVPValue(Reduction.first);
7273       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7274       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7275     }
7276   }
7277
7278   std::string PlanName;
7279   raw_string_ostream RSO(PlanName);
7280   unsigned VF = Range.Start;
7281   Plan->addVF(VF);
7282   RSO << "Initial VPlan for VF={" << VF;
7283   for (VF *= 2; VF < Range.End; VF *= 2) {
7284     Plan->addVF(VF);
7285     RSO << "," << VF;
7286   }
7287   RSO << "},UF>=1";
7288   RSO.flush();
7289   Plan->setName(PlanName);
7290
7291   return Plan;
7292 }
7293
7294 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7295   // Outer loop handling: They may require CFG and instruction level
7296   // transformations before even evaluating whether vectorization is profitable.
7297   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7298   // the vectorization pipeline.
7299   assert(!OrigLoop->empty());
7300   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7301
7302   // Create new empty VPlan
7303   auto Plan = std::make_unique<VPlan>();
7304
7305   // Build hierarchical CFG
7306   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7307   HCFGBuilder.buildHierarchicalCFG();
7308
7309   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7310     Plan->addVF(VF);
7311
7312   if (EnableVPlanPredication) {
7313     VPlanPredicator VPP(*Plan);
7314     VPP.predicate();
7315
7316     // Avoid running transformation to recipes until masked code generation in
7317     // VPlan-native path is in place.
7318     return Plan;
7319   }
7320
7321   SmallPtrSet<Instruction *, 1> DeadInstructions;
7322   VPlanTransforms::VPInstructionsToVPRecipes(
7323       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7324   return Plan;
7325 }
7326
7327 Value* LoopVectorizationPlanner::VPCallbackILV::
7328 getOrCreateVectorValues(Value *V, unsigned Part) {
7329       return ILV.getOrCreateVectorValue(V, Part);
7330 }
7331
7332 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7333     Value *V, const VPIteration &Instance) {
7334   return ILV.getOrCreateScalarValue(V, Instance);
7335 }
7336
7337 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7338   O << " +\n"
7339     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7340   IG->getInsertPos()->printAsOperand(O, false);
7341   O << ", ";
7342   getAddr()->printAsOperand(O);
7343   VPValue *Mask = getMask();
7344   if (Mask) {
7345     O << ", ";
7346     Mask->printAsOperand(O);
7347   }
7348   O << "\\l\"";
7349   for (unsigned i = 0; i < IG->getFactor(); ++i)
7350     if (Instruction *I = IG->getMember(i))
7351       O << " +\n"
7352         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7353 }
7354
7355 void VPWidenRecipe::execute(VPTransformState &State) {
7356   for (auto &Instr : make_range(Begin, End))
7357     State.ILV->widenInstruction(Instr);
7358 }
7359
7360 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7361   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7362                       IsIndexLoopInvariant);
7363 }
7364
7365 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7366   assert(!State.Instance && "Int or FP induction being replicated.");
7367   State.ILV->widenIntOrFpInduction(IV, Trunc);
7368 }
7369
7370 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7371   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7372 }
7373
7374 void VPBlendRecipe::execute(VPTransformState &State) {
7375   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7376   // We know that all PHIs in non-header blocks are converted into
7377   // selects, so we don't have to worry about the insertion order and we
7378   // can just use the builder.
7379   // At this point we generate the predication tree. There may be
7380   // duplications since this is a simple recursive scan, but future
7381   // optimizations will clean it up.
7382
7383   unsigned NumIncoming = Phi->getNumIncomingValues();
7384
7385   assert((User || NumIncoming == 1) &&
7386          "Multiple predecessors with predecessors having a full mask");
7387   // Generate a sequence of selects of the form:
7388   // SELECT(Mask3, In3,
7389   //      SELECT(Mask2, In2,
7390   //                   ( ...)))
7391   InnerLoopVectorizer::VectorParts Entry(State.UF);
7392   for (unsigned In = 0; In < NumIncoming; ++In) {
7393     for (unsigned Part = 0; Part < State.UF; ++Part) {
7394       // We might have single edge PHIs (blocks) - use an identity
7395       // 'select' for the first PHI operand.
7396       Value *In0 =
7397           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7398       if (In == 0)
7399         Entry[Part] = In0; // Initialize with the first incoming value.
7400       else {
7401         // Select between the current value and the previous incoming edge
7402         // based on the incoming mask.
7403         Value *Cond = State.get(User->getOperand(In), Part);
7404         Entry[Part] =
7405             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7406       }
7407     }
7408   }
7409   for (unsigned Part = 0; Part < State.UF; ++Part)
7410     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7411 }
7412
7413 void VPInterleaveRecipe::execute(VPTransformState &State) {
7414   assert(!State.Instance && "Interleave group being replicated.");
7415   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7416                                       getMask());
7417 }
7418
7419 void VPReplicateRecipe::execute(VPTransformState &State) {
7420   if (State.Instance) { // Generate a single instance.
7421     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7422     // Insert scalar instance packing it into a vector.
7423     if (AlsoPack && State.VF > 1) {
7424       // If we're constructing lane 0, initialize to start from undef.
7425       if (State.Instance->Lane == 0) {
7426         Value *Undef =
7427             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7428         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7429       }
7430       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7431     }
7432     return;
7433   }
7434
7435   // Generate scalar instances for all VF lanes of all UF parts, unless the
7436   // instruction is uniform inwhich case generate only the first lane for each
7437   // of the UF parts.
7438   unsigned EndLane = IsUniform ? 1 : State.VF;
7439   for (unsigned Part = 0; Part < State.UF; ++Part)
7440     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7441       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7442 }
7443
7444 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7445   assert(State.Instance && "Branch on Mask works only on single instance.");
7446
7447   unsigned Part = State.Instance->Part;
7448   unsigned Lane = State.Instance->Lane;
7449
7450   Value *ConditionBit = nullptr;
7451   if (!User) // Block in mask is all-one.
7452     ConditionBit = State.Builder.getTrue();
7453   else {
7454     VPValue *BlockInMask = User->getOperand(0);
7455     ConditionBit = State.get(BlockInMask, Part);
7456     if (ConditionBit->getType()->isVectorTy())
7457       ConditionBit = State.Builder.CreateExtractElement(
7458           ConditionBit, State.Builder.getInt32(Lane));
7459   }
7460
7461   // Replace the temporary unreachable terminator with a new conditional branch,
7462   // whose two destinations will be set later when they are created.
7463   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7464   assert(isa<UnreachableInst>(CurrentTerminator) &&
7465          "Expected to replace unreachable terminator with conditional branch.");
7466   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7467   CondBr->setSuccessor(0, nullptr);
7468   ReplaceInstWithInst(CurrentTerminator, CondBr);
7469 }
7470
7471 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7472   assert(State.Instance && "Predicated instruction PHI works per instance.");
7473   Instruction *ScalarPredInst = cast<Instruction>(
7474       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7475   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7476   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7477   assert(PredicatingBB && "Predicated block has no single predecessor.");
7478
7479   // By current pack/unpack logic we need to generate only a single phi node: if
7480   // a vector value for the predicated instruction exists at this point it means
7481   // the instruction has vector users only, and a phi for the vector value is
7482   // needed. In this case the recipe of the predicated instruction is marked to
7483   // also do that packing, thereby "hoisting" the insert-element sequence.
7484   // Otherwise, a phi node for the scalar value is needed.
7485   unsigned Part = State.Instance->Part;
7486   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7487     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7488     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7489     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7490     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7491     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7492     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7493   } else {
7494     Type *PredInstType = PredInst->getType();
7495     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7496     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7497     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7498     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7499   }
7500 }
7501
7502 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7503   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
7504 }
7505
7506 // Determine how to lower the scalar epilogue, which depends on 1) optimising
7507 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7508 // predication, and 4) a TTI hook that analyses whether the loop is suitable
7509 // for predication.
7510 static ScalarEpilogueLowering getScalarEpilogueLowering(
7511     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7512     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7513     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7514     LoopVectorizationLegality &LVL) {
7515   bool OptSize =
7516       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7517                                                      PGSOQueryType::IRPass);
7518   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7519   // don't look at hints or options, and don't request a scalar epilogue.
7520   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7521     return CM_ScalarEpilogueNotAllowedOptSize;
7522
7523   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7524                               !PreferPredicateOverEpilog;
7525
7526   // 2) Next, if disabling predication is requested on the command line, honour
7527   // this and request a scalar epilogue. Also do this if we don't have a
7528   // primary induction variable, which is required for predication.
7529   if (PredicateOptDisabled || !LVL.getPrimaryInduction())
7530     return CM_ScalarEpilogueAllowed;
7531
7532   // 3) and 4) look if enabling predication is requested on the command line,
7533   // with a loop hint, or if the TTI hook indicates this is profitable, request
7534   // predication .
7535   if (PreferPredicateOverEpilog ||
7536       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7537       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7538                                         LVL.getLAI()) &&
7539        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7540     return CM_ScalarEpilogueNotNeededUsePredicate;
7541
7542   return CM_ScalarEpilogueAllowed;
7543 }
7544
7545 // Process the loop in the VPlan-native vectorization path. This path builds
7546 // VPlan upfront in the vectorization pipeline, which allows to apply
7547 // VPlan-to-VPlan transformations from the very beginning without modifying the
7548 // input LLVM IR.
7549 static bool processLoopInVPlanNativePath(
7550     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7551     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7552     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7553     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7554     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7555
7556   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7557   Function *F = L->getHeader()->getParent();
7558   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7559
7560   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7561       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7562
7563   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7564                                 &Hints, IAI);
7565   // Use the planner for outer loop vectorization.
7566   // TODO: CM is not used at this point inside the planner. Turn CM into an
7567   // optional argument if we don't need it in the future.
7568   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7569
7570   // Get user vectorization factor.
7571   const unsigned UserVF = Hints.getWidth();
7572
7573   // Plan how to best vectorize, return the best VF and its cost.
7574   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7575
7576   // If we are stress testing VPlan builds, do not attempt to generate vector
7577   // code. Masked vector code generation support will follow soon.
7578   // Also, do not attempt to vectorize if no vector code will be produced.
7579   if (VPlanBuildStressTest || EnableVPlanPredication ||
7580       VectorizationFactor::Disabled() == VF)
7581     return false;
7582
7583   LVP.setBestPlan(VF.Width, 1);
7584
7585   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7586                          &CM);
7587   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7588                     << L->getHeader()->getParent()->getName() << "\"\n");
7589   LVP.executePlan(LB, DT);
7590
7591   // Mark the loop as already vectorized to avoid vectorizing again.
7592   Hints.setAlreadyVectorized();
7593
7594   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7595   return true;
7596 }
7597
7598 bool LoopVectorizePass::processLoop(Loop *L) {
7599   assert((EnableVPlanNativePath || L->empty()) &&
7600          "VPlan-native path is not enabled. Only process inner loops.");
7601
7602 #ifndef NDEBUG
7603   const std::string DebugLocStr = getDebugLocString(L);
7604 #endif /* NDEBUG */
7605
7606   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7607                     << L->getHeader()->getParent()->getName() << "\" from "
7608                     << DebugLocStr << "\n");
7609
7610   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7611
7612   LLVM_DEBUG(
7613       dbgs() << "LV: Loop hints:"
7614              << " force="
7615              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7616                      ? "disabled"
7617                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7618                             ? "enabled"
7619                             : "?"))
7620              << " width=" << Hints.getWidth()
7621              << " unroll=" << Hints.getInterleave() << "\n");
7622
7623   // Function containing loop
7624   Function *F = L->getHeader()->getParent();
7625
7626   // Looking at the diagnostic output is the only way to determine if a loop
7627   // was vectorized (other than looking at the IR or machine code), so it
7628   // is important to generate an optimization remark for each loop. Most of
7629   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7630   // generated as OptimizationRemark and OptimizationRemarkMissed are
7631   // less verbose reporting vectorized loops and unvectorized loops that may
7632   // benefit from vectorization, respectively.
7633
7634   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7635     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7636     return false;
7637   }
7638
7639   PredicatedScalarEvolution PSE(*SE, *L);
7640
7641   // Check if it is legal to vectorize the loop.
7642   LoopVectorizationRequirements Requirements(*ORE);
7643   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7644                                 &Requirements, &Hints, DB, AC);
7645   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7646     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7647     Hints.emitRemarkWithHints();
7648     return false;
7649   }
7650
7651   // Check the function attributes and profiles to find out if this function
7652   // should be optimized for size.
7653   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7654       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7655
7656   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7657   // here. They may require CFG and instruction level transformations before
7658   // even evaluating whether vectorization is profitable. Since we cannot modify
7659   // the incoming IR, we need to build VPlan upfront in the vectorization
7660   // pipeline.
7661   if (!L->empty())
7662     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7663                                         ORE, BFI, PSI, Hints);
7664
7665   assert(L->empty() && "Inner loop expected.");
7666
7667   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7668   // count by optimizing for size, to minimize overheads.
7669   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7670   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7671     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7672                       << "This loop is worth vectorizing only if no scalar "
7673                       << "iteration overheads are incurred.");
7674     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7675       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7676     else {
7677       LLVM_DEBUG(dbgs() << "\n");
7678       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7679     }
7680   }
7681
7682   // Check the function attributes to see if implicit floats are allowed.
7683   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7684   // an integer loop and the vector instructions selected are purely integer
7685   // vector instructions?
7686   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7687     reportVectorizationFailure(
7688         "Can't vectorize when the NoImplicitFloat attribute is used",
7689         "loop not vectorized due to NoImplicitFloat attribute",
7690         "NoImplicitFloat", ORE, L);
7691     Hints.emitRemarkWithHints();
7692     return false;
7693   }
7694
7695   // Check if the target supports potentially unsafe FP vectorization.
7696   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7697   // for the target we're vectorizing for, to make sure none of the
7698   // additional fp-math flags can help.
7699   if (Hints.isPotentiallyUnsafe() &&
7700       TTI->isFPVectorizationPotentiallyUnsafe()) {
7701     reportVectorizationFailure(
7702         "Potentially unsafe FP op prevents vectorization",
7703         "loop not vectorized due to unsafe FP support.",
7704         "UnsafeFP", ORE, L);
7705     Hints.emitRemarkWithHints();
7706     return false;
7707   }
7708
7709   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7710   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7711
7712   // If an override option has been passed in for interleaved accesses, use it.
7713   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7714     UseInterleaved = EnableInterleavedMemAccesses;
7715
7716   // Analyze interleaved memory accesses.
7717   if (UseInterleaved) {
7718     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7719   }
7720
7721   // Use the cost model.
7722   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7723                                 F, &Hints, IAI);
7724   CM.collectValuesToIgnore();
7725
7726   // Use the planner for vectorization.
7727   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7728
7729   // Get user vectorization factor.
7730   unsigned UserVF = Hints.getWidth();
7731
7732   // Plan how to best vectorize, return the best VF and its cost.
7733   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7734
7735   VectorizationFactor VF = VectorizationFactor::Disabled();
7736   unsigned IC = 1;
7737   unsigned UserIC = Hints.getInterleave();
7738
7739   if (MaybeVF) {
7740     VF = *MaybeVF;
7741     // Select the interleave count.
7742     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7743   }
7744
7745   // Identify the diagnostic messages that should be produced.
7746   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7747   bool VectorizeLoop = true, InterleaveLoop = true;
7748   if (Requirements.doesNotMeet(F, L, Hints)) {
7749     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7750                          "requirements.\n");
7751     Hints.emitRemarkWithHints();
7752     return false;
7753   }
7754
7755   if (VF.Width == 1) {
7756     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7757     VecDiagMsg = std::make_pair(
7758         "VectorizationNotBeneficial",
7759         "the cost-model indicates that vectorization is not beneficial");
7760     VectorizeLoop = false;
7761   }
7762
7763   if (!MaybeVF && UserIC > 1) {
7764     // Tell the user interleaving was avoided up-front, despite being explicitly
7765     // requested.
7766     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7767                          "interleaving should be avoided up front\n");
7768     IntDiagMsg = std::make_pair(
7769         "InterleavingAvoided",
7770         "Ignoring UserIC, because interleaving was avoided up front");
7771     InterleaveLoop = false;
7772   } else if (IC == 1 && UserIC <= 1) {
7773     // Tell the user interleaving is not beneficial.
7774     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7775     IntDiagMsg = std::make_pair(
7776         "InterleavingNotBeneficial",
7777         "the cost-model indicates that interleaving is not beneficial");
7778     InterleaveLoop = false;
7779     if (UserIC == 1) {
7780       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7781       IntDiagMsg.second +=
7782           " and is explicitly disabled or interleave count is set to 1";
7783     }
7784   } else if (IC > 1 && UserIC == 1) {
7785     // Tell the user interleaving is beneficial, but it explicitly disabled.
7786     LLVM_DEBUG(
7787         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7788     IntDiagMsg = std::make_pair(
7789         "InterleavingBeneficialButDisabled",
7790         "the cost-model indicates that interleaving is beneficial "
7791         "but is explicitly disabled or interleave count is set to 1");
7792     InterleaveLoop = false;
7793   }
7794
7795   // Override IC if user provided an interleave count.
7796   IC = UserIC > 0 ? UserIC : IC;
7797
7798   // Emit diagnostic messages, if any.
7799   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7800   if (!VectorizeLoop && !InterleaveLoop) {
7801     // Do not vectorize or interleaving the loop.
7802     ORE->emit([&]() {
7803       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7804                                       L->getStartLoc(), L->getHeader())
7805              << VecDiagMsg.second;
7806     });
7807     ORE->emit([&]() {
7808       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7809                                       L->getStartLoc(), L->getHeader())
7810              << IntDiagMsg.second;
7811     });
7812     return false;
7813   } else if (!VectorizeLoop && InterleaveLoop) {
7814     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7815     ORE->emit([&]() {
7816       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7817                                         L->getStartLoc(), L->getHeader())
7818              << VecDiagMsg.second;
7819     });
7820   } else if (VectorizeLoop && !InterleaveLoop) {
7821     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7822                       << ") in " << DebugLocStr << '\n');
7823     ORE->emit([&]() {
7824       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7825                                         L->getStartLoc(), L->getHeader())
7826              << IntDiagMsg.second;
7827     });
7828   } else if (VectorizeLoop && InterleaveLoop) {
7829     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7830                       << ") in " << DebugLocStr << '\n');
7831     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7832   }
7833
7834   LVP.setBestPlan(VF.Width, IC);
7835
7836   using namespace ore;
7837   bool DisableRuntimeUnroll = false;
7838   MDNode *OrigLoopID = L->getLoopID();
7839
7840   if (!VectorizeLoop) {
7841     assert(IC > 1 && "interleave count should not be 1 or 0");
7842     // If we decided that it is not legal to vectorize the loop, then
7843     // interleave it.
7844     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7845                                &CM);
7846     LVP.executePlan(Unroller, DT);
7847
7848     ORE->emit([&]() {
7849       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7850                                 L->getHeader())
7851              << "interleaved loop (interleaved count: "
7852              << NV("InterleaveCount", IC) << ")";
7853     });
7854   } else {
7855     // If we decided that it is *legal* to vectorize the loop, then do it.
7856     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7857                            &LVL, &CM);
7858     LVP.executePlan(LB, DT);
7859     ++LoopsVectorized;
7860
7861     // Add metadata to disable runtime unrolling a scalar loop when there are
7862     // no runtime checks about strides and memory. A scalar loop that is
7863     // rarely used is not worth unrolling.
7864     if (!LB.areSafetyChecksAdded())
7865       DisableRuntimeUnroll = true;
7866
7867     // Report the vectorization decision.
7868     ORE->emit([&]() {
7869       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7870                                 L->getHeader())
7871              << "vectorized loop (vectorization width: "
7872              << NV("VectorizationFactor", VF.Width)
7873              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7874     });
7875   }
7876
7877   Optional<MDNode *> RemainderLoopID =
7878       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7879                                       LLVMLoopVectorizeFollowupEpilogue});
7880   if (RemainderLoopID.hasValue()) {
7881     L->setLoopID(RemainderLoopID.getValue());
7882   } else {
7883     if (DisableRuntimeUnroll)
7884       AddRuntimeUnrollDisableMetaData(L);
7885
7886     // Mark the loop as already vectorized to avoid vectorizing again.
7887     Hints.setAlreadyVectorized();
7888   }
7889
7890   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7891   return true;
7892 }
7893
7894 bool LoopVectorizePass::runImpl(
7895     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7896     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7897     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7898     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7899     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7900   SE = &SE_;
7901   LI = &LI_;
7902   TTI = &TTI_;
7903   DT = &DT_;
7904   BFI = &BFI_;
7905   TLI = TLI_;
7906   AA = &AA_;
7907   AC = &AC_;
7908   GetLAA = &GetLAA_;
7909   DB = &DB_;
7910   ORE = &ORE_;
7911   PSI = PSI_;
7912
7913   // Don't attempt if
7914   // 1. the target claims to have no vector registers, and
7915   // 2. interleaving won't help ILP.
7916   //
7917   // The second condition is necessary because, even if the target has no
7918   // vector registers, loop vectorization may still enable scalar
7919   // interleaving.
7920   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7921       TTI->getMaxInterleaveFactor(1) < 2)
7922     return false;
7923
7924   bool Changed = false;
7925
7926   // The vectorizer requires loops to be in simplified form.
7927   // Since simplification may add new inner loops, it has to run before the
7928   // legality and profitability checks. This means running the loop vectorizer
7929   // will simplify all loops, regardless of whether anything end up being
7930   // vectorized.
7931   for (auto &L : *LI)
7932     Changed |=
7933         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7934
7935   // Build up a worklist of inner-loops to vectorize. This is necessary as
7936   // the act of vectorizing or partially unrolling a loop creates new loops
7937   // and can invalidate iterators across the loops.
7938   SmallVector<Loop *, 8> Worklist;
7939
7940   for (Loop *L : *LI)
7941     collectSupportedLoops(*L, LI, ORE, Worklist);
7942
7943   LoopsAnalyzed += Worklist.size();
7944
7945   // Now walk the identified inner loops.
7946   while (!Worklist.empty()) {
7947     Loop *L = Worklist.pop_back_val();
7948
7949     // For the inner loops we actually process, form LCSSA to simplify the
7950     // transform.
7951     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7952
7953     Changed |= processLoop(L);
7954   }
7955
7956   // Process each loop nest in the function.
7957   return Changed;
7958 }
7959
7960 PreservedAnalyses LoopVectorizePass::run(Function &F,
7961                                          FunctionAnalysisManager &AM) {
7962     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7963     auto &LI = AM.getResult<LoopAnalysis>(F);
7964     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7965     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7966     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7967     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7968     auto &AA = AM.getResult<AAManager>(F);
7969     auto &AC = AM.getResult<AssumptionAnalysis>(F);
7970     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7971     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7972     MemorySSA *MSSA = EnableMSSALoopDependency
7973                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7974                           : nullptr;
7975
7976     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7977     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7978         [&](Loop &L) -> const LoopAccessInfo & {
7979       LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7980       return LAM.getResult<LoopAccessAnalysis>(L, AR);
7981     };
7982     const ModuleAnalysisManager &MAM =
7983         AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7984     ProfileSummaryInfo *PSI =
7985         MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7986     bool Changed =
7987         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7988     if (!Changed)
7989       return PreservedAnalyses::all();
7990     PreservedAnalyses PA;
7991
7992     // We currently do not preserve loopinfo/dominator analyses with outer loop
7993     // vectorization. Until this is addressed, mark these analyses as preserved
7994     // only for non-VPlan-native path.
7995     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7996     if (!EnableVPlanNativePath) {
7997       PA.preserve<LoopAnalysis>();
7998       PA.preserve<DominatorTreeAnalysis>();
7999     }
8000     PA.preserve<BasicAA>();
8001     PA.preserve<GlobalsAA>();
8002     return PA;
8003 }